In [1]:
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split



In [2]:
data = pd.read_csv("../DataSets/balanced_pop_unpop_features.csv", sep=",")
data['title'] = data['Unnamed: 0']
data = data.drop('Unnamed: 0', axis=1)
data

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,label,title
0,0.21400,0.666,178242,0.677,0.000000,2,0.0979,-5.743,0.0326,100.014,4,0.178,1,#Beautiful
1,0.01340,0.807,183750,0.916,0.000012,0,0.0787,-3.282,0.2260,127.973,4,0.651,1,#SELFIE
2,0.00162,0.791,279507,0.615,0.000065,6,0.0812,-6.149,0.0667,128.017,4,0.393,1,#thatPOWER
3,0.76300,0.707,275227,0.709,0.000000,11,0.2740,-3.979,0.3400,89.094,4,0.501,1,0 To 100 / The Catch Up
4,0.57000,0.629,250173,0.572,0.000000,5,0.1920,-7.733,0.0387,100.015,4,0.386,1,1-800-273-8255
5,0.22600,0.718,222200,0.801,0.000000,0,0.3900,-2.581,0.0386,127.016,4,0.538,1,2012 (It Ain't The End)
6,0.41500,0.635,216983,0.646,0.000550,10,0.4200,-5.246,0.0462,117.956,3,0.344,1,22
7,0.03400,0.818,225983,0.803,0.000000,1,0.1530,-4.282,0.0797,106.970,4,0.632,1,24K Magic
8,0.16800,0.400,281533,0.882,0.000000,7,0.1240,-1.958,0.3340,167.114,4,0.385,1,5 O'Clock
9,0.02460,0.554,182893,0.849,0.000233,1,0.3100,-5.181,0.0357,116.992,4,0.683,1,5-1-5-0


In [3]:
no_title = data.drop(['title', 'label'], axis=1)
z_scores=(no_title - no_title.mean())/no_title.std()
z_scores.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0.234429,0.15328,-1.087287,-0.034082,-0.111864,-0.924928,-0.569818,0.078899,-0.66559,-0.776052,0.110072,-1.499069
1,-0.744047,1.169892,-0.953584,1.390222,-0.111643,-1.467484,-0.704061,1.114347,1.3397,0.212902,0.110072,0.635067
2,-0.801507,1.054532,1.370844,-0.403567,-0.110689,0.160186,-0.686582,-0.091923,-0.31202,0.214459,0.110072,-0.529007
3,2.912313,0.448891,1.26695,0.15662,-0.111864,1.516578,0.661443,0.821089,2.521722,-1.162309,0.110072,-0.04172
4,1.970908,-0.113491,0.658783,-0.659823,-0.111864,-0.111093,0.088113,-0.75838,-0.602341,-0.776016,0.110072,-0.56059


In [4]:
x_train, x_test, y_train, y_test= train_test_split(z_scores, data['label'])

Here we create a regression using every feature available in order to predict if a song will be popular or unpopular

In [5]:
clf1 = LogisticRegression()
clf1.fit(x_train, y_train)
predicted_train = clf1.predict(x_train)
predicted_test = clf1.predict(x_test)
print (clf1.coef_, clf1.intercept_)

[[ 0.07689814  0.16537151  0.05423659 -0.03044576 -0.1153939  -0.02460602
  -0.01865378 -0.02925017  0.04716056  0.09770978  0.05327425 -0.0356278 ]] [-0.05254298]


An initial test of the R^2 values to determine predictive power. We see that the regression was able to explain 54% of the variability around the mean for the training data and 44.7% of the variability for the test data.

In [6]:
print(clf1.score(x_train, y_train))
print(clf1.score(x_test, y_test))

0.5422885572139303
0.44776119402985076


Now, we re-run the regression using the three variables with the highest coefficients in the original model, danceability, instrumentalness, and tempo.

In [8]:
test_2 = z_scores[['danceability', 'instrumentalness', 'tempo']]
x_train2, x_test2, y_train2, y_test2= train_test_split(test_2, data['label'])

In [9]:
clf2 = LogisticRegression()
clf2.fit(x_train2, y_train2)
predicted_train2 = clf2.predict(x_train2)
predicted_test2 = clf2.predict(x_test2)
print (clf2.coef_, clf2.intercept_)

[[ 0.11261441 -0.06368092  0.11978852]] [0.00318371]


Here we see that limiting the number of variables we consider slightly increases our explaination power for the test data to 48.8%

In [10]:
print(clf2.score(x_train2, y_train2))
print(clf2.score(x_test2, y_test2))

0.5410447761194029
0.48880597014925375
