# Initial techniques and questions

### Load the `pandas` library

In [5]:
import pandas as pd
# Display the installed pandas version so the run can be reproduced.
pd.__version__

'0.17.1'

### Load the `numpy` library

In [6]:
import numpy as np
# Display the installed numpy version so the run can be reproduced.
np.__version__

'1.10.4'

### Read the dataframe from the `mss_df.pkl` _pickle_ file.

In [11]:
import os

# Directory holding the pickled dataframe. Defaults to the original author's
# location; set the MSS_DATA_DIR environment variable to point elsewhere
# instead of editing the notebook.
save_load_path = os.environ.get('MSS_DATA_DIR', '/Users/David/Desktop')

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
mss_df = pd.read_pickle(os.path.join(save_load_path, 'mss_df.pkl'))

### Display the dimensions (shape) and columns of the dataframe.

In [10]:
# Print the frame's dimensions and column names, then let the per-column
# dtypes be the cell's displayed value.
print('shape: {}'.format(mss_df.shape))
print('columns: {}'.format(mss_df.columns.values))
mss_df.dtypes

shape: (10000, 30)
columns: ['artist_familiarity' 'artist_hotttnesss' 'song_hotttnesss' 'title'
 'artist_location' 'release' 'artist_longitude' 'artist_latitude'
 'duration' 'loudness' 'mode' 'tempo' 'k_0' 'k_1' 'k_2' 'k_3' 'k_4' 'k_5'
 'k_6' 'k_7' 'k_8' 'k_9' 'k_10' 'k_11' 'ts_0' 'ts_1' 'ts_3' 'ts_4' 'ts_5'
 'ts_7']


artist_familiarity    float64
artist_hotttnesss     float64
song_hotttnesss       float64
title                  object
artist_location        object
release                object
artist_longitude      float64
artist_latitude       float64
duration              float64
loudness              float64
mode                  float64
tempo                 float64
k_0                   float64
k_1                   float64
k_2                   float64
k_3                   float64
k_4                   float64
k_5                   float64
k_6                   float64
k_7                   float64
k_8                   float64
k_9                   float64
k_10                  float64
k_11                  float64
ts_0                  float64
ts_1                  float64
ts_3                  float64
ts_4                  float64
ts_5                  float64
ts_7                  float64
dtype: object

### Check for missing values

Columns with a count below 10000 (the number of rows) have missing values; these variables are excluded from the analysis below.

In [36]:
# count() gives the number of non-null entries per column; any column below
# the 10000-row total has missing values.
mss_df.count()

artist_familiarity     9996
artist_hotttnesss     10000
song_hotttnesss        5648
title                 10000
artist_location       10000
release               10000
artist_longitude       3742
artist_latitude        3742
duration              10000
loudness              10000
mode                  10000
tempo                 10000
k_0                   10000
k_1                   10000
k_2                   10000
k_3                   10000
k_4                   10000
k_5                   10000
k_6                   10000
k_7                   10000
k_8                   10000
k_9                   10000
k_10                  10000
k_11                  10000
ts_0                  10000
ts_1                  10000
ts_3                  10000
ts_4                  10000
ts_5                  10000
ts_7                  10000
dtype: int64

### Import `datasets` and `linear_model` from `sklearn`

In [7]:
from sklearn import datasets, linear_model

### Create a regression model 

In [13]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
reg_model = linear_model.LinearRegression()

### Helper function `str_list`, used to build `predictor_vars` below

In [78]:
def str_list(base, numrng):
    """Build a list of names by appending each item of `numrng` to `base`.

    Each item is converted with str() before being appended, so
    str_list("k_", range(3)) gives ['k_0', 'k_1', 'k_2'].
    """
    names = []
    for suffix in numrng:
        names.append(base + str(suffix))
    return names

str_list("ts_",[0,1,3,4,5,7])

['ts_0', 'ts_1', 'ts_3', 'ts_4', 'ts_5', 'ts_7']

In [93]:
# Response column, kept as a one-element list so it can be concatenated with
# the predictor names for column selection.
target_var = ['artist_hotttnesss']

# Predictor columns: k_0..k_11, the six ts_* columns, and four numeric
# track attributes.
predictor_vars = (
    str_list("k_", range(12))
    + str_list("ts_", [0, 1, 3, 4, 5, 7])
    + ['duration', 'loudness', 'mode', 'tempo']
)

# Keep only the columns used by the regression, target first.
reg_df = mss_df[target_var + predictor_vars]
print('target: {}'.format(target_var))
print('predictors: {}'.format(predictor_vars))
reg_df.head()

target: ['artist_hotttnesss']
predictors: ['k_0', 'k_1', 'k_2', 'k_3', 'k_4', 'k_5', 'k_6', 'k_7', 'k_8', 'k_9', 'k_10', 'k_11', 'ts_0', 'ts_1', 'ts_3', 'ts_4', 'ts_5', 'ts_7', 'duration', 'loudness', 'mode', 'tempo']


Unnamed: 0,artist_hotttnesss,k_0,k_1,k_2,k_3,k_4,k_5,k_6,k_7,k_8,...,ts_0,ts_1,ts_3,ts_4,ts_5,ts_7,duration,loudness,mode,tempo
0,0.401998,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,218.93179,-11.197,0,92.198
1,0.4175,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,148.03546,-9.843,0,121.274
2,0.343428,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,177.47546,-9.689,1,100.07
3,0.454231,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,233.40363,-9.013,1,119.293
4,0.401724,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,209.60608,-4.501,1,129.738


### Split dataframe into train-test

In [84]:
# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on,
# and sklearn.cross_validation was removed in 0.20; try the new location first
# and fall back so the cell runs on either.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the fitted coefficients and scores, reproducible across runs.
train, test = train_test_split(reg_df, test_size=0.2, random_state=0)
train.shape, test.shape

((8000, 23), (2000, 23))

In [89]:
# Fit on the training rows only. The target is passed as a one-column
# DataFrame (target_var is a list), so coef_ comes back 2-D.
reg_model.fit(train[predictor_vars],
              train[target_var])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [95]:
# Fitted coefficients, one per predictor, in the order of predictor_vars.
print('Coefficients: \n', reg_model.coef_)

# Mean squared error on the held-out test rows. This is the *mean* of the
# squared residuals, not the residual sum of squares, so it is labelled as
# such. The target's raw array is subtracted so np.mean returns a plain float
# for '%.2f' instead of a one-element pandas object.
print("Mean squared error: %.2f"
      % np.mean((reg_model.predict(test[predictor_vars]) - test[target_var].values) ** 2))

# score() returns the coefficient of determination (R^2) on the test rows:
# 1 is perfect prediction.
print('Variance score: %.2f'
      % reg_model.score(test[predictor_vars], test[target_var]))

Coefficients: 
 [[ -1.19154673e-02   8.53385176e-03   1.67369480e-04  -9.77639251e-03
   -3.64102912e-04   3.87078690e-03   9.80650561e-03  -1.32229173e-03
   -8.75974138e-04  -3.87706952e-03   3.39726157e-04   5.41305823e-03
   -1.18457309e-01   1.83610304e-02   1.69044041e-02   2.97719566e-02
    2.58224016e-02   2.75975166e-02   1.36830180e-05   4.52707549e-03
   -2.45729832e-03   7.14670853e-05]]
Residual sum of squares: 0.02
Variance score: 0.04
