In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.pipeline import make_pipeline

DATASET_PATH = "datasets/"

def load_song_data(dataset_path=DATASET_PATH):
    df = pd.read_csv(dataset_path + 'year-prediction-msd-train.txt', sep=",", header=None)
    return df

In [2]:
#songs é um DataFrame
songs = load_song_data()

In [3]:
songs_x = songs.iloc[:,1:]
songs_y = songs.iloc[:,0]
#songs_x = songs_x.values.reshape(-1,1)
songs_y = songs_y.values.reshape(-1,1)

In [4]:
#Normalizing
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])
songs_x = num_pipeline.fit_transform(songs_x)

In [5]:
#songs_[x/y]_train must exist because iter_minibatches uses it
songs_x_train=songs_x
songs_y_train=songs_y

In [6]:
def iter_minibatches(chunksize):
    chunkstartmarker = 0
    while chunkstartmarker < len(songs):
        x_chunk = songs_x_train[chunkstartmarker:chunkstartmarker+chunksize]
        y_chunk = songs_y_train[chunkstartmarker:chunkstartmarker+chunksize]
        yield x_chunk, y_chunk
        chunkstartmarker += chunksize

In [7]:
for count, degree in enumerate([2]):
    batcherator = iter_minibatches(chunksize=1000)
    regr = make_pipeline(PolynomialFeatures(degree), SGDRegressor(learning_rate='invscaling', eta0=0.001))
    
    for x_chunk, y_chunk in batcherator:
        if(len(x_chunk)!=0):
            regr.named_steps['sgdregressor'].partial_fit(x_chunk, y_chunk)
    

  y = column_or_1d(y, warn=True)


In [8]:
#Cross validation - kfold strategy
kf = KFold(n_splits=10)
rmse_scores = list()
r2_scores = list()

for train_index, test_index in kf.split(songs_x):
    songs_x_train, songs_x_test = songs_x[train_index], songs_x[test_index]
    songs_y_train, songs_y_test = songs_y[train_index], songs_y[test_index]
    songs_y_pred = regr.named_steps['sgdregressor'].predict(songs_x_test)
    songs_y_pred = songs_y_pred.round()
    rmse_scores.append(np.sqrt(mean_squared_error(songs_y_test, songs_y_pred)))
    r2_scores.append(r2_score(songs_y_test, songs_y_pred))

In [9]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", sum(scores)/float(len(scores)))

In [10]:
display_scores(rmse_scores)
display_scores(r2_scores)

Scores: [9.5349511646217984, 9.5822898870257482, 9.5106810253949501, 9.5797960323300817, 9.615857740717118, 9.6940906795879354, 9.6442802328087325, 9.5612680833400692, 9.6612526565684096, 9.6061066708798943]
Mean: 9.59905741733
Scores: [0.237442090197167, 0.22619372220447709, 0.23295489784442769, 0.22811635250791973, 0.2301010820865379, 0.23016350918021977, 0.23222488217200266, 0.2245539209122106, 0.22743150269987511, 0.23115921195116429]
Mean: 0.230034117176


In [11]:
#The coefficients only for the last cross-validation test

# The coefficients
print('Estimated intercept: ', regr.named_steps['sgdregressor'].intercept_)

# The coefficients
print('Coefficients: ', regr.named_steps['sgdregressor'].coef_)

Estimated intercept:  [ 1998.42084545]
Coefficients:  [  5.17929496e+00  -2.51194633e+00  -1.05400757e+00  -1.15141091e+00
  -1.39175659e-01  -2.16153873e+00   5.12063736e-01  -9.01418507e-01
  -7.90376022e-01   4.16191467e-01   2.00962990e-01  -2.84521661e-01
   9.04021276e-01   6.24972071e-01  -6.04387351e-01   1.86645380e+00
   3.12072395e-01   1.32991332e+00   5.69128416e-01   8.66369250e-01
   5.51418699e-02  -1.40889421e-01  -7.26765786e-03   5.07928036e-01
  -3.88438000e-01  -1.83359633e-02   1.01821482e+00   1.00039102e-01
   2.13730383e-01  -1.02902981e-01  -1.49559470e-01  -4.04740785e-02
  -3.41696255e-01   2.28724584e-01  -6.19257612e-02  -6.18825771e-01
  -6.92948641e-02   2.29983723e-01   4.37431834e-01  -3.53490426e-01
  -1.37616538e-01  -1.18741217e-01  -1.18679960e-01  -8.77841081e-02
  -1.21329848e-01   1.38508132e-01   2.52358794e-01  -7.30602320e-01
   1.06114770e-01   1.73624265e-01   6.75878755e-03  -1.15585376e-01
   1.35198004e-01  -1.45410298e-02   1.50814293e-