In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.pipeline import make_pipeline

DATASET_PATH = "datasets/"

def load_song_data(dataset_path=DATASET_PATH):
    """Read the training CSV from ``dataset_path`` into a DataFrame.

    Parameters
    ----------
    dataset_path : str or path-like
        Directory containing ``year-prediction-msd-train.txt``. A trailing
        path separator is optional.

    Returns
    -------
    pandas.DataFrame
        The file parsed as comma-separated values with no header row, so the
        columns are labelled 0..n-1.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # os.path.join inserts the separator only when it is missing, so both
    # "datasets" and "datasets/" resolve to the same file.
    csv_path = os.path.join(dataset_path, 'year-prediction-msd-train.txt')
    return pd.read_csv(csv_path, sep=",", header=None)

In [2]:
# songs is the pandas DataFrame returned by load_song_data() for its default dataset path.
songs = load_song_data()

In [3]:
# First column -> songs_y (a Series); all remaining columns -> songs_x.
songs_y, songs_x = songs.iloc[:, 0], songs.iloc[:, 1:]
songs_y.shape

(463715,)

In [4]:
# Standardize the feature columns with a one-step pipeline wrapping StandardScaler.
# NOTE(review): the scaler is fitted on every row before the cross-validation
# split, so each fold's held-out rows contribute to the scaling statistics.
# Confirm whether fitting the scaler per training fold is wanted instead.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
songs_x = num_pipeline.fit_transform(songs_x)

In [5]:
def iter_minibatches(chunksize, x_train, y_train):
    """Yield consecutive ``(x_chunk, y_chunk)`` slices of at most ``chunksize`` rows.

    Parameters
    ----------
    chunksize : int
        Maximum number of rows per mini-batch; must be a positive integer.
    x_train, y_train : sliceable sequences
        Both are sliced with the same ``[start:start + chunksize]`` window.
        The number of batches is driven by ``len(x_train)``.

    Yields
    ------
    tuple
        ``(x_chunk, y_chunk)``. The last pair may hold fewer than
        ``chunksize`` rows; nothing is yielded when ``x_train`` is empty.

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1. Because this is a generator, the
        error surfaces when iteration starts, not when the function is called.
    """
    if chunksize < 1:
        # A zero or negative step never moves the window past len(x_train),
        # which previously made the loop run forever.
        raise ValueError("chunksize must be a positive integer, got %r" % (chunksize,))

    for start in range(0, len(x_train), chunksize):
        stop = start + chunksize
        yield x_train[start:stop], y_train[start:stop]

In [6]:
def training(chunksize, x_train, y_train):
    """Incrementally fit an SGD regressor on mini-batches and return it inside a pipeline.

    The returned object is ``make_pipeline(PolynomialFeatures(2), SGDRegressor(...))``,
    but only its ``'sgdregressor'`` step is trained: each mini-batch produced by
    ``iter_minibatches`` is passed directly to that step's ``partial_fit``.

    NOTE(review): the ``PolynomialFeatures`` step is never fitted or applied
    here, so the regressor learns from the features exactly as they are passed
    in (no degree-2 expansion). Predicting through the pipeline itself would go
    through that unfitted step; confirm whether the expansion was intended.

    Parameters
    ----------
    chunksize : int
        Number of rows per mini-batch.
    x_train, y_train
        Training features and targets, sliced in parallel into mini-batches.

    Returns
    -------
    The pipeline whose ``'sgdregressor'`` step has been partially fitted.
    """
    polynomial_degree = 2
    regr = make_pipeline(
        PolynomialFeatures(polynomial_degree),
        SGDRegressor(learning_rate='invscaling', eta0=0.001),
    )
    sgd_step = regr.named_steps['sgdregressor']

    for x_chunk, y_chunk in iter_minibatches(chunksize=chunksize, x_train=x_train, y_train=y_train):
        if len(x_chunk) == 0:
            # Nothing to learn from an empty batch.
            continue
        sgd_step.partial_fit(x_chunk, y_chunk)

    return regr
        
    

In [7]:
# Cross-validation: shuffled 10-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
rmse_scores = list()
r2_scores = list()

# For each (train, test) pair of the split: train on k-1 groups, then score on
# the single held-out group.
for train_index, test_index in kf.split(songs_x):
    songs_x_train, songs_x_test = songs_x[train_index], songs_x[test_index]
    # kf.split yields positional indices. Plain [] on a Series is label-based
    # and is only correct while the Series keeps its default 0..n-1 index, so
    # select the target rows by position with .iloc.
    songs_y_train, songs_y_test = songs_y.iloc[train_index], songs_y.iloc[test_index]
    # train
    regr = training(1000, songs_x_train, songs_y_train)
    # test: predict with the SGD step directly, since that is the only step training() fits
    songs_y_pred = regr.named_steps['sgdregressor'].predict(songs_x_test)
    songs_y_pred = songs_y_pred.round()
    # compute scores for this fold
    rmse_scores.append(np.sqrt(mean_squared_error(songs_y_test, songs_y_pred)))
    r2_scores.append(r2_score(songs_y_test, songs_y_pred))



In [8]:
def display_scores(scores):
    """Print the arithmetic mean of ``scores`` prefixed with ``Mean:``.

    ``scores`` must be a non-empty sized collection of numbers; an empty one
    makes the division fail.
    """
    total = sum(scores)
    count = float(len(scores))
    print("Mean:", total / count)

In [9]:
# Mean RMSE over the cross-validation folds
display_scores(rmse_scores)
# Mean R^2 over the cross-validation folds
display_scores(r2_scores)

Mean: 9.60657606527
Mean: 0.228843703702


In [10]:
# regr holds the model from the last cross-validation fold, so these values describe that fold only.

# Intercept learned by the SGD regressor step
print('Estimated intercept: ', regr.named_steps['sgdregressor'].intercept_)

# Coefficients learned by the SGD regressor step
print('Coefficients: ', regr.named_steps['sgdregressor'].coef_)

Estimated intercept:  [ 1998.39216851]
Coefficients:  [ 5.13044796 -2.43290429 -0.69773119 -1.66838709 -0.15612549 -2.15285702
  0.62366109 -0.97849586 -0.80197153  0.4926904   0.34584935 -0.33268395
  0.83870647  0.62828435 -0.55494851  2.01173906  0.32785762  1.87469933
  0.58619204  0.92854858  0.12184831 -0.3932338  -0.27453474  0.53204128
 -0.37555087 -0.04647131  0.99136711  0.10349206  0.23484538 -0.13918802
 -0.18837929 -0.01614501 -0.42903141  0.20993216 -0.10555413 -0.66208901
 -0.04874211  0.1977199   0.50500887 -0.3377201  -0.13854317 -0.14195695
 -0.09802609 -0.06500913 -0.10667797  0.10127259  0.25103798 -0.81780886
  0.13155618  0.15450145 -0.02810552 -0.11633315  0.14879585 -0.04700927
  0.17848514 -0.15367397 -0.79370924  0.34433202 -0.29732881  0.02392038
 -0.16655401 -0.02340273  0.17388462  0.36202334 -0.52921118  0.09899448
  0.03947245 -0.05372267 -0.58396044  0.03061935 -0.13246243 -0.03220758
  0.35245236  0.32672819  0.05983978  0.35914809  0.05627796 -0.632474