In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

%matplotlib inline

DATASET_PATH = "datasets/"

def load_song_data(dataset_path=DATASET_PATH, filename='year-prediction-msd-train.txt'):
    """Read the header-less, comma-separated training file into a DataFrame.

    Parameters
    ----------
    dataset_path : str or os.PathLike
        Directory containing the data file. A trailing path separator is
        optional; the path is assembled with ``os.path.join``.
    filename : str
        Name of the CSV file inside ``dataset_path``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with integer column labels
        (the file is read with ``header=None``).
    """
    # os.path.join avoids the silent "datasetsyear-..." path that plain string
    # concatenation produces when the directory lacks a trailing separator.
    csv_path = os.path.join(dataset_path, filename)
    return pd.read_csv(csv_path, sep=",", header=None)

In [None]:
# `songs` is a pandas DataFrame holding the raw training file.
songs = load_song_data(dataset_path=DATASET_PATH)

In [None]:
# Column 0 is the regression target (presumably the release year, going by the
# data file name); every remaining column is a feature.
# The target is kept as an (n_samples, 1) column vector.
songs_y = songs.iloc[:, 0].values.reshape(-1, 1)
songs_x = songs.iloc[:, 1:]

In [None]:
# Standardize the feature columns with a single-step pipeline.
# `songs_x` is rebound to the transformed output, which the later cells index
# positionally (e.g. `songs_x[train_index]`).
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
songs_x = num_pipeline.fit_transform(songs_x)

In [None]:
# iter_minibatches reads the module-level songs_x_train / songs_y_train,
# so bind them to the full dataset before it is used.
songs_x_train, songs_y_train = songs_x, songs_y

In [None]:
def iter_minibatches(chunksize, x=None, y=None):
    """Yield consecutive ``(x_chunk, y_chunk)`` slices of at most ``chunksize`` rows.

    Parameters
    ----------
    chunksize : int
        Maximum number of rows per chunk; must be positive.
    x, y : sequence or array, optional
        Features and targets to slice. When omitted, the module-level
        ``songs_x_train`` / ``songs_y_train`` are used, matching the
        original call style ``iter_minibatches(chunksize)``.

    Yields
    ------
    tuple
        ``(x[start:start + chunksize], y[start:start + chunksize])`` for
        ``start = 0, chunksize, 2 * chunksize, ...``. The last chunk may be
        shorter; no empty chunk is ever produced.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive or ``x`` and ``y`` differ in length.
        Because this is a generator, the error surfaces on first iteration.
    """
    if chunksize <= 0:
        # A non-positive step would never advance past the start of the data.
        raise ValueError("chunksize must be a positive integer")
    if x is None:
        x = songs_x_train
    if y is None:
        y = songs_y_train
    if len(x) != len(y):
        raise ValueError("x and y must contain the same number of rows")

    # Bound the loop by the data actually being sliced (not by another
    # dataset's length), so rebinding the train arrays cannot produce
    # empty trailing chunks.
    for start in range(0, len(x), chunksize):
        stop = start + chunksize
        yield x[start:stop], y[start:stop]

In [None]:
# Incrementally fit an SGD linear regressor on 1000-row minibatches.
for degree in [2]:
    # NOTE(review): the PolynomialFeatures step is never fitted or applied,
    # because partial_fit is called on the SGD step directly. The regressor is
    # therefore trained on the plain (un-expanded) features. Routing the data
    # through the polynomial step would also require changing every cell that
    # calls the SGD step's predict directly -- confirm the intent first.
    regr = make_pipeline(
        PolynomialFeatures(degree),
        # Fixed seed so the shuffling done by SGD is reproducible across runs.
        SGDRegressor(learning_rate='invscaling', eta0=0.001, random_state=1),
    )
    sgd_step = regr.named_steps['sgdregressor']

    for x_chunk, y_chunk in iter_minibatches(chunksize=1000):
        if len(x_chunk) == 0:
            # Defensive: partial_fit rejects zero-sample input.
            continue
        # partial_fit expects a 1-D target; flattening the column vector
        # avoids a DataConversionWarning on every batch.
        sgd_step.partial_fit(x_chunk, np.ravel(y_chunk))

In [None]:
# Cross-validation - k-fold strategy.
#
# Every fold trains a fresh, unfitted copy of the SGD regressor (same
# hyper-parameters as `regr`'s 'sgdregressor' step) on that fold's training
# rows only, then scores it on the held-out rows. Scoring an already-fitted
# `regr` here would evaluate on rows the model was trained on and report
# optimistic numbers.
#
# The fold's training split is kept in fold-local names so the module-level
# songs_x_train / songs_y_train read by iter_minibatches are not clobbered.
#
# NOTE(review): the fold models are trained on the plain features, mirroring
# how the training cell feeds the SGD step directly (the pipeline's
# PolynomialFeatures step is not applied).
kf = KFold(n_splits=10, shuffle=True, random_state=1)
rmse_scores = list()
r2_scores = list()
fold_chunksize = 1000  # same minibatch size as the training cell

for train_index, test_index in kf.split(songs_x):
    fold_x_train, songs_x_test = songs_x[train_index], songs_x[test_index]
    fold_y_train, songs_y_test = songs_y[train_index], songs_y[test_index]

    # clone() returns an unfitted estimator with identical parameters.
    fold_model = clone(regr.named_steps['sgdregressor'])
    for start in range(0, len(fold_x_train), fold_chunksize):
        stop = start + fold_chunksize
        # partial_fit expects a 1-D target.
        fold_model.partial_fit(fold_x_train[start:stop], np.ravel(fold_y_train[start:stop]))

    # Predictions are rounded to whole numbers before scoring.
    songs_y_pred = fold_model.predict(songs_x_test)
    songs_y_pred = songs_y_pred.round()
    rmse_scores.append(np.sqrt(mean_squared_error(songs_y_test, songs_y_pred)))
    r2_scores.append(r2_score(songs_y_test, songs_y_pred))

In [None]:
def display_scores(scores):
    """Print the given scores, then their arithmetic mean.

    Parameters
    ----------
    scores : list of numbers
        Values to report; must be non-empty (the mean divides by its length).
    """
    print("Scores:", scores)
    total = sum(scores)
    mean_score = total / float(len(scores))
    print("Mean:", mean_score)

In [None]:
# Report the per-fold scores and their mean: RMSE first, then R^2.
display_scores(rmse_scores)
display_scores(r2_scores)

In [None]:
# Inspect the linear model stored in the 'sgdregressor' step of `regr`.
# NOTE(review): the cross-validation loop never calls fit/partial_fit on
# `regr`, so these are the parameters produced by the minibatch training
# cell, not those of any particular fold.
fitted_sgd = regr.named_steps['sgdregressor']

# Intercept (bias) term
print('Estimated intercept: ', fitted_sgd.intercept_)

# Per-feature weights
print('Coefficients: ', fitted_sgd.coef_)