# Polinomial Regression

Cross Validation

Better Features Selection

Stratified Cross validation 

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, Ridge, SGDRegressor
import numpy as np
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectPercentile, f_regression, SelectKBest, mutual_info_regression, RFE, SelectFromModel

DATASET_PATH = "dataset/"

def load_song_data(dataset_path=DATASET_PATH):
    df = pd.read_csv(dataset_path + 'year-prediction-msd-train.txt', sep=",", header=None)
    return df

In [2]:
def iter_minibatches(chunksize, x_train, y_train):
    chunkstartmarker = 0
    while chunkstartmarker < len(x_train):
        x_chunk = x_train[chunkstartmarker:chunkstartmarker+chunksize]
        y_chunk = y_train[chunkstartmarker:chunkstartmarker+chunksize]
        yield x_chunk, y_chunk
        chunkstartmarker += chunksize

In [3]:
def training (chunksize, x_train, y_train):
    for count, degree in enumerate([2]):
        batcherator = iter_minibatches(chunksize=chunksize, x_train=x_train, y_train=y_train)
        regr = make_pipeline(PolynomialFeatures(degree), SGDRegressor(learning_rate='invscaling', eta0=0.001))
    
        for x_chunk, y_chunk in batcherator:
            if(len(x_chunk)!=0):
                regr.named_steps['sgdregressor'].partial_fit(x_chunk, y_chunk)
        return regr

In [4]:
#songs é um DataFrame
songs = load_song_data()

In [5]:
songs_x = songs.iloc[:,1:]
songs_y = songs.iloc[:,0]

In [6]:
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('skb', SelectKBest(f_regression, k=80)),#Fazer um gráfico variando mean-squared-error | k. Ate agora 80 é o melhor
    #('rfe',RFE(Ridge(), n_features_to_select=30)),
    #('lasso', RandomizedLasso(alpha=1)),
    ('sfm', SelectFromModel(LassoCV(), threshold='1.25*mean'))
])
songs_x = num_pipeline.fit_transform(songs_x, songs_y.ravel())

In [7]:
#Cross validation - kfold strategy
kf = KFold(n_splits=10, shuffle=True, random_state=1) #define n groups
rmse_scores = list()
r2_scores = list()

#for each par (train,test) in slip -> train, test and compute error
#inwhich train has k-1 groups test 1 group
for train_index, test_index in kf.split(songs_x):
    songs_x_train, songs_x_test = songs_x[train_index], songs_x[test_index]
    songs_y_train, songs_y_test = songs_y[train_index], songs_y[test_index]
    
    #train
    #regr=training(1000, songs_x_train,songs_y_train)
    #test
    #songs_y_pred = regr.named_steps['sgdregressor'].predict(songs_x_test)    
    
    regr = make_pipeline(PolynomialFeatures(3), Ridge())
    regr.named_steps['ridge'].fit(songs_x_train, songs_y_train)
    songs_y_pred = regr.named_steps['ridge'].predict(songs_x_test)
    
    songs_y_pred = songs_y_pred.round()
    #compute scores
    rmse_scores.append(np.sqrt(mean_squared_error(songs_y_test, songs_y_pred)))
    r2_scores.append(r2_score(songs_y_test, songs_y_pred))

In [8]:
def display_scores(scores):
    print("Mean:", sum(scores)/float(len(scores)))

In [9]:
display_scores(rmse_scores)
display_scores(r2_scores)

Mean: 9.74169281988
Mean: 0.207007097627


In [10]:
print('Features: ', songs_x_train.shape[1])

#The coefficients only for the last cross-validation test

# The coefficients
print('Estimated intercept: ', regr.named_steps['ridge'].intercept_)

# The coefficients
print('Coefficients: ', regr.named_steps['ridge'].coef_)

Features:  17
Estimated intercept:  1998.37902106
Coefficients:  [ 5.0458941  -2.57738719 -1.3515702  -3.03908911 -0.36878972 -0.59689765
 -0.64139493  1.0106891   0.4297222   0.99732183  0.54931527  1.05710058
  1.45266035  0.99306548 -1.05801698 -0.39695331 -0.68898059]


## Other way to split train e test sets

In [11]:
#Generates 5 categories of year (size:+- 18 years)
n_cat=5
size=(np.max(songs_y)-np.min(songs_y))/n_cat
songs["year_cat"] = np.ceil((songs_y-np.min(songs_y)) / size)

In [12]:
#Distribution of data in categories
songs["year_cat"].value_counts()/len(songs)

5.0    0.770044
4.0    0.172138
3.0    0.052429
2.0    0.004434
1.0    0.000942
0.0    0.000013
Name: year_cat, dtype: float64

In [13]:



#Representative test set (equal percentage of data in each category)
split = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=1)
rmse_scores = list()
r2_scores = list()

for train_index, test_index in split.split(songs_x, songs["year_cat"]):
    strat_songs_x_train, strat_songs_x_test = songs_x[train_index], songs_x[test_index]
    strat_songs_y_train, strat_songs_y_test = songs_y[train_index], songs_y[test_index]
    
    #Train and execute the model
    regr = make_pipeline(PolynomialFeatures(3), Ridge())
    regr.named_steps['ridge'].fit(strat_songs_x_train, strat_songs_y_train)
    songs_y_pred = regr.named_steps['ridge'].predict(strat_songs_x_test)
    songs_y_pred = songs_y_pred.round()

    #compute scores
    rmse_scores.append(np.sqrt(mean_squared_error(strat_songs_y_test, songs_y_pred)))
    r2_scores.append(r2_score(strat_songs_y_test, songs_y_pred))

In [14]:
#Checking if the test set has same distribution in categories compared to the entire dataset (songs)
strat_songs_x_test=np.insert(strat_songs_x_test, 0, np.ceil((strat_songs_y_test-np.min(songs_y)) / size), axis=1)
cat, count = np.unique(strat_songs_x_test[:,0], return_counts=True)
count/len(strat_songs_x_test)

array([  2.15647373e-05,   9.48848443e-04,   4.44233589e-03,
         5.24238765e-02,   1.72129733e-01,   7.70033641e-01])

In [15]:
display_scores(rmse_scores)
display_scores(r2_scores)

Mean: 9.74455536877
Mean: 0.20736910174
