# Lesson 5: Assessing Models

Today:
1. Assessing Models
    + Training and Test data
    + Assessing the goodness of a regression model
    + Overfitting, Underfitting

In [None]:
# load libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the dataset from the shared datasets folder into a DataFrame
top50 = pd.read_csv('../../../shared/datasets/top50.csv')

# Preview the first five rows to check that the file loaded as expected
top50.head(5)

In [None]:
# Size of the full dataset as (number of rows, number of columns)
top50.shape

In [None]:
# Split the dataset into two halves by row position:
#   training_data = row 0 to row 24   (used to fit the model)
#   test_data     = row 25 to row 49  (held out to assess the model)
#
# .copy() makes each half an independent DataFrame. Without it, adding columns
# to test_data later (e.g. the predicted values) raises pandas'
# SettingWithCopyWarning, because the slice may still be tied to top50.

training_data = top50.iloc[0:25, :].copy()
test_data = top50.iloc[25:50, :].copy()

In [None]:
# Pairwise correlations between the numeric columns of the training data
# (numeric_only=True leaves the non-numeric columns out of the calculation)
training_data.corr(numeric_only=True)

In [None]:
# Scatter plot of Energy against LoudnessdB for the training rows,
# with seaborn's fitted regression line drawn on top
sns.regplot(
    x='LoudnessdB',
    y='Energy',
    data=training_data,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Response (what we want to predict) and the single predictor.
# The double brackets keep X_train as a DataFrame (2-D), which is what fit() expects.
y_train = training_data['Energy']
X_train = training_data[['LoudnessdB']]

# Fit the least-squares line on the training rows only
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# coef_ is an array with one slope per predictor column; intercept_ is a single number
m, b = lin_model.coef_, lin_model.intercept_

print(m)
print(b)

# predicted energy = m * loudnessdb + b
print(f"Equation of best linear curve is y = {round(m[0], 3)}x + {round(b, 3)}")

In [None]:
# Predict y values for the test dataset using the line fitted on the training
# data, then measure how far the predictions are from the true values.
#
# .assign returns a new DataFrame instead of writing columns into a slice of
# top50, which avoids pandas' SettingWithCopyWarning. Re-running this cell just
# recomputes the same three columns.
#
# m is a one-element array (one slope per predictor), so m[0] is the scalar slope:
#   predicted energy = m * loudnessdb + b
test_data = test_data.assign(
    Energy_predicted=lambda df: df['LoudnessdB'] * m[0] + b,
    # prediction error = actual - predicted
    Error=lambda df: df['Energy'] - df['Energy_predicted'],
    Error_squared=lambda df: df['Error'] ** 2,
)

# Mean squared error on the test rows
print('MSE (testing) = ' + str(np.mean(test_data['Error_squared'])))

In [None]:
# Show the test rows, including the prediction and error columns added above
test_data

In [None]:
# R^2 is an alternative to MSE for judging the fit on the test rows.
# score() takes the test predictors and the true test responses.

y_test = test_data['Energy']
X_test = test_data[['LoudnessdB']]

r_squared_test = lin_model.score(X_test, y_test)
print('R^2 score (testing) = ' + str(r_squared_test))

### Randomly choosing rows for the training and test data

In [None]:
# Randomly assign half of the rows to the training data and the rest to the test data.

# Seeded random generator: the same "random" split is produced on every run,
# so results are reproducible after Restart & Run All.
rng = np.random.default_rng(11)

# set up an array of all of the row indices: 0, 1, ..., (number of rows - 1)
n_rows = len(top50)
row_indices = np.arange(n_rows)

# randomly select half of the row indices for the training data
# (replace = False means no row can be picked twice)
training_row_indices = rng.choice(row_indices, size=n_rows // 2, replace=False)

# the rest of the row indices are for the test data:
test_row_indices = np.setdiff1d(row_indices, training_row_indices)

# pick out the rows of the big dataset based on the chosen row indices;
# .copy() gives independent DataFrames so that adding columns to them later
# does not raise pandas' SettingWithCopyWarning
training_data = top50.iloc[training_row_indices, :].copy()
test_data = top50.iloc[test_row_indices, :].copy()

In [None]:
# (rows, columns) of the randomly chosen training data
training_data.shape

In [None]:
# (rows, columns) of the remaining rows used as test data
test_data.shape

### Bonus: Fitting a polynomial curve using linear regression

In [None]:
# Scatter plot of the training rows with a degree-3 polynomial curve
# (order = 3) fitted by seaborn instead of a straight line
sns.regplot(
    x='LoudnessdB',
    y='Energy',
    data=training_data,
    order=3,
)

In [None]:
# find the equation of the cubic polynomial

# energy = a * loudness^3 + b * loudness^2 + c * loudness + d
# linear regression: find coefficients a, b, c, d that minimize the MSE

from sklearn.linear_model import LinearRegression

# Build the predictor table with loudness, loudness squared and loudness cubed.
# .assign creates a new DataFrame rather than adding columns to a slice of
# training_data, which avoids pandas' SettingWithCopyWarning.
X_train = training_data[['LoudnessdB']].assign(
    LoudnessdB_squared=lambda df: df['LoudnessdB'] ** 2,
    LoudnessdB_cubed=lambda df: df['LoudnessdB'] ** 3,
)
y_train = training_data['Energy']

# The model is still "linear" regression: it is linear in the three columns above
lin_model = LinearRegression().fit(X_train, y_train)

m = lin_model.coef_
y_intercept = lin_model.intercept_

# coef_ follows the column order of X_train (loudness, squared, cubed), so
#   m[0] = c (multiplies x), m[1] = b (multiplies x^2), m[2] = a (multiplies x^3)
print(m)  # the list of c, b, a
print(y_intercept)  # the y intercept, aka d in the above equation in the comment

# predicted energy = a * loudness^3 + b * loudness^2 + c * loudness + d
print(f"Equation of best cubic curve is y = {round(m[2], 3)}x^3 + {round(m[1],3)}x^2 + {round(m[0],3)}x + {round(y_intercept,3)}")

In [None]:
# Preview the predictor table: loudness plus its squared and cubed columns
X_train.head()

In [None]:
# Linear regression with more than one independent variable:
#   energy = a * loudnessdb + b * BeatsPerMinute + c

from sklearn.linear_model import LinearRegression

# Response and the two predictor columns
y_train = training_data['Energy']
X_train = training_data[['LoudnessdB', 'BeatsPerMinute']]

lin_model_multiplevars = LinearRegression()
lin_model_multiplevars.fit(X_train, y_train)

# coef_ holds one coefficient per predictor column, in the column order of X_train
m, z_intercept = lin_model_multiplevars.coef_, lin_model_multiplevars.intercept_

print(m)            # the list of a , b
print(z_intercept)  # the z intercept, aka c in the equation at the top of this cell

print(f"Equation of best linear curve is z = {round(m[0], 3)}x + {round(m[1],3)}y + {round(z_intercept,3)}")

In [None]:
# Look at the first rows of the full dataset again before splitting it with scikit-learn
top50.head()

In [None]:
from sklearn.model_selection import train_test_split

# Response and predictors taken from the full dataset
Y = top50['Energy']
X = top50[['LoudnessdB', 'BeatsPerMinute']]

# train_test_split does the random split for us.
# Be aware of the order of the items returned:
#   X (train), X (test), y (train), y (test)
# test_size = 0.5 puts half of the rows in the test set;
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=11,
)


In [None]:
# First rows of the training predictors returned by train_test_split
X_train.head()

In [None]:
# First rows of the test predictors returned by train_test_split
X_test.head()