Importing libraries and data

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#for regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

#for classification
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

#for creating train-test dataset
from sklearn.model_selection import train_test_split


# Load the lecture dataset, using the 'Country' column as the row index
data = pd.read_csv(
    'Lecture7.csv',
    index_col='Country',
)

Display all columns and rows

In [None]:
# Lift pandas' row/column display limits and widen the output,
# so that frames are shown in full instead of being truncated.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

Validating regression models

In [None]:
#Revisit lecture 5 example, without train-test split
#NOTE: a notebook cell only echoes its LAST expression, so every intermediate
#result below is printed explicitly instead of being computed and thrown away.

#USING STATSMODELS
model = ols('Happiness ~ Life_exp',data).fit()
print(model.summary())

#Generate happiness predictions
data['Predicted_H'] = model.predict(data)

#Evaluate accuracy with visualizations
#Points lying on the red 45-degree line are perfect predictions
sns.scatterplot(data, y='Predicted_H', x='Happiness')
#Series.min()/max() skip missing values, unlike the Python built-ins
lowest, highest = data['Happiness'].min(), data['Happiness'].max()
plt.plot([lowest, highest], [lowest, highest], color='red')
plt.title('Predicted vs actual Happiness (no train-test split)')
plt.show()

#Evaluate accuracy with MAPE
print('statsmodels MAPE:', mean_absolute_percentage_error(data['Happiness'], data['Predicted_H']))



#USING SCIKIT-LEARN
y=data[['Happiness']]
x=data[['Life_exp']]
model = LinearRegression()
model.fit(x, y)

#Getting intercept and slope values
print('scikit-learn intercept:', model.intercept_)
print('scikit-learn slope:', model.coef_)

#Generate happiness predictions
#y is a one-column DataFrame, so predict() returns an (n, 1) array; flatten it to a plain column
data['Predicted_H'] = model.predict(x).ravel()

#R2 and MAPE
print('scikit-learn R2:', r2_score(data['Happiness'], data['Predicted_H']))
print('scikit-learn MAPE:', mean_absolute_percentage_error(data['Happiness'], data['Predicted_H']))


Applying TRAIN-TEST SPLIT

In [None]:
#Define outcome and predictor(s) (same as above)
y=data[['Happiness']]
x=data[['Life_exp']]

#Split x and y into train and test sets (20% held out for testing, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#A notebook cell only echoes its last expression, so results are printed explicitly
print('Training rows:', len(x_train))
print('Testing rows:', len(x_test))


#USING SCIKIT-LEARN
#Train model on training data
model = LinearRegression()
model.fit(x_train, y_train)
#Getting intercept and slope values
print('scikit-learn intercept:', model.intercept_)
print('scikit-learn slope:', model.coef_)


#Evaluate model with TRAINING DATA
#Predict once and reuse the result for both metrics
train_pred = model.predict(x_train)

print('scikit-learn train R2:', r2_score(y_train, train_pred))
print('scikit-learn train MAPE:', mean_absolute_percentage_error(y_train, train_pred))


#Evaluate model with TESTING DATA
test_pred = model.predict(x_test)

print('scikit-learn test R2:', r2_score(y_test, test_pred))
print('scikit-learn test MAPE:', mean_absolute_percentage_error(y_test, test_pred))



#USING STATSMODELS
#recreate two separate dataframes, one of training and one of testing data only
#(the formula interface needs outcome and predictor in the same dataframe)
trainset=pd.concat([x_train, y_train], axis=1)  #axis=1 concats along columns, axis=0 concats along index
testset=pd.concat([x_test, y_test], axis=1)


#Train model on training dataset, and evaluate model with TRAINING DATA
model = ols('Happiness ~ Life_exp',trainset).fit()
print(model.summary())

#Calculate MAPE
train_pred = model.predict(trainset)

print('statsmodels train MAPE:', mean_absolute_percentage_error(trainset['Happiness'], train_pred))


#Evaluate model with TESTING DATA
test_pred = model.predict(testset)

print('statsmodels test R2:', r2_score(testset['Happiness'], test_pred))
print('statsmodels test MAPE:', mean_absolute_percentage_error(testset['Happiness'], test_pred))


Validating classification models

In [None]:
#Define outcome and predictor(s)
y = data['Regime']
x = data[['Happiness','GDP_log']]

#confusion_matrix orders its rows/columns by SORTED class label, whereas unique()
#returns labels in order of first appearance. Sort the labels and pass them to
#confusion_matrix explicitly so the heatmap ticks always match the cells.
labels = sorted(data['Regime'].unique())


def _show_confusion_matrix(y_true, y_pred, class_labels, title):
    """Draw a confusion-matrix heatmap on its own figure and return the matrix.

    Parameters
    ----------
    y_true : array-like
        Actual class labels.
    y_pred : array-like
        Class labels predicted by the model.
    class_labels : list
        Class labels fixing the row/column order of the matrix and the tick labels.
    title : str
        Title shown above the heatmap.

    Returns
    -------
    numpy.ndarray
        Confusion matrix with actual classes in rows and predicted classes in columns.
    """
    matrix = metrics.confusion_matrix(y_true, y_pred, labels=class_labels)
    #A fresh figure per heatmap: otherwise successive heatmaps are drawn over each other
    plt.figure()
    #fmt='d' prints the counts as integers (the default switches to scientific notation from 100)
    sns.heatmap(matrix, annot=True, fmt='d', cmap="Blues", yticklabels=class_labels, xticklabels=class_labels, annot_kws={"size": 20})
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title(title)
    plt.show()
    return matrix


#Revisit lecture 6 example, without train-test split
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x,y)
#A notebook cell only echoes its last expression, so scores are printed explicitly
print('Accuracy (no train-test split):', knn.score(x, y))

cnf_matrix = _show_confusion_matrix(data['Regime'], knn.predict(x), labels, 'All data (no train-test split)')


#Split x and y into train and test sets, with stratify=y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

#Train k-nn model on training data with k=3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)

#Evaluate model with TRAINING DATA
print('Training accuracy:', knn.score(x_train,y_train))

cnf_matrix = _show_confusion_matrix(y_train, knn.predict(x_train), labels, 'Training data')



#Evaluate model with TESTING DATA
print('Testing accuracy:', knn.score(x_test,y_test))

cnf_matrix = _show_confusion_matrix(y_test, knn.predict(x_test), labels, 'Testing data')


K-fold cross-validation

In [None]:
from sklearn.model_selection import cross_val_score


# Compute 5-fold cross-validation scores for regression model
y=data[['Happiness']]
x=data[['Life_exp']]
model = LinearRegression()

cv_scores = cross_val_score(model,x, y, cv=5, scoring='r2')  #5 folds, specify r2 as metric
# Printed explicitly: a notebook cell only echoes its last expression,
# so this mean would otherwise be computed and discarded
print('Mean 5-fold R2 (regression):', np.mean(cv_scores))


# Compute 5-fold cross-validation scores for classification model
y = data['Regime']
x = data[['Happiness','GDP_log']]
knn = KNeighborsClassifier(n_neighbors=3)

cv_scores = cross_val_score(knn, x, y, cv=5, scoring='accuracy')  #5 folds, specify accuracy as metric
print('Mean 5-fold accuracy (classification):', np.mean(cv_scores))



Manually RESAMPLING different train-test sets

In [None]:
#Number of random train-test splits to average over
N_RESAMPLES = 100

#For regression
y=data[['Happiness']]
x=data[['Life_exp']]
model = LinearRegression()

r2s =[] #create lists to store the various scores of different combinations
mapes =[]

for i in range(N_RESAMPLES):
    #random_state=i still gives a different training set on every iteration,
    #but makes the whole experiment reproducible from run to run
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=i)
    model.fit(x_train,y_train)
    #predict once and reuse the result for both metrics
    y_pred = model.predict(x_test)
    r2s.append(r2_score(y_test,y_pred))
    mapes.append(mean_absolute_percentage_error(y_test,y_pred))

#compile the two lists into df for further analysis
df = pd.DataFrame({'R2': r2s, 'MAPE': mapes})
#printed explicitly: a notebook cell only echoes its last expression
print('Regression, mean test scores over', N_RESAMPLES, 'splits:')
print(df.mean())




#For classification
y = data['Regime']
x = data[['Happiness','GDP_log']]
knn = KNeighborsClassifier(n_neighbors=3)

accuracy=[]

for i in range(N_RESAMPLES):
    #random_state=i: a different but reproducible stratified split on every iteration
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=i)
    knn.fit(x_train,y_train)
    accuracy.append(knn.score(x_test,y_test))

#compile the list into df for further analysis
df = pd.DataFrame(accuracy, columns=['accuracy'])
print('Classification, mean test accuracy over', N_RESAMPLES, 'splits:')
print(df.mean())


SEMINAR 7

In [None]:
# Reload the dataset from the CSV file and pick the outcome and the two predictors
data = pd.read_csv('Lecture7.csv', index_col='Country')
x = data[['Happiness', 'GDP_log']]
y = data['Regime']

# Create training and test sets (20% held out, stratified on the outcome)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate values of k (1 to 15) and one accuracy slot per candidate
neighbors = np.arange(1, 16)
train_accuracy = np.empty(neighbors.shape)
test_accuracy = np.empty(neighbors.shape)

# Fit one k-NN classifier per candidate k and record its accuracy on each set
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    train_accuracy[i] = knn.score(x_train, y_train)
    test_accuracy[i] = knn.score(x_test, y_test)

# Element-wise mean of the training and testing accuracy for every k
average_accuracy = (train_accuracy + test_accuracy) / 2

# Plot the three accuracy curves against k
plt.title('k-NN: Varying Number of Neighbors')
for curve, curve_label in (
    (train_accuracy, 'train accuracy'),
    (test_accuracy, 'test accuracy'),
    (average_accuracy, 'avg accuracy'),
):
    plt.plot(neighbors, curve, label=curve_label)
plt.xticks(neighbors)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
