In [22]:
import json
import random

import numpy as np
import pandas as pd

import functions
import models

# Seed every RNG the notebook may draw from so a fresh "Run All" is repeatable.
# `random.seed` only affects the standard-library generator; NumPy-based code
# (presumably including the scikit-learn estimators used below when they are
# left at random_state=None) draws from NumPy's global generator instead.
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)

# Train / validation / test partitions of the features (X_*) and labels (y_*).
# NOTE(review): the split and the standardisation are both done inside the
# helper; confirm there that the scaler is fitted on the training split only.
X_train, X_val, X_test, y_train, y_val, y_test = functions.get_data_split_and_standardise()


In [34]:
# Load the simplified dataset, indexed by its country/area code column, and
# show it (pandas truncates the middle rows of a long frame automatically).
simple_df_path = '../data/simple_df.csv'
data = pd.read_csv(simple_df_path, index_col='Country or Area Code')
data

Unnamed: 0_level_0,RURAL,URBAN,ALLAREA,Label
Country or Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,81.0,97.0,90,other
4,97.0,100.0,98,hydro
8,100.0,100.0,100,hydro
20,100.0,100.0,100,hydro
28,100.0,100.0,100,combustible
...,...,...,...,...
850,100.0,100.0,100,solar
858,100.0,100.0,100,combustible
860,100.0,100.0,100,combustible
882,99.0,100.0,99,combustible


# Setting a baseline
We first set a baseline using a simple model: logistic regression with L2 regularisation. Its training and validation scores are shown in the table below.

In [24]:
# Baseline model: logistic regression with an L2 penalty, built from the
# training and validation splits. Its `results` attribute is rendered as a
# one-row summary table.
lr_l2 = models.LogisticRegressionModel(
    X_train, X_val, y_train, y_val,
    penalty='l2',
)
pd.DataFrame(lr_l2.results)

Unnamed: 0,name,penalty,training score,validation score
0,Logistic Regression,l2,0.648148,0.642857


# Evaluation of different models

We evaluate the performance of four further models: nearest neighbours, a support vector machine (SVM), a random forest, and AdaBoost.

In [25]:
# Display names of the candidate models; order must match `classifiers` below
# because the two lists are zipped together when the grid search is set up.
names = [
    "Nearest Neighbors",
    "SVM",
    "Random Forest",
    "AdaBoost"
]

# Untuned estimator instances, one per name.
# The two ensemble models are stochastic, so they are given a fixed
# `random_state` (same value as the notebook's `random.seed(2021)`); without it
# their scores can change from run to run.
# NOTE(review): assumes `models` re-exports the scikit-learn estimator classes,
# which accept `random_state` -- confirm in models.py.
classifiers = [
    models.KNeighborsClassifier(),
    models.SVC(),
    models.RandomForestClassifier(random_state=2021),
    models.AdaBoostClassifier(random_state=2021)
]

# Show the list of models being compared.
pd.DataFrame(names, columns=["Model"])

Unnamed: 0,Model
0,Nearest Neighbors
1,SVM
2,Random Forest
3,AdaBoost


In [26]:
# Hyperparameter values to try for each candidate model, keyed by the same
# display names used in `names`.
param_grids = {
    'Nearest Neighbors': {
        'n_neighbors': [1, 2, 3, 5, 10],
    },
    'SVM': {
        'kernel': ['linear', 'rbf'],
        'C': [0.05, 0.1, 2, 40],
    },
    'Random Forest': {
        'max_depth': [5, 7, 10],
        'n_estimators': [5, 10, 15],
        'max_features': [1, 2, 3],
    },
    'AdaBoost': {
        'n_estimators': [20, 35, 50],
    },
}

# Pair every estimator with its grid: {name: {'model': ..., 'params': ...}}.
model_dict = {
    name: {'model': classifier, 'params': param_grids[name]}
    for name, classifier in zip(names, classifiers)
}

# Hand the whole specification to the project helper, which prints an
# "EVALUATING <name>." line per model and returns the per-model results.
grid_search_results = functions.train_all_models(model_dict, X_train, X_val, y_train, y_val)

EVALUATING Nearest Neighbors.
EVALUATING SVM.
EVALUATING Random Forest.
EVALUATING AdaBoost.


# Results of the grid search

In [27]:
# Baseline row: the logistic-regression summary without its 'penalty' column.
df = pd.DataFrame(lr_l2.results).drop(columns='penalty')

# Each grid-search entry is stored as a JSON string; decode it into a dict.
results_dict = {
    name: json.loads(grid_search_results[name])
    for name in grid_search_results
}

# One row per tuned model (the decoded dicts become columns, hence the transpose).
df2 = pd.DataFrame(results_dict).T

# Stack the baseline on top of the tuned models and label rows by model name.
pd.concat([df, df2]).set_index('name')

Unnamed: 0_level_0,training score,validation score,n_neighbors,kernel,C,max_depth,n_estimators,max_features
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Logistic Regression,0.648148,0.642857,,,,,,
KNN,0.703704,0.642857,3.0,,,,,
SVM,0.62963,0.714286,,linear,40.0,,,
RandomForest,0.814815,0.642857,,,,5.0,5.0,1.0
AdaBoost,0.648148,0.571429,,,,,20.0,
