In [None]:
#Importing all the packages

from sklearn.metrics import precision_score, recall_score
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

## Loading the data and splitting it into test and train

In [None]:
# Read the cleaned train and test sets, discard the column we do not model on,
# and split each frame into a feature matrix and a target vector.
train = pd.read_csv(
    r'C:\Users\eskil\PycharmProjects\tdt4173\Data\clean\train_data_clean', sep=","
).drop(columns=["artist_followers"])
test = pd.read_csv(
    r'C:\Users\eskil\PycharmProjects\tdt4173\Data\clean\test_data_clean', sep=","
).drop(columns=["artist_followers"])

# Columns that must not be used as features.
# NOTE(review): "Unnamed: 0" is presumably the index column written by an earlier
# to_csv call - confirm against the cleaning step.
non_feature_columns = ["target", "Unnamed: 0"]

x_train, y_train = train.drop(columns=non_feature_columns), train["target"]
x_test, y_test = test.drop(columns=non_feature_columns), test["target"]


# Random Forest Model

## Tuning the different parameters
The hyperparameters are tuned with cross-validation (CV), and any scaling of the data is done inside the CV folds to avoid data leakage. The OOB score is not used because it evaluates each sample with only the subset of decision trees (DTs) in the random forest that did not see that sample during training, and because we have enough data to set aside a considerably large validation set in each cross-validation fold.

In [None]:
# Sweep n_estimators (max_depth fixed at 20) with 5-fold cross-validation to find
# the point where adding more trees stops improving the mean CV accuracy.
n_estimators = [1, 2, 5, 10, 50, 100, 150, 200, 300, 400, 500]
max_depth = [20]
param_grid = {"n_estimators": n_estimators, "max_depth": max_depth}

# A fixed seed makes the forests deterministic, so the scores are reproducible on a
# fresh kernel and differences between settings are not just sampling noise.
estimator = RandomForestClassifier(random_state=42)
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=5)
clf.fit(x_train, y_train)

# One mean CV score per candidate in the grid.
allscores = clf.cv_results_['mean_test_score']
print(allscores)

In [None]:
# Sweep max_depth (n_estimators fixed at 150) with 5-fold cross-validation.
# None lets every tree grow until its leaves are pure.
n_estimators = [150]
max_depth = [1, 2, 5, 10, 12, 15, 17, 20, 25, 30, 40, None]
param_grid = {"n_estimators": n_estimators, "max_depth": max_depth}

# A fixed seed makes the forests deterministic, so the scores are reproducible on a
# fresh kernel and differences between depths are not just sampling noise.
estimator = RandomForestClassifier(random_state=42)
clf = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=5)
clf.fit(x_train, y_train)

# One mean CV score per candidate in the grid.
allscores = clf.cv_results_['mean_test_score']
print(allscores)

In [None]:
# Peek at the first rows of the feature matrix to check which columns remain.
x_train.head(n=5)

In [None]:
# Show how the model increasingly overfits the training data as the trees get deeper,
# without any matching gain in accuracy on held-out data.
# A single train/evaluation split is used instead of cross-validation: the sweep is
# mainly for visualisation and CV would multiply the run time.
from matplotlib.legend_handler import HandlerLine2D  # not used in this cell; kept in case other cells rely on it

train_depth, evaluation_depth = train_test_split(train, test_size=0.2, random_state=4)

x_train_depth = train_depth.drop(columns=["target", "Unnamed: 0"])
y_train_depth = train_depth["target"]

x_eval_depth = evaluation_depth.drop(columns=["target", "Unnamed: 0"])
y_eval_depth = evaluation_depth["target"]

# Integer depths 1..25. max_depth is documented as an int; the float values that
# np.linspace produces are rejected by scikit-learn's parameter validation.
max_depths = list(range(1, 26))
train_results = []
eval_results = []

for depth in max_depths:
    # Seeded so the curves are reproducible and depth is the only thing that varies.
    rf = RandomForestClassifier(max_depth=depth, n_jobs=-1, n_estimators=150, random_state=42)
    rf.fit(x_train_depth, y_train_depth)

    # Accuracy in percent, rounded to two decimals.
    train_results.append(round(rf.score(x_train_depth, y_train_depth) * 100, 2))
    eval_results.append(round(rf.score(x_eval_depth, y_eval_depth) * 100, 2))

print(train_results)
print(eval_results)

# Label both curves; without a legend the two lines cannot be told apart.
plt.plot(max_depths, train_results, label="train")
plt.plot(max_depths, eval_results, label="evaluation")
plt.grid()
plt.legend()

plt.title("Accuracy train/test for different depth")
plt.xlabel("depth")
plt.ylabel("accuracy")

In [None]:
# Candidate values for the remaining hyperparameters. n_estimators, max_depth and
# criterion are held at a single value each.
min_samples_split = [2, 4, 6, 8, 12, 16, 20]
min_samples_leaf = [1, 3, 5, 9, 15, 27]
max_features = ["sqrt", 0.5, 0.7]
n_estimators = [150]
max_depth = [15]
criterion = ["gini"]

# Seeded base estimator: candidates are then compared on the same randomness and the
# selected parameters are reproducible on a fresh kernel.
rf_estimator = RandomForestClassifier(random_state=42)

# Build the grid of all parameter combinations and cross-validate every one of them
# with 5-fold CV. With the default refit, the best combination is refitted on the
# whole training set and exposed as clf.best_estimator_.
param_grid = {"criterion": criterion, "n_estimators": n_estimators,
              "max_depth": max_depth, "min_samples_split": min_samples_split,
              "min_samples_leaf": min_samples_leaf, "max_features": max_features}
clf = GridSearchCV(estimator=rf_estimator, param_grid=param_grid, n_jobs=-1, cv=5)
clf.fit(x_train, y_train)

# Parameters yielding the best mean CV accuracy.
clf.best_params_

## Testing the model with the best parameters, printing the accuracies and plotting the learning curve

In [None]:
# Evaluate the tuned model on the hold-out test set: accuracy, precision, recall and
# the confusion matrix.
# `rf` is bound to the refitted best estimator from the grid search, so the name no
# longer refers to the last forest trained in the depth sweep.
rf = clf.best_estimator_
predictions = rf.predict(x_test)

# All three metrics are reported in percent.
acc_rf = round(rf.score(x_test, y_test) * 100, 2)
prec_rf = precision_score(y_test, predictions) * 100
recall_rf = recall_score(y_test, predictions) * 100

print("accuracy:", acc_rf)
print("Precision:", prec_rf)
print("Recall:", recall_rf)

# confusion_matrix expects (y_true, y_pred): rows are the true classes and columns the
# predicted classes. Passing them the other way round transposes the matrix.
confusion_matrix(y_test, predictions)

In [None]:
# Learning curve of the tuned model: mean training and cross-validation accuracy for
# 30 training-set sizes between 1% and 100% of the available training data.
# The estimator is the best model found by the grid search. The global `rf` is not
# used here because it still holds the last forest from the depth sweep.
best_rf = clf.best_estimator_
train_sizes, train_scores, test_scores = learning_curve(
    best_rf, x_train, y_train, cv=5, scoring="accuracy",
    train_sizes=np.linspace(0.01, 1, 30), verbose=1,
)

# Average over the 5 folds (axis 1) to get one score per training-set size.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

# Plot the learning curve; the legend is required for the line labels to show.
plt.plot(train_sizes, train_mean, label='training score')
plt.plot(train_sizes, test_mean, label='cross-validation score')
plt.grid()
plt.legend()

plt.title("Learning Curve")
plt.xlabel("samples")
plt.ylabel("accuracy")

## Importances of the different features

In [None]:
# Feature importances of the tuned random forest, sorted from most to least important.
# They are read from the best estimator of the grid search, which was fitted on
# x_train. The global `rf` may still hold the last forest from the depth sweep.
best_rf = clf.best_estimator_
importances = pd.DataFrame({'feature': x_train.columns,
                            'importance': np.round(best_rf.feature_importances_, 4)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')

# Bar chart of every feature, followed by the 15 most important ones as a table.
# The table is the last expression so the notebook actually displays it.
importances.plot.bar(title="Random forest feature importances")
importances.head(15)