In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 
from pprint import pp

# Data import en opschonen

In [4]:
train = pd.read_csv("../data/train.csv", delimiter=';')

In [None]:
train.info()

In [6]:
train.head()

Unnamed: 0,target,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q1,r4h1,r4h2,...,computer,television,qmobilephone,lugar1,lugar2,lugar3,lugar4,lugar5,rural,age
0,0,0.0,0,4,0,1,1,0,0,1,...,0,0,3,1,0,0,0,0,1,43
1,0,0.0,0,8,0,1,1,1,0,1,...,0,0,3,0,0,0,0,0,1,18
2,0,0.0,0,5,0,1,1,0,0,2,...,0,1,2,0,0,1,0,0,0,62
3,1,0.0,0,8,0,1,1,2,0,2,...,1,1,4,1,0,0,0,0,0,20
4,0,350000.0,0,5,0,1,1,3,1,1,...,1,0,3,1,0,0,0,0,0,3


In [7]:
train["overcrowding"] = train["overcrowding"].str.replace(",", ".").astype("float")
train["meaneduc"] = train["meaneduc"].str.replace(",", ".").astype("float")

In [8]:
test = train.sample(frac=0.3)
train = train.drop(test.index)

# 1x Random Forest trainen

In [9]:
X = train.drop(columns="target")
Y = train["target"]

In [10]:

clf = RandomForestClassifier(n_estimators=500, max_depth=20, min_samples_leaf=10, n_jobs=-1, ccp_alpha=0.5)
clf = clf.fit(X, Y)

In [11]:
clf.score(test.drop(columns="target"), test["target"])

0.6287085514834206

In [12]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.5,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# Hyper parameter estimation

In [13]:


param_grid = [
    {
        'n_estimators': [120],
        'max_depth': [25],
        'min_samples_leaf': np.append(np.arange(10, 30), 5),
        'bootstrap': [False],
        'n_jobs': [-1],
        'class_weight': [None, 'balanced'],
        'min_impurity_decrease':[1e-3], 
        'min_samples_leaf':[2], 
        'verbose':[0],
    },
]


In [14]:
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1) 

In [15]:
# Training data
X_train = train.drop(columns="target")
Y_train = train["target"]
# Test data
X_test = test.drop(columns="target")
Y_test = test["target"]

In [16]:
# fitting the model for grid search 
grid.fit(X_train, Y_train) 

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [17]:
# print best parameter after tuning 
pp(grid.best_params_) 
grid_predictions = grid.predict(X_test) 

# print classification report 
print("\n" + classification_report(Y_test, grid_predictions)) 

{'bootstrap': False,
 'class_weight': None,
 'max_depth': 25,
 'min_impurity_decrease': 0.001,
 'min_samples_leaf': 2,
 'n_estimators': 120,
 'n_jobs': -1,
 'verbose': 0}

              precision    recall  f1-score   support

           0       0.79      0.89      0.83      1441
           1       0.76      0.59      0.67       851

    accuracy                           0.78      2292
   macro avg       0.77      0.74      0.75      2292
weighted avg       0.78      0.78      0.77      2292

