In [2]:
!pip install numpy pandas scikit-learn

Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/28/75/3b679b41713bb60e2e8f6e2f87be72c971c9e718b1c17b8f8749240ddca8/numpy-1.26.2-cp312-cp312-win_amd64.whl.metadata
  Downloading numpy-1.26.2-cp312-cp312-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.2 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.2 kB 682.7 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.2 kB 660.6 kB/s eta 0:00:01
     -------------------------------------- 61.2/61.2 kB 653.5 kB/s eta 0:00:00
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/df/92/a3fa053c74198f9f0224b2c04dc74f41d2e14e30329c082f7a657f9ca4c5/pandas-2.1.3-cp312-cp312-win_amd64.whl.metadata
  Downloading pandas-2.1.3-cp312-cp312-win_amd64.whl.metadata (18 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonh


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

In [8]:
# Load the training table and the test table from CSV files
# (relative paths, so both are resolved against the current working directory).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [9]:
# Training table: every column except 'Target' is a feature; 'Target' is the label.
X_train, y_train = train_df.drop(columns='Target'), train_df['Target']

# Test table: same split, so the model can be scored on data it was not fitted on.
X_test, y_test = test_df.drop(columns='Target'), test_df['Target']

In [10]:
# Baseline classifier: a random forest of 100 trees, with random_state=42
# fixing the random seed used while building the forest.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [11]:
# Predict labels for the test features with the baseline model.
predictions = model.predict(X_test)

# Compute the three summaries first, then print them in the same order as before:
# overall accuracy, confusion matrix, per-class precision/recall/F1 report.
baseline_accuracy = accuracy_score(y_test, predictions)
baseline_confusion = confusion_matrix(y_test, predictions)
baseline_report = classification_report(y_test, predictions)

print("Accuracy:", baseline_accuracy)
print(baseline_confusion)
print(baseline_report)

Accuracy: 0.9863636363636363
[[49  0  0  1]
 [ 1 56  0  0]
 [ 1  0 68  0]
 [ 0  0  0 44]]
              precision    recall  f1-score   support

           1       0.96      0.98      0.97        50
           2       1.00      0.98      0.99        57
           3       1.00      0.99      0.99        69
           4       0.98      1.00      0.99        44

    accuracy                           0.99       220
   macro avg       0.98      0.99      0.99       220
weighted avg       0.99      0.99      0.99       220



In [12]:
# Hyper-parameter search space for the random forest.
# 3 * 4 * 3 * 3 = 108 parameter combinations in total.
param_grid = dict(
    # number of trees in the forest
    n_estimators=[100, 200, 300],
    # maximum depth of each tree (None places no limit on depth)
    max_depth=[10, 20, 30, None],
    # minimum number of samples a node must hold before it may be split
    min_samples_split=[2, 5, 10],
    # minimum number of samples required at each leaf node
    min_samples_leaf=[1, 2, 4],
)

In [13]:
# Configure the exhaustive search: GridSearchCV fits one model per parameter
# combination and cross-validation fold, and ranks the combinations by accuracy.
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',   # metric used to compare candidates
    cv=3,                 # 3-fold cross-validation
    n_jobs=-1,            # use all available CPU cores
    verbose=2,            # print progress messages while fitting
)

In [14]:
# Run the grid search on the training data only; the test set is not used here.
grid_search.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [15]:
# Inspect the best parameters found by the search and the performance of the
# corresponding model on the test set.
best_grid = grid_search.best_estimator_   # the best estimator found by the search
best_params = grid_search.best_params_    # the parameter combination it was built with
print("Best Parameters:", best_params)

# Note: this rebinds `predictions`, replacing the baseline model's predictions.
predictions = best_grid.predict(X_test)
tuned_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", tuned_accuracy)
print(classification_report(y_test, predictions))

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy: 0.9863636363636363
              precision    recall  f1-score   support

           1       0.96      0.98      0.97        50
           2       1.00      0.98      0.99        57
           3       1.00      0.99      0.99        69
           4       0.98      1.00      0.99        44

    accuracy                           0.99       220
   macro avg       0.98      0.99      0.99       220
weighted avg       0.99      0.99      0.99       220

