# Random Forest - FABIOLA
1. fit a full random forest model
2. select features from feature importance (remove 3 - 2 lowest and 1 of the correlated features)
3. use CV and some 'arbitrary' chosen hypertuning parameters to choose the best parameters
4. Redo the reduce model with the best tuning parameters and 10 fold
5. get accuracy

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Convert to categorical variables
train_df['AI_Response_Time'] = train_df['AI_Response_Time'].astype('category')
train_df['Customer_Churn'] = train_df['Customer_Churn'].astype('category')
train_df['AI_Interaction_Level'] = train_df['AI_Interaction_Level'].astype('category')
train_df['Change_in_Usage_Patterns'] = train_df['Change_in_Usage_Patterns'].astype('category')
 
test_df['AI_Response_Time'] = test_df['AI_Response_Time'].astype('category')
test_df['Customer_Churn'] = test_df['Customer_Churn'].astype('category')
test_df['AI_Interaction_Level'] = test_df['AI_Interaction_Level'].astype('category')
test_df['Change_in_Usage_Patterns'] = test_df['Change_in_Usage_Patterns'].astype('category')

# Check the updated data types
print("Updated data types in the train dataset:")
print(train_df.dtypes)

Updated data types in the train dataset:
ID                                     int64
Age                                    int64
AI_Interaction_Level                category
Satisfaction_with_AI_Services          int64
AI_Personalization_Effectiveness       int64
AI_Response_Time                    category
Overall_Usage_Frequency                int64
Customer_Service_Interactions          int64
Change_in_Usage_Patterns            category
Customer_Churn                      category
dtype: object


In [4]:
# Split the data into features and target variable
X_train = train_df.drop(columns=['ID', 'Customer_Churn'])
y_train = train_df['Customer_Churn']
X_test = test_df.drop(columns=['ID', 'Customer_Churn'])
y_test = test_df['Customer_Churn']

In [5]:
# FullRandom Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

In [6]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6552


In [7]:
importances = rf_model.feature_importances_

# Display feature importances
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

                            Feature  Importance
0                               Age    0.209782
5           Overall_Usage_Frequency    0.203119
6     Customer_Service_Interactions    0.151096
2     Satisfaction_with_AI_Services    0.109862
3  AI_Personalization_Effectiveness    0.100789
7          Change_in_Usage_Patterns    0.092083
1              AI_Interaction_Level    0.072057
4                  AI_Response_Time    0.061212


In [8]:
# Based on the feature importances, decide which feature to remove
#  'AI Personalization Effectiveness' has lower importance and highly correlation with 'Satisfaction'
#  'AI_Response_Time' has low importance
# 'AI_Interaction_Level' has lower importance and negatively correlate with 'Age'
X_train_reduced = X_train.drop('AI_Personalization_Effectiveness', axis=1)
X_test_reduced = X_test.drop('AI_Personalization_Effectiveness', axis=1)
X_train_reduced = X_train_reduced.drop('AI_Response_Time', axis=1)
X_test_reduced = X_test_reduced.drop('AI_Response_Time', axis=1)
X_train_reduced = X_train.drop('AI_Interaction_Level', axis=1)
X_test_reduced = X_test.drop('AI_Interaction_Level', axis=1)

# Retrain the model with the reduced feature set
rf_model_reduced = RandomForestClassifier(random_state=42)
rf_model_reduced.fit(X_train_reduced, y_train)

# Evaluate the model with the reduced feature set
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
print("Accuracy with reduced feature set:", accuracy_reduced)


Accuracy with reduced feature set: 0.6592


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize the GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train_reduced, y_train)

# Print the best parameters and the corresponding accuracy
print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

# Use the best model to make predictions on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_reduced)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set:", accuracy)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END m

In [16]:
# Refit the Random Forest model with the best parameters
best_rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

# Train the model on the training data
best_rf_model.fit(X_train_reduced, y_train)

# Make predictions on the test set
y_pred_best_rf = best_rf_model.predict(X_test_reduced)

# Calculate the accuracy on the test set
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print("Accuracy of the best Random Forest model on the test set:", accuracy_best_rf)


Accuracy of the best Random Forest model on the test set: 0.672


## Bagging gives worse result

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Define the Random Forest model with m=p
rf_model_mp = RandomForestClassifier(n_estimators=100, max_features=X_train.shape[1], random_state=42)

# Train the model
rf_model_mp.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_rf_mp = rf_model_mp.predict(X_test)
accuracy_rf_mp = accuracy_score(y_test, y_pred_rf_mp)
print("Accuracy with Random Forest (m=p):", accuracy_rf_mp)


Accuracy with Random Forest (m=p): 0.632
