In [1]:
# Restore objects persisted with IPython's %store in an earlier session/notebook.
# NOTE: only X_train and y_train are read in their restored form. X_train,
# X_test, y_train and y_test are all rebound by the train_test_split cell, and
# `preprocessor` is rebuilt as a new ColumnTransformer before its first use.
%store -r preprocessor
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [2]:
#Importing relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [4]:
# Show the restored training features as the cell output.
X_train

Unnamed: 0,country,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...,...,...
23519,Uganda,Rural,Yes,4,48,Female,Head of Household,Divorced/Seperated,No formal education,Other Income
23520,Uganda,Rural,Yes,2,27,Female,Head of Household,Single/Never Married,Secondary education,Other Income
23521,Uganda,Rural,Yes,5,27,Female,Parent,Widowed,Primary education,Other Income
23522,Uganda,Urban,Yes,7,30,Female,Parent,Divorced/Seperated,Secondary education,Self employed


In [5]:
# Use the restored training features as the full feature matrix that is split below.
X=X_train

In [6]:
# Use the restored training target as the full target vector that is split below.
y=y_train

In [7]:
# Column groups used by the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded.
numerical_features = ["household_size", "age_of_respondent"]
categorical_features = [
    "country",
    "location_type",
    "cellphone_access",
    "gender_of_respondent",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
]

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. This rebinds X_train, X_test, y_train and y_test, replacing
# the objects of the same names restored with %store at the top.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Preprocessing: scale the numeric columns and one-hot encode the categorical
# columns, routing each column group through a single ColumnTransformer.
numerical_transformer = StandardScaler()
# handle_unknown="ignore": a category that was absent from the data the encoder
# was fitted on (e.g. missing from a cross-validation fold, or only present in
# Test.csv) is encoded as all zeros instead of raising at transform/predict time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [10]:
# Baseline model: preprocessing followed by a 100-tree random forest.
# The step names matter: "classifier" is the prefix used when tuning
# hyperparameters through the pipeline (classifier__<param>).
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Train the whole pipeline (preprocessing + forest) on the training split.
pipeline.fit(X_train, y_train)

In [11]:
# Predict on the held-out validation split (X_test) with the baseline pipeline.
y_pred = pipeline.predict(X_test)


In [12]:
# Score the baseline pipeline's predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision/recall/F1 and the raw confusion counts.
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.82
Classification Report:
               precision    recall  f1-score   support

          No       0.88      0.91      0.90      3212
         Yes       0.43      0.35      0.39       607

    accuracy                           0.82      3819
   macro avg       0.66      0.63      0.64      3819
weighted avg       0.81      0.82      0.82      3819

Confusion Matrix:
 [[2927  285]
 [ 392  215]]


In [13]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Search space for the random forest inside the pipeline. The "classifier__"
# prefix routes each parameter to the pipeline step named "classifier".
# 'auto' is deliberately not offered for max_features: for
# RandomForestClassifier it was only an alias of 'sqrt', it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every sampled candidate using it
# fails to fit.
param_dist = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__max_features': ['sqrt', 'log2'],
}

# Sample 10 candidates from the space and score each with 5-fold
# cross-validated accuracy, using all available cores; the seed fixes which
# candidates are drawn.
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)

# Fit the randomized search to the training data
randomized_search.fit(X_train, y_train)

# Report the best-scoring sampled combination
best_params = randomized_search.best_params_
print(f"Best Hyperparameters: {best_params}")


Best Hyperparameters: {'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 10}


In [14]:
# Take the best estimator found by the randomized search and predict on the
# held-out validation split (X_test).
best_model = randomized_search.best_estimator_
pred = best_model.predict(X_test)


In [15]:
# Score the tuned model's predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print(f"Tuned Model Accuracy: {accuracy:.2f}")

# Per-class precision/recall/F1 and the raw confusion counts.
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=pred))


Tuned Model Accuracy: 0.86
Classification Report:
               precision    recall  f1-score   support

          No       0.87      0.98      0.92      3212
         Yes       0.70      0.26      0.38       607

    accuracy                           0.86      3819
   macro avg       0.79      0.62      0.65      3819
weighted avg       0.85      0.86      0.84      3819

Confusion Matrix:
 [[3145   67]
 [ 450  157]]


In [16]:
# Load the test data to predict on from Test.csv (path relative to the working directory).
test_data=pd.read_csv('Test.csv')

In [17]:
# Predict with the tuned model on the loaded test data.
# NOTE: this rebinds y_pred, which earlier held the baseline pipeline's
# predictions on the validation split.
y_pred = best_model.predict(test_data)

In [18]:
#Submission file
# Add a 'unique_id' column: uniqueid and country joined with 'x' (no spaces).
# NOTE(review): the submission cell builds its own id with " x " (spaces around
# the x) and does not read this column, so the two formats differ.
test_data['unique_id'] = test_data['uniqueid'].astype(str) + 'x' + test_data['country']

# Preview the first rows, including the new 'unique_id' column
test_data.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type,unique_id
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government,uniqueid_6056xKenya
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private,uniqueid_6060xKenya
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent,uniqueid_6065xKenya
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent,uniqueid_6072xKenya
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent,uniqueid_6073xKenya


In [19]:
# Assemble the submission table: an id of the form "<uniqueid> x <country>"
# next to the prediction held in y_pred for each test row.
submission = pd.DataFrame(
    {
        "uniqueid": test_data["uniqueid"] + " x " + test_data["country"],
        "bank_account": y_pred,
    }
)

In [20]:
# Write the submission to 'predicted_values_rf.csv'; index=False leaves the
# DataFrame index out of the file.
submission.to_csv('predicted_values_rf.csv', index=False)