In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import joblib

In [91]:
df = pd.read_csv("Travel.csv")
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [92]:
df['ProdTaken'].value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [93]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [94]:
df['ProductPitched'].value_counts()

ProductPitched
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: count, dtype: int64

In [95]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [96]:
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single':'Unmarried'})

In [97]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [98]:
df['Gender'] = df['Gender'].replace({'Fe Male':'Female'})

In [99]:
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [100]:
df.drop(columns=['CustomerID'],inplace=True)

In [101]:
missing = df.isnull().sum()
print(missing[missing > 0])

Age                         226
TypeofContact                25
DurationOfPitch             251
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64


In [102]:
## Filling median values with null values in the columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)

In [103]:
## Filling mode values in categorical Columns
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        mode_val = df[col].mode()[0]
        df[col] = df[col].fillna(mode_val)

In [104]:
missing = df.isnull().sum()
print(missing[missing > 0])

Series([], dtype: int64)


In [105]:
## one hot encoding
df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ProdTaken                    4888 non-null   int64  
 1   Age                          4888 non-null   float64
 2   CityTier                     4888 non-null   int64  
 3   DurationOfPitch              4888 non-null   float64
 4   NumberOfPersonVisiting       4888 non-null   int64  
 5   NumberOfFollowups            4888 non-null   float64
 6   PreferredPropertyStar        4888 non-null   float64
 7   NumberOfTrips                4888 non-null   float64
 8   Passport                     4888 non-null   int64  
 9   PitchSatisfactionScore       4888 non-null   int64  
 10  OwnCar                       4888 non-null   int64  
 11  NumberOfChildrenVisiting     4888 non-null   float64
 12  MonthlyIncome                4888 non-null   float64
 13  TypeofContact_Self

In [107]:
X = df.drop('ProdTaken', axis=1)
y = df['ProdTaken']


In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.2,stratify=y)

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
models = {
    'Random Classification': RandomForestClassifier(random_state=42),
}

In [111]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)

rs = RandomizedSearchCV(rf, param_distributions=param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, random_state=42)
rs.fit(X_train, y_train)


In [112]:
from sklearn.metrics import accuracy_score, classification_report
for name,model in models.items():
    print(name)
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
    print("Test  Accuracy:", accuracy_score(y_test, y_test_pred))

    print("\n Classification Report:")
    print(classification_report(y_test, y_test_pred))
    print("----------------------------------------------------------")

    joblib.dump(rs.best_estimator_, 'rf_model.pkl')

    # Save the features
    import json
    with open('model_features.json', 'w') as f:
        json.dump(X_train.columns.tolist(), f)


Random Classification
Train Accuracy: 1.0
Test  Accuracy: 0.9233128834355828

 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       794
           1       0.95      0.62      0.75       184

    accuracy                           0.92       978
   macro avg       0.93      0.81      0.85       978
weighted avg       0.93      0.92      0.92       978

----------------------------------------------------------


In [113]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64