# Implementation of Bagging using Random Forest

## Holiday Package Prediction

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# Ignore every warning category from this point on, to keep cell output clean.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages
# raised by later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

%matplotlib inline

In [55]:
# Load Travel.csv (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Travel.csv')
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'Travel.csv'

## Data Cleaning

### Handling Missing Values

1. Handling Missing Values.
2. Handling Duplicates.
3. Check data type.
4. Understand the dataset.

In [None]:
# Count the NaN entries in each column
df.isna().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [None]:
# Check the categories of the categorical columns, starting with Gender
df.loc[:, 'Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [None]:
# Frequency of each MaritalStatus label
df.loc[:, 'MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [None]:
# Normalise category labels:
#   Gender        -> fold the 'Fe Male' variant into 'Female'
#   MaritalStatus -> fold 'Single' into 'Unmarried'
df['Gender'] = df['Gender'].replace({'Fe Male': 'Female'})
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single': 'Unmarried'})

In [None]:
# Confirm that only the expected Gender labels remain
df.loc[:, 'Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [None]:
# Confirm that only the expected MaritalStatus labels remain
df.loc[:, 'MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [None]:
# Collect every column that contains at least one NaN, then report the
# percentage of missing rows per column (rounded to 5 decimal places).
features_with_na = [col for col in df.columns if df[col].isnull().any()]

for feature in features_with_na:
    pct_missing = np.round(df[feature].isnull().mean() * 100, 5)
    print(feature, pct_missing, ' % missing values')

Age 4.62357  % missing values
TypeofContact 0.51146  % missing values
DurationOfPitch 5.13502  % missing values
NumberOfFollowups 0.92062  % missing values
PreferredPropertyStar 0.53191  % missing values
NumberOfTrips 2.86416  % missing values
NumberOfChildrenVisiting 1.35025  % missing values
MonthlyIncome 4.76678  % missing values


In [None]:
# Summary statistics for the non-object columns that contain missing values
numeric_na_frame = df[features_with_na].select_dtypes(exclude='object')
numeric_na_frame.describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


## Imputing Null Values

1. Impute the median for the `Age` column.
2. Impute the mode for `TypeofContact`.
3. Impute the median for `DurationOfPitch`.
4. Impute the mode for `NumberOfFollowups`, as it is a discrete feature.
5. Impute the mode for `PreferredPropertyStar`.
6. Impute the median for `NumberOfTrips`.
7. Impute the mode for `NumberOfChildrenVisiting`.
8. Impute the median for `MonthlyIncome`.

In [None]:
# Impute missing values column by column.
#
# Each result is assigned back with `df[col] = ...` instead of calling
# `df.col.fillna(..., inplace=True)`: an in-place method on a column selected
# from the frame is chained assignment, which pandas Copy-on-Write does not
# propagate back to `df`.

# Numeric features imputed with the median
median_cols = ['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome']

# Categorical / discrete features imputed with the mode (most frequent value)
mode_cols = [
    'TypeofContact',
    'NumberOfFollowups',
    'PreferredPropertyStar',
    'NumberOfChildrenVisiting',
]

for col in median_cols:
    df[col] = df[col].fillna(df[col].median())

for col in mode_cols:
    # .mode() must be *called* and returns a Series (ties are possible);
    # .iloc[0] takes the first modal value as the scalar fill value.
    df[col] = df[col].fillna(df[col].mode().iloc[0])

In [None]:
# Re-run the missing-value report for the columns that previously had NaNs
for feature in features_with_na:
    remaining_pct = np.round(df[feature].isnull().mean() * 100, 5)
    print(feature, remaining_pct, ' % missing values')

Age 0.0  % missing values
TypeofContact 0.0  % missing values
DurationOfPitch 0.0  % missing values
NumberOfFollowups 0.0  % missing values
PreferredPropertyStar 0.0  % missing values
NumberOfTrips 0.0  % missing values
NumberOfChildrenVisiting 0.0  % missing values
MonthlyIncome 0.0  % missing values
