# Problem Statement

You work for an office transport company. You are in discussions with ABC Consulting about 
providing transport for their employees. For this purpose, you are tasked with understanding how the 
employees of ABC Consulting currently prefer to commute (between home and office). Based on 
parameters such as age, salary, work experience, etc. given in the data set ‘Transport.csv’, you are required to 
predict the preferred mode of transport. The project requires you to build several Machine Learning 
models and compare them so that the best model can be finalised.

Data Dictionary
Age : Age of the Employee in Years
Gender : Gender of the Employee
Engineer : For Engineer =1 , Non Engineer =0
MBA : For MBA =1 , Non MBA =0
Work Exp : Experience in years
Salary : Salary in Lakhs per Annum
Distance : Distance in Kms from Home to Office
license : If Employee has Driving Licence =1 , If not =0
Transport : Mode of Transport
The objective is to build various Machine Learning models on this data set and, based on the accuracy 
metrics, decide which model should be finalised for predicting the mode of transport chosen by the 
employee.

In [None]:
# !pip install imblearn

# To enable plotting graphs in Jupyter notebook
%matplotlib inline

In [None]:
#Importing all the required libraries
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt   
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.style
plt.style.use('classic')

In [None]:
#Reading the data
# NOTE(review): the problem statement names the data set 'Transport.csv'; confirm that
# 'Cars (2).csv' is the same data before relying on the results below.
o1 = pd.read_csv('Cars (2).csv')

In [None]:
# Display the DataFrame as loaded
o1

In [None]:
# First five rows
o1.head(5)

In [None]:
# Distinct values of the target column
o1.Transport.unique()

In [None]:
# Last five rows
o1.tail(5)

In [None]:
# Data type of each column
o1.dtypes

In [None]:
# Column dtypes, non-null counts and memory usage
o1.info()

In [None]:
# (rows, columns)
o1.shape

In [None]:
# Summary statistics, transposed so each column of the data becomes one row
o1.describe().T

In [None]:
# Column labels
o1.columns

# Univariate Analysis

## Evaluating Age

In [None]:
o1['Age'].unique()
o1['Age'].value_counts()

In [None]:
sns.distplot(o1['Age'],kde=False)

In [None]:
sns.boxplot(o1['Age'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))

#distplot
sns.distplot(o1['Age'],ax=ax2)
ax2.set_xlabel('Age', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['Age'])
ax3.set_xlabel('Age', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
# Distinct Gender values, then the number of employees in each.
# print() is needed for the first result: a notebook cell only displays its final expression.
print(o1['Gender'].unique())
o1['Gender'].value_counts()

In [None]:
o1['Engineer'].unique()
o1['Engineer'].value_counts()

In [None]:
sns.distplot(o1['Engineer'],kde=False)

In [None]:
sns.boxplot(o1['Engineer'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['Engineer'],ax=ax2)
ax2.set_xlabel('Engineer', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['Engineer'])
ax3.set_xlabel('Engineer', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
o1['MBA'].unique()
o1['MBA'].value_counts()

In [None]:
sns.distplot(o1['MBA'],kde=False)

In [None]:
sns.boxplot(o1['MBA'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['MBA'],ax=ax2)
ax2.set_xlabel('MBA', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['MBA'])
ax3.set_xlabel('MBA', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
o1['Work Exp'].unique()
o1['Work Exp'].value_counts()

In [None]:
sns.distplot(o1['Work Exp'],kde=False)

In [None]:
sns.boxplot(o1['Work Exp'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['Work Exp'],ax=ax2)
ax2.set_xlabel('Work Exp', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['Work Exp'])
ax3.set_xlabel('Work Exp', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
o1['Salary'].unique()
o1['Salary'].value_counts()

In [None]:
sns.distplot(o1['Salary'],kde=False)

In [None]:
sns.boxplot(o1['Salary'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['Salary'],ax=ax2)
ax2.set_xlabel('Salary', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['Salary'])
ax3.set_xlabel('Salary', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
o1['Distance'].unique()
o1['Distance'].value_counts()

In [None]:
sns.distplot(o1['Distance'],kde=False)

In [None]:
sns.boxplot(o1['Distance'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['Distance'],ax=ax2)
ax2.set_xlabel('Distance', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['Distance'])
ax3.set_xlabel('Distance', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
o1['license'].unique()
o1['license'].value_counts()

In [None]:
sns.distplot(o1['license'],kde=False)

In [None]:
sns.boxplot(o1['license'])

In [None]:
fig, (ax2,ax3)=plt.subplots(1,2,figsize=(13,5))
#distplot
sns.distplot(o1['license'],ax=ax2)
ax2.set_xlabel('license', fontsize=15)
ax2.tick_params(labelsize=15)

#histogram
ax3.hist(o1['license'])
ax3.set_xlabel('license', fontsize=15)
ax3.tick_params(labelsize=15)

plt.subplots_adjust(wspace=0.5)
plt.tight_layout()

In [None]:
# Distinct Transport values (the prediction target), then the number of employees in each.
# print() is needed for the first result: a notebook cell only displays its final expression.
print(o1['Transport'].unique())
o1['Transport'].value_counts()

In [None]:
# Box plots for the listed columns on one figure; vert=0 lays the boxes out horizontally
plt.figure(figsize=(15,15))
o1[['Age','Gender','Engineer','MBA','Work Exp','Salary','Distance','license','Transport']].boxplot(vert=0)

# Multivariate analysis: pair plot for continuous variables

In [None]:
# Grid of pairwise plots over the listed columns
sns.pairplot(o1[['Age','Gender','Engineer','MBA','Work Exp','Salary','Distance','license','Transport']])

In [None]:
# Correlation heatmap over the numeric columns only.
# select_dtypes() drops any text-typed columns before correlating: DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions silently skipped it).
plt.figure(figsize=(10,8))
sns.set(font_scale=1.2)
sns.heatmap(
    o1[['Age','Gender','Engineer','MBA','Work Exp','Salary','Distance','license','Transport']]
    .select_dtypes(include='number')
    .corr(),
    annot=True,
)

In [None]:
#Confirming the number of null values
# print() is required here: this expression is not the last one in the cell,
# so a notebook would otherwise discard its output.
print(o1.isnull().sum())

#Pairplot
# iloc[:, 0:10] keeps at most the first ten columns, selected by position;
# diag_kind='kde' draws density curves on the diagonal.
v = o1.iloc[:, 0:10]
sns.pairplot(v,diag_kind='kde')

## unique values for categorical variables

In [None]:
# For every object-typed column, report how many distinct values it holds
# and the frequency of each value (least frequent first).
object_columns = [name for name in o1.columns if o1[name].dtype == 'object']
for name in object_columns:
    series = o1[name]
    print(name.upper(), ': ', series.nunique())
    print(series.value_counts().sort_values())
    print('\n')

In [None]:
# Missing-value count per column
o1.isnull().sum()

In [None]:
# Boolean mask marking rows that repeat an earlier row
dups = o1.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))

# Show the rows flagged as duplicates
o1[dups]

In [None]:
## Let's rename columns to remove space before we impute
# The rename happens in place, so every cell run after this one must use "Work_Exp"
o1.rename(columns = {"Work Exp": "Work_Exp"},inplace=True)

# Outlier Checks

In [None]:
# construct box plot for continuous variables
# NOTE(review): iloc[:,:7] takes the first seven columns by position, whatever they
# are — confirm these are the intended continuous variables.
plt.figure(figsize=(10,10))
o1.iloc[:,:7].boxplot(vert=0)
plt.show()

# outlier treatment

In [None]:
def remove_outlier(col):
    """Return the 1.5 * IQR outlier bounds for a numeric column.

    Despite its name, this function removes nothing and leaves ``col``
    untouched: it only computes the limits beyond which a value counts as
    an outlier. Callers use the limits to cap or filter the data themselves.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse; passed straight to ``np.percentile``.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where
        ``lower_range = Q1 - 1.5 * IQR`` and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # np.percentile does not need pre-sorted input, so no explicit sort is done.
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# Cap outliers in the continuous columns at their 1.5 * IQR bounds.
# Columns are chosen by name rather than by position: a positional slice such as
# iloc[:, 1:3] takes whatever sits in positions 1-2, which by the data dictionary
# order are Gender and Engineer, not continuous features.
# NOTE(review): assumes Age, Work_Exp, Salary and Distance are the continuous
# features and that "Work Exp" has already been renamed to "Work_Exp" — confirm.
for column in ['Age', 'Work_Exp', 'Salary', 'Distance']:
    lr, ur = remove_outlier(o1[column])
    # clip() replaces values below lr with lr and values above ur with ur
    o1[column] = o1[column].clip(lower=lr, upper=ur)