In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

Reading the datasets

In [3]:
cab_data = pd.read_csv('../data/Cab_Data.csv')
customers_data = pd.read_csv('../data/Customer_ID.csv')
transactions_data = pd.read_csv('../data/Transaction_ID.csv')
cities_data = pd.read_csv('../data/City.csv')

In [4]:
# replace spaces with underscore
for col in cab_data.columns:
    if ' ' in col:
        cab_data = cab_data.rename(columns={col:col.replace(' ', '_')})
for col in customers_data.columns:
    if ' ' in col:
        customers_data = customers_data.rename(columns={col:col.replace(' ', '_')})
for col in transactions_data.columns:
    if ' ' in col:
        transactions_data = transactions_data.rename(columns={col:col.replace(' ', '_')})

# lowercase all columns
cab_data.columns = map(str.lower, cab_data.columns)
customers_data.columns = map(str.lower, customers_data.columns)
transactions_data.columns = map(str.lower, transactions_data.columns)
cities_data.columns = map(str.lower, cities_data.columns)

# renaming income per month
customers_data = customers_data.rename(columns={'income_(usd/month)':'income'})

In [5]:
cab_data['date_of_travel'] = pd.to_datetime(cab_data['date_of_travel'], format='%d-%m-%Y')
months = []
years =[]
for i in range(len(cab_data['date_of_travel'])):
    months.append(cab_data['date_of_travel'][i].month)
    years.append(cab_data['date_of_travel'][i].year)

cab_data['month'] = months
cab_data['year'] = years

cab_data = cab_data.drop(['date_of_travel'], axis=1)
cab_data.head()

Unnamed: 0,transaction_id,company,city,km_travelled,price_charged,cost_of_trip,month,year
0,10000011,Pink Cab,ATLANTA GA,30.45,370.95,313.635,1,2016
1,10000012,Pink Cab,ATLANTA GA,28.62,358.52,334.854,1,2016
2,10000013,Pink Cab,ATLANTA GA,9.04,125.2,97.632,1,2016
3,10000014,Pink Cab,ATLANTA GA,33.17,377.4,351.602,1,2016
4,10000015,Pink Cab,ATLANTA GA,8.73,114.62,97.776,1,2016


In [6]:
left = cab_data.set_index(['transaction_id'])
right = transactions_data.set_index(['transaction_id'])
data = left.join(right)

left = data.set_index([data.index,'customer_id'])
right = customers_data.set_index(['customer_id'])
data = left.join(right)

left = data.set_index([data.index,'city'])
right = cities_data.set_index(['city'])
data = left.join(right)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,company,km_travelled,price_charged,cost_of_trip,month,year,payment_mode,gender,age,income,population,users
transaction_id,customer_id,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10000011,29290,ATLANTA GA,Pink Cab,30.45,370.95,313.635,1,2016,Card,Male,28,10813,814885,24701
10000012,27703,ATLANTA GA,Pink Cab,28.62,358.52,334.854,1,2016,Card,Male,27,9237,814885,24701
10000013,28712,ATLANTA GA,Pink Cab,9.04,125.2,97.632,1,2016,Cash,Male,53,11242,814885,24701
10000014,28020,ATLANTA GA,Pink Cab,33.17,377.4,351.602,1,2016,Cash,Male,23,23327,814885,24701
10000015,27182,ATLANTA GA,Pink Cab,8.73,114.62,97.776,1,2016,Card,Male,33,8536,814885,24701


In [7]:
pink = data[data['company']=='Pink Cab']
yellow = data[data['company']=='Yellow Cab']

In [8]:
pink = pink.drop(['company', 'payment_mode', 'gender', 'population', 'users'], axis=1)
yellow = yellow.drop(['company', 'payment_mode', 'gender', 'population', 'users'], axis=1)

In [9]:
pink_X = pink.drop(['price_charged'], axis=1)
pink_y = pink['price_charged'].copy()

yellow_X = yellow.drop(['price_charged'], axis=1)
yellow_y = yellow['price_charged'].copy()

Testing LinearRegression model

In [10]:
# since we are going to use multiple models we are splitting them here for easier reading
X_train_pink, X_test_pink, y_train_pink, y_test_pink = train_test_split(pink_X, pink_y, test_size=0.3)
X_train_yellow, X_test_yellow, y_train_yellow, y_test_yellow = train_test_split(yellow_X, yellow_y, test_size=0.3)

In [11]:
lr_pink = LinearRegression().fit(X_train_pink, y_train_pink)

lrp_train_score = lr_pink.predict(X_train_pink)
lrp_train_score = np.sqrt(mean_squared_error(y_train_pink, lrp_train_score))
print('Train score:', lrp_train_score)

lrp_pred = lr_pink.predict(X_test_pink)
lrp_pred = np.sqrt(mean_squared_error(y_test_pink, lrp_pred))
print('Test score:', lrp_pred)

lrp_accuracy = round(lr_pink.score(X_test_pink,y_test_pink), 5)
print('The accuracy is: ', str(lrp_accuracy * 100), '%')

Train score: 67.43173977284934
Test score: 67.44773250560318
The accuracy is:  86.417 %


In [12]:
lr_yellow = LinearRegression().fit(X_train_yellow, y_train_yellow)

lry_train_score = lr_pink.predict(X_train_yellow)
lry_train_score = np.sqrt(mean_squared_error(y_train_yellow, lry_train_score))
print('Train score:', lry_train_score)

lry_pred = lr_yellow.predict(X_test_yellow)
lry_pred = np.sqrt(mean_squared_error(y_test_yellow, lry_pred))
print('Test score:', lry_pred)

lry_accuracy = round(lr_yellow.score(X_test_yellow,y_test_yellow), 5)
print('Accuracy: ', str(lry_accuracy * 100), '%')

Train score: 224.00628670785088
Test score: 145.6174173895393
Accuracy:  74.366 %


Linear Support Vector Regressor

In [13]:
lsvr_pink = LinearSVR()
lsvr_yellow = LinearSVR()

In [14]:
lsvr_pink.fit(X_train_pink, y_train_pink)

lsvrp_train_score = lsvr_pink.predict(X_train_pink)
lsvrp_train_score = np.sqrt(mean_squared_error(y_train_pink, lsvrp_train_score))
print('Train score:', lsvrp_train_score)

lsvrp_pred = lr_pink.predict(X_test_pink)
lsvrp_pred = np.sqrt(mean_squared_error(y_test_pink, lsvrp_pred))
print('Test score:', lsvrp_pred)

lsvrp_accuracy = round(lsvr_pink.score(X_test_pink,y_test_pink), 5)
print('The accuracy is: ', str(lsvrp_accuracy * 100), '%')

Train score: 77.48818187617546
Test score: 67.44773250560318
The accuracy is:  81.933 %


In [15]:
lsvr_yellow.fit(X_train_yellow, y_train_yellow)

lsvry_train_score = lsvr_yellow.predict(X_train_yellow)
lsvry_train_score = np.sqrt(mean_squared_error(y_train_yellow, lsvry_train_score))
print('Train score:', lsvry_train_score)

lsvry_pred = lsvr_yellow.predict(X_test_yellow)
lsvry_pred = np.sqrt(mean_squared_error(y_test_yellow, lsvry_pred))
print('Test score:', lsvry_pred)

lsvry_accuracy = round(lsvr_yellow.score(X_test_yellow,y_test_yellow), 5)
print('Accuracy: ', str(lsvry_accuracy * 100), '%')

Train score: 213.620985897268
Test score: 212.7401733898184
Accuracy:  45.287 %


Dicision Tree Regression

In [16]:
tree_pink = DecisionTreeRegressor(max_depth=7)
tree_yellow = DecisionTreeRegressor(max_depth=7)

In [17]:
tree_pink.fit(X_train_pink, y_train_pink)

tp_train_score = tree_pink.predict(X_train_pink)
tp_train_score = np.sqrt(mean_squared_error(y_train_pink, tp_train_score))
print('Train score:', tp_train_score)

tp_pred = tree_pink.predict(X_test_pink)
tp_pred = np.sqrt(mean_squared_error(y_test_pink, tp_pred))
print('Test score:', tp_pred)

tp_accuracy = round(tree_pink.score(X_test_pink,y_test_pink), 5)
print('Accuracy: ', str(tp_accuracy * 100), '%')

Train score: 65.35036724865327
Test score: 66.1554010345296
Accuracy:  86.932 %


In [18]:
tree_yellow.fit(X_train_yellow, y_train_yellow)

ty_train_score = tree_yellow.predict(X_train_yellow)
ty_train_score = np.sqrt(mean_squared_error(y_train_yellow, ty_train_score))
print('Train score:', ty_train_score)

ty_pred = tree_yellow.predict(X_test_yellow)
ty_pred = np.sqrt(mean_squared_error(y_test_yellow, ty_pred))
print('Test score:', ty_pred)

ty_accuracy = round(tree_yellow.score(X_test_yellow,y_test_yellow), 5)
print('Accuracy: ', str(ty_accuracy * 100), '%')

Train score: 144.01895431969405
Test score: 144.35705627046374
Accuracy:  74.80799999999999 %


Random Forest Regressor

In [19]:
forest_pink = RandomForestRegressor()
forest_yellow = RandomForestRegressor()

In [20]:
forest_pink.fit(X_train_pink, y_train_pink)

fp_train_score = forest_pink.predict(X_train_pink)
fp_train_score = np.sqrt(mean_squared_error(y_train_pink, fp_train_score))
print('Train score:', fp_train_score)

fp_pred = forest_pink.predict(X_test_pink)
fp_pred = np.sqrt(mean_squared_error(y_test_pink, fp_pred))
print('Test score:', fp_pred)

fp_accuracy = round(forest_pink.score(X_test_pink,y_test_pink), 5)
print('Accuracy: ', str(fp_accuracy * 100), '%')

Train score: 25.27042867863134
Test score: 67.62886323334298
Accuracy:  86.344 %


In [21]:
forest_yellow.fit(X_train_yellow, y_train_yellow)

fy_train_score = forest_yellow.predict(X_train_yellow)
fy_train_score = np.sqrt(mean_squared_error(y_train_yellow, fy_train_score))
print('Train score:', fy_train_score)

fy_pred = forest_yellow.predict(X_test_yellow)
fy_pred = np.sqrt(mean_squared_error(y_test_yellow, fy_pred))
print('Test score:', fy_pred)

fy_accuracy = round(forest_yellow.score(X_test_yellow,y_test_yellow), 5)
print('Accuracy: ', str(fy_accuracy * 100), '%')

Train score: 55.428290216011455
Test score: 148.25601980027753
Accuracy:  73.429 %


Grouping scores together

In [22]:
lry = {'Train': round(lry_train_score, 2), 'Test': round(lry_pred, 2)}
lsvry = {'Train': round(lsvry_train_score, 2), 'Test': round(lsvry_pred, 2)}
ty = {'Train': round(ty_train_score, 2), 'Test': round(ty_pred, 2)}
fy = {'Train': round(fy_train_score, 2), 'Test': round(fy_pred, 2)}

lrp = {'Train': round(lrp_train_score, 2), 'Test': round(lrp_pred, 2)}
lsvrp = {'Train': round(lsvrp_train_score, 2), 'Test': round(lsvrp_pred, 2)}
tp = {'Train': round(tp_train_score, 2), 'Test': round(tp_pred, 2)}
fp = {'Train': round(fp_train_score, 2), 'Test': round(fp_pred, 2)}


In [23]:
index=['Linear Regression', 'LinearSVR', 'Decision Tree', 'Random Forrest']
final_scores_yellow = pd.DataFrame([lry, lsvry, ty, fy], index=index)
final_scores_yellow

Unnamed: 0,Train,Test
Linear Regression,224.01,145.62
LinearSVR,213.62,212.74
Decision Tree,144.02,144.36
Random Forrest,55.43,148.26


In [24]:
index=['Linear Regression', 'LinearSVR', 'Decision Tree', 'Random Forrest']
final_scores_pink = pd.DataFrame([lrp, lsvrp, tp, fp], index=index)
final_scores_pink

Unnamed: 0,Train,Test
Linear Regression,67.43,67.45
LinearSVR,77.49,67.45
Decision Tree,65.35,66.16
Random Forrest,25.27,67.63


In [25]:
index=['Linear Regression', 'LinearSVR', 'Decision Tree', 'Random Forrest']
results = pd.DataFrame([lry_accuracy*100, lsvry_accuracy*100, ty_accuracy*100, fy_accuracy*100], index=index, columns=['Accuracy'])
results

Unnamed: 0,Accuracy
Linear Regression,74.366
LinearSVR,45.287
Decision Tree,74.808
Random Forrest,73.429


In [26]:
index=['Linear Regression', 'LinearSVR', 'Decision Tree', 'Random Forrest']
results = pd.DataFrame([lrp_accuracy*100, lsvrp_accuracy*100, tp_accuracy*100, fp_accuracy*100], index=index, columns=['Accuracy'])
results

Unnamed: 0,Accuracy
Linear Regression,86.417
LinearSVR,81.933
Decision Tree,86.932
Random Forrest,86.344
