In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

Reading the datasets

In [2]:
# Load the four source tables (rides, customers, transactions, cities).
# The files are read in the order listed, one DataFrame per CSV.
cab_data, customers_data, transactions_data, cities_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Cab_Data.csv',
        'data/Customer_ID.csv',
        'data/Transaction_ID.csv',
        'data/City.csv',
    )
)

In [3]:
def _normalise_columns(frame):
    """Return a copy of ``frame`` whose column names are snake_case.

    Every space in a column name becomes an underscore and the name is
    lowercased. All columns are renamed in a single pass instead of one
    ``rename`` call (and one frame copy) per column.
    """
    return frame.rename(columns=lambda name: name.replace(' ', '_').lower())


# Apply the same naming scheme to every table. cities_data used to be
# lowercased only; it now goes through the identical helper so all four
# tables follow one convention.
cab_data = _normalise_columns(cab_data)
customers_data = _normalise_columns(customers_data)
transactions_data = _normalise_columns(transactions_data)
cities_data = _normalise_columns(cities_data)

# renaming income per month
customers_data = customers_data.rename(columns={'income_(usd/month)':'income'})

In [4]:
# Parse the travel date (day-month-year text) so its parts can be extracted.
travel_date = pd.to_datetime(cab_data['date_of_travel'], format='%d-%m-%Y')

# Pull month and year for the whole column at once with the `.dt` accessor.
# The previous row-by-row loop used label lookups (`series[i]`), which is slow
# and only correct while the frame still has its default 0..n-1 index.
cab_data['month'] = travel_date.dt.month
cab_data['year'] = travel_date.dt.year

# Only the derived month/year are kept; the raw date column is dropped.
cab_data = cab_data.drop(['date_of_travel'], axis=1)
cab_data.head()

Unnamed: 0,transaction_id,company,city,km_travelled,price_charged,cost_of_trip,month,year
0,10000011,Pink Cab,ATLANTA GA,30.45,370.95,313.635,1,2016
1,10000012,Pink Cab,ATLANTA GA,28.62,358.52,334.854,1,2016
2,10000013,Pink Cab,ATLANTA GA,9.04,125.2,97.632,1,2016
3,10000014,Pink Cab,ATLANTA GA,33.17,377.4,351.602,1,2016
4,10000015,Pink Cab,ATLANTA GA,8.73,114.62,97.776,1,2016


In [5]:
# Step 1: attach the transaction details to each ride, matching on
# transaction_id (left join, so every ride row is kept).
left = cab_data.set_index('transaction_id')
right = transactions_data.set_index('transaction_id')
data = left.join(right, how='left')

# Step 2: move customer_id into the index next to transaction_id, then
# bring in the customer attributes through that shared index level.
left = data.set_index('customer_id', append=True)
right = customers_data.set_index('customer_id')
data = left.join(right, how='left')

# Step 3: same pattern for city, which adds the city-level columns.
left = data.set_index('city', append=True)
right = cities_data.set_index('city')
data = left.join(right, how='left')
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,company,km_travelled,price_charged,cost_of_trip,month,year,payment_mode,gender,age,income,population,users
transaction_id,customer_id,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10000011,29290,ATLANTA GA,Pink Cab,30.45,370.95,313.635,1,2016,Card,Male,28,10813,814885,24701
10000012,27703,ATLANTA GA,Pink Cab,28.62,358.52,334.854,1,2016,Card,Male,27,9237,814885,24701
10000013,28712,ATLANTA GA,Pink Cab,9.04,125.2,97.632,1,2016,Cash,Male,53,11242,814885,24701
10000014,28020,ATLANTA GA,Pink Cab,33.17,377.4,351.602,1,2016,Cash,Male,23,23327,814885,24701
10000015,27182,ATLANTA GA,Pink Cab,8.73,114.62,97.776,1,2016,Card,Male,33,8536,814885,24701


In [6]:
# Partition the merged rides by operator; each company is modelled separately.
pink = data.loc[data['company'].eq('Pink Cab')]
yellow = data.loc[data['company'].eq('Yellow Cab')]

In [7]:
# Columns left out of the regression inputs: the company label (constant
# within each subset), payment_mode, gender, and the population/users columns.
unused_columns = ['company', 'payment_mode', 'gender', 'population', 'users']
pink = pink.drop(columns=unused_columns)
yellow = yellow.drop(columns=unused_columns)

In [8]:
# Separate the regression target (price_charged) from the predictor columns
# for each company. The target is copied so it is independent of the frame.
pink_y = pink['price_charged'].copy()
pink_X = pink.drop(columns='price_charged')

yellow_y = yellow['price_charged'].copy()
yellow_X = yellow.drop(columns='price_charged')

In [9]:
# Pink Cab: hold out 30% of the rides, fit an ordinary least-squares model on
# the remaining 70%, and report the root-mean-squared error on the held-out set.
# random_state pins the shuffle so the split, and therefore the reported
# error, is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    pink_X, pink_y, test_size=0.3, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)
# The printed "Test score" is an RMSE in price units, so lower is better.
print('Test score', np.sqrt(mean_squared_error(y_test, pred)))

Test score 68.34080042636451


In [10]:
# Yellow Cab: hold out 30% of the rides, fit an ordinary least-squares model on
# the remaining 70%, and report the root-mean-squared error on the held-out set.
# random_state pins the shuffle so the split, and therefore the reported
# error, is the same on every Restart & Run All.
# NOTE: this reuses (and overwrites) X_train/X_test/y_train/y_test/model/pred
# from the previous cell.
X_train, X_test, y_train, y_test = train_test_split(
    yellow_X, yellow_y, test_size=0.3, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)
# The printed "Test score" is an RMSE in price units, so lower is better.
print('Test score', np.sqrt(mean_squared_error(y_test, pred)))

Test score 145.0750539488303
