### Importing Basic Libraries: Numpy and Pandas to handle data

In [1]:
import numpy as np
import pandas as pd

### Read the corresponding data files usind Pandas' read_csv function

In [2]:
weather=pd.read_csv("weather.csv",delimiter=',')
rides=pd.read_csv("cab_rides.csv",delimiter=',')

### Convert the timestamp into the desirable format

In [3]:
rides['date_time'] = pd.to_datetime(rides['time_stamp']/1000, unit='s')

weather['date_time'] = pd.to_datetime(weather['time_stamp'], unit='s')

rides.head()


Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date_time
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890000128
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.676999936
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.197999872
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749000192
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223000064


### Make a coloumn 'merge_date' containing date merged with the location so that we can join the two dataframes on this basis

In [4]:
rides['merge_date'] = rides.source.astype(str) +" - "+ rides.date_time.dt.date.astype("str") +" - "+ rides.date_time.dt.hour.astype("str")
weather['merge_date'] = weather.location.astype(str) +" - "+ weather.date_time.dt.date.astype("str") +" - "+ weather.date_time.dt.hour.astype("str")

### Change the index to merge_date column so joining the two datasets will not generate any error.

In [5]:
weather.index = weather['merge_date']

final_dataframe = rides.join(weather,on=['merge_date'],rsuffix ='_w')

### Drop the null values rows

In [6]:
final_dataframe=final_dataframe.dropna(axis=0)

### Make different columns of day and hour to simplify the format of date 

In [7]:
final_dataframe['day'] = final_dataframe.date_time.dt.dayofweek
final_dataframe['hour'] = final_dataframe.date_time.dt.hour

### We ignored surge value of more than 3 because the samples are very less for surge_multiplier>3

In [8]:
surge_dataframe = final_dataframe[final_dataframe.surge_multiplier < 3]

### feature selection--> we are selecting the most relevant features from the dataset

In [11]:
from sklearn.preprocessing import LabelEncoder

x = surge_dataframe[['distance','day','hour','temp','clouds', 'pressure','humidity', 'wind', 'rain']]

y=surge_dataframe['surge_multiplier']

le = LabelEncoder()

#ignoring multiplier of 3 as there are only 2 values in our dataset

le.fit([1,1.25,1.5,1.75,2.,2.25,2.5])
y = le.transform(y)

### All features to be fed to the ML model

In [13]:
feature_list=list(x.columns)
feature_list

['distance',
 'day',
 'hour',
 'temp',
 'clouds',
 'pressure',
 'humidity',
 'wind',
 'rain']

### Splitting the Dataset into train and test sets in the percentage ratio of 70:30

We have used train_test_split function from scikit-learn library. One can build it from scratch as well. 

In [16]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

### We are using SMOTE to balance out the data of different surge_multipliers

In [18]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

### Training the Random Forest model
It will take some time, so please wait till the model is ready to be used.

In [20]:
from sklearn.ensemble import RandomForestClassifier

model= RandomForestClassifier(n_jobs=-1, random_state = 42,class_weight="balanced")

model.fit(x_train,y_train)

### Testing the trained Random Forestr model on the test data

In [21]:
y_pred=model.predict(x_test)

### Evaluating the trained model using Confusion Matrix

In [22]:
print(pd.crosstab(le.inverse_transform(y_test), le.inverse_transform(y_pred),rownames=['Actual'],colnames=['Predicted']))

Predicted   1.00  1.25  1.50  1.75  2.00  2.50
Actual                                        
1.00       53717   825   435   164   147     7
1.25         366   580    11     3     7     0
1.50         194     4   200     7     2     0
1.75         101     2     1   129     1     0
2.00          65     7     3     4    99     2
2.50           4     0     0     0     4     1


In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

print("Accuracy Score = ",accuracy_score(y_test,y_pred))              
print("precision score = ",precision_score(y_test, y_pred,average='weighted'))         
print("recall score = ",recall_score(y_test, y_pred,average='micro'))
print("f1 score = ",f1_score(y_test, y_pred, average='weighted'))

errors = abs(y_pred - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Accuracy Score =  0.9585581167238842
precision score =  0.9675988630431089
recall score =  0.9585581167238842
f1 score =  0.9624810336482128
Mean Absolute Error: 0.07 degrees.


### Finding which feature is of what importance

In [25]:
# Get numerical feature importances
importances = list(model.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: distance             Importance: 0.31
Variable: temp                 Importance: 0.13
Variable: pressure             Importance: 0.13
Variable: wind                 Importance: 0.13
Variable: humidity             Importance: 0.1
Variable: rain                 Importance: 0.1
Variable: hour                 Importance: 0.05
Variable: clouds               Importance: 0.04
Variable: day                  Importance: 0.02


[None, None, None, None, None, None, None, None, None]

### It justifies that the Distance feature is the most prominant factor in deciding the surge multiplication factor