In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score

# Load the raw weather observations and the cab-ride records from CSV.
# The paths are written as raw strings: in an ordinary string literal "\M",
# "\A", "\w" and "\c" are invalid escape sequences that Python only tolerates
# with a warning.  The raw form spells exactly the same path without relying
# on that leniency.

weather = pd.read_csv(r"N:\Machine learning\Algorithms\weather.csv", delimiter=',')
rides = pd.read_csv(r"N:\Machine learning\Algorithms\cab_rides.csv", delimiter=',')


# Turn the raw epoch columns into pandas datetimes.
# rides['time_stamp'] is divided by 1000 before being read as seconds
# (presumably it is stored in milliseconds -- confirm against the data);
# weather['time_stamp'] is read as seconds directly.

ride_epoch_seconds = rides['time_stamp'] / 1000
rides['date_time'] = pd.to_datetime(ride_epoch_seconds, unit='s')
weather['date_time'] = pd.to_datetime(weather['time_stamp'], unit='s')
rides.head()  # result is discarded here; only useful as a notebook preview

# Build a "<place> - <date> - <hour>" text key on both frames.  Rides use
# their 'source' column as the place and weather uses 'location', so the two
# frames can later be joined on 'merge_date'.

key_separator = " - "

ride_when = rides['date_time'].dt
rides['merge_date'] = (
    rides['source'].astype(str)
    + key_separator + ride_when.date.astype("str")
    + key_separator + ride_when.hour.astype("str")
)

weather_when = weather['date_time'].dt
weather['merge_date'] = (
    weather['location'].astype(str)
    + key_separator + weather_when.date.astype("str")
    + key_separator + weather_when.hour.astype("str")
)


# Index weather by the merge key so that rides.join(..., on='merge_date') can
# look each ride's key up in it (the 'merge_date' column itself is kept).

weather.index = weather['merge_date']

# join() matches a ride against *every* weather row that shares its key, so a
# key occurring more than once in weather would repeat that ride once per
# occurrence.  That inflates the sample and lets copies of the same ride land
# on both sides of the later train/test split.  Keep a single weather row per
# key (the first one encountered); when the keys are already unique this
# selection changes nothing.

weather_per_key = weather[~weather.index.duplicated(keep='first')]

# Columns present in both frames keep their name for the ride side and get a
# '_w' suffix for the weather side.
final_dataframe = rides.join(weather_per_key, on=['merge_date'], rsuffix='_w')

# Drop every row with a missing value in any column.  Rides with no matching
# weather key are removed here, because the left join leaves their weather
# columns empty.
# NOTE(review): this also drops rows whose only gap is in a column that is not
# used as a feature later -- confirm that is intended.

final_dataframe = final_dataframe.dropna(axis=0)

# Reduce the ride time (the un-suffixed 'date_time') to two simple numeric
# features: day of week and hour of day.

final_dataframe['day'] = final_dataframe.date_time.dt.dayofweek
final_dataframe['hour'] = final_dataframe.date_time.dt.hour

# Keep only rides whose surge multiplier is below 3; per the original author's
# note, there are too few samples at higher values to learn from.

surge_dataframe = final_dataframe[final_dataframe.surge_multiplier < 3]

# Feature selection: trip distance, the day/hour time features and the
# weather readings are the model inputs.

feature_list = ['distance', 'day', 'hour', 'temp', 'clouds', 'pressure', 'humidity', 'wind', 'rain']
x = surge_dataframe[feature_list]

# The surge multiplier is the target.  The encoder is fitted on a fixed set of
# seven multipliers and maps each one to an integer class label.  3 is left
# out on purpose (original note: only 2 such rows exist in the dataset), so
# transform() will raise if the target contains any value outside this set.

le = LabelEncoder().fit([1, 1.25, 1.5, 1.75, 2., 2.25, 2.5])
y = le.transform(surge_dataframe['surge_multiplier'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Oversample the training split with SMOTE (default settings) to balance the
# surge-multiplier classes.  Only the training data is resampled; the test
# split is left as it was.

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

# Random forest on the resampled data, using all available cores and a fixed
# seed.  fit() returns the estimator itself, so it can be chained directly.

model = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
).fit(x_train, y_train)

y_pred = model.predict(x_test)

from sklearn.metrics import f1_score

# Map the encoded class labels back to real surge multipliers once, so that
# both the confusion table and the error below are expressed in real units.
actual_surge = le.inverse_transform(y_test)
predicted_surge = le.inverse_transform(y_pred)

# Confusion matrix as a labelled table: rows are the actual multipliers,
# columns the predicted ones.
print(pd.crosstab(actual_surge, predicted_surge, rownames=['Actual'], colnames=['Predicted']))

print("Accuracy Score = ", accuracy_score(y_test, y_pred))
print("precision score = ", precision_score(y_test, y_pred, average='weighted'))
# Recall uses the same 'weighted' averaging as precision and F1 so the three
# figures are comparable.  (For single-label classification weighted recall
# equals accuracy, so this line is expected to repeat the accuracy value.)
print("recall score = ", recall_score(y_test, y_pred, average='weighted'))
print("f1 score = ", f1_score(y_test, y_pred, average='weighted'))

# Mean absolute error measured on the surge multipliers themselves rather than
# on the encoder's class indices, and reported with a unit that matches.
errors = np.abs(predicted_surge - actual_surge)

print('Mean Absolute Error:', round(np.mean(errors), 2), 'surge multiplier units.')


# Rank the input features by the importance the fitted forest assigns them.
importances = list(model.feature_importances_)

# Pair every feature name with its importance, rounded to two decimals.
feature_importances = [
    (feature, round(importance, 2))
    for feature, importance in zip(feature_list, importances)
]

# Most important first.  Sorting happens on the rounded value and sorted() is
# stable, so features that tie after rounding keep their feature_list order.
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)

# Plain loop instead of a comprehension: printing is a side effect, and a
# comprehension would build (and a notebook would echo) a list of None.
for name_and_score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*name_and_score))