In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
weather = pd.read_csv('weather.csv')
rideshare = pd.read_csv('rideshare_kaggle.csv')
rides = pd.read_csv('cab_rides.csv')

In [4]:
rides.isnull().sum()

distance                0
cab_type                0
time_stamp              0
destination             0
source                  0
price               55095
surge_multiplier        0
id                      0
product_id              0
name                    0
dtype: int64

In [5]:
rides.dropna(axis = 0, inplace = True)

In [6]:
weather.isnull().sum()

temp             0
location         0
clouds           0
pressure         0
rain          5382
time_stamp       0
humidity         0
wind             0
dtype: int64

In [7]:
weather.fillna(0, inplace = True)

In [8]:
weather_avg = weather.groupby('location').mean().reset_index()
weather_avg.drop(columns = 'time_stamp' , inplace = True)
weather_source = weather_avg.rename(columns =(lambda x :x + '_source'))       
weather_source.rename(columns= {'location_source' :'source'}, inplace = True)
weather_destination = weather_avg.rename(columns =(lambda x :x+'_destination'))       
weather_destination.rename(columns= {'location_destination' :'destination'} ,inplace = True)
data = rides\
       .merge(weather_source ,on ='source')\
       .merge(weather_destination ,on = 'destination')
data = data.drop('id'  , axis =1)

#binary encoding to cab_type

data['cab_type'] =data['cab_type'].replace({'Uber':1,'Lyft':0})

def one_hot_encode(df , column , prefix):
    dummy = pd.get_dummies(df[column] ,prefix = prefix)
    df = pd.concat([df , dummy] ,axis =1)
    df =df.drop(column , axis =1)
    
    return df

data = one_hot_encode(data ,column =  'destination' , prefix = 'desti')
data = one_hot_encode(data ,column =  'source' , prefix = 'src')
data = one_hot_encode(data ,column =  'product_id' , prefix = 'pid')
data = one_hot_encode(data ,column =  'name' , prefix = 'nm')

In [10]:
data.columns

Index(['distance', 'cab_type', 'time_stamp', 'price', 'surge_multiplier',
       'temp_source', 'clouds_source', 'pressure_source', 'rain_source',
       'humidity_source', 'wind_source', 'temp_destination',
       'clouds_destination', 'pressure_destination', 'rain_destination',
       'humidity_destination', 'wind_destination', 'desti_Back Bay',
       'desti_Beacon Hill', 'desti_Boston University', 'desti_Fenway',
       'desti_Financial District', 'desti_Haymarket Square', 'desti_North End',
       'desti_North Station', 'desti_Northeastern University',
       'desti_South Station', 'desti_Theatre District', 'desti_West End',
       'src_Back Bay', 'src_Beacon Hill', 'src_Boston University',
       'src_Fenway', 'src_Financial District', 'src_Haymarket Square',
       'src_North End', 'src_North Station', 'src_Northeastern University',
       'src_South Station', 'src_Theatre District', 'src_West End',
       'pid_55c66225-fbe7-4fd5-9072-eab1ece5e23e',
       'pid_6c84fd89-3f11-478

In [11]:
x = data.drop('price', axis = 1)
y = data['price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
print(x_train.shape , x_test.shape , y_train.shape , y_test.shape)

sc = StandardScaler()
sc.fit(x_train)

x_train = pd.DataFrame(sc.transform(x_train) ,columns =x.columns)
x_test =  pd.DataFrame(sc.transform(x_test) ,columns = x.columns)

(446583, 64) (191393, 64) (446583,) (191393,)


In [12]:
ols_object_bonus = smf.ols(formula = 'money_made_inv~out_prncp_inv+I(out_prncp_inv**2)+loan_amnt*out_prncp_inv+I(loan_amnt**2)+int_rate*loan_amnt*out_prncp_inv+I(int_rate**2)+term*out_prncp_inv+sub_grade*loan_amnt*term+annual_inc', data = not_zero)
model_bonus = ols_object_bonus.fit()
model_bonus.summary()

NameError: name 'smf' is not defined

In [21]:
rideshare.columns

Index(['id', 'timestamp', 'hour', 'day', 'month', 'datetime', 'timezone',
       'source', 'destination', 'cab_type', 'product_id', 'name', 'price',
       'distance', 'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperat

In [14]:
from sklearn.model_selection import train_test_split


In [24]:
train, test = train_test_split(data, shuffle=True, train_size=0.7)

In [25]:
train

Unnamed: 0,distance,cab_type,time_stamp,price,surge_multiplier,temp_source,clouds_source,pressure_source,rain_source,humidity_source,...,nm_Lux,nm_Lux Black,nm_Lux Black XL,nm_Lyft,nm_Lyft XL,nm_Shared,nm_UberPool,nm_UberX,nm_UberXL,nm_WAV
249231,3.44,1,1543244710344,17.0,1.0,39.394092,0.677495,1008.438031,0.008310,0.755468,...,0,0,0,0,0,0,0,0,0,1
343993,2.49,0,1543616288459,16.5,1.0,39.047285,0.677801,1008.448356,0.008297,0.765048,...,1,0,0,0,0,0,0,0,0,0
69148,1.11,1,1543411028318,26.0,1.0,39.090841,0.676730,1008.441912,0.008644,0.764054,...,0,0,0,0,0,0,0,0,0,0
190194,2.46,1,1544875808678,28.5,1.0,38.964379,0.679866,1008.453289,0.007343,0.767266,...,0,0,0,0,0,0,0,0,0,0
305331,0.54,1,1544904007067,10.5,1.0,39.394092,0.677495,1008.438031,0.008310,0.755468,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377749,2.67,0,1543460708150,16.5,1.0,39.082122,0.678432,1008.447820,0.007925,0.764073,...,0,0,0,0,1,0,0,0,0,0
84356,2.84,0,1544912112411,22.5,1.0,38.964379,0.679866,1008.453289,0.007343,0.767266,...,0,1,0,0,0,0,0,0,0,0
60982,0.61,1,1544773509534,7.5,1.0,39.067897,0.676711,1008.445239,0.008660,0.764837,...,0,0,0,0,0,0,1,0,0,0
29142,3.07,1,1543449821410,10.5,1.0,38.964379,0.679866,1008.453289,0.007343,0.767266,...,0,0,0,0,0,0,0,1,0,0


In [26]:
test

Unnamed: 0,distance,cab_type,time_stamp,price,surge_multiplier,temp_source,clouds_source,pressure_source,rain_source,humidity_source,...,nm_Lux,nm_Lux Black,nm_Lux Black XL,nm_Lyft,nm_Lyft XL,nm_Shared,nm_UberPool,nm_UberX,nm_UberXL,nm_WAV
144369,1.19,0,1544738710409,16.5,1.0,39.394092,0.677495,1008.438031,0.008310,0.755468,...,0,1,0,0,0,0,0,0,0,0
499253,2.31,0,1543808581712,16.5,1.0,39.047285,0.677801,1008.448356,0.008297,0.765048,...,1,0,0,0,0,0,0,0,0,0
18303,3.47,0,1543678077818,13.5,1.0,39.047744,0.679235,1008.459254,0.007738,0.763786,...,0,0,0,1,0,0,0,0,0,0
242833,1.50,1,1545020712490,26.5,1.0,38.964379,0.679866,1008.453289,0.007343,0.767266,...,0,0,0,0,0,0,0,0,0,0
222349,2.98,0,1543899177718,26.0,1.0,39.090841,0.676730,1008.441912,0.008644,0.764054,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570504,2.14,1,1543408283412,28.5,1.0,38.983403,0.677247,1008.441090,0.008657,0.767266,...,0,0,0,0,0,0,0,0,0,0
357621,2.98,0,1544746511868,19.5,1.0,38.983403,0.677247,1008.441090,0.008657,0.767266,...,0,0,0,0,1,0,0,0,0,0
10947,0.92,1,1543423447955,26.0,1.0,39.090841,0.676730,1008.441912,0.008644,0.764054,...,0,0,0,0,0,0,0,0,0,0
335394,3.36,0,1544829904346,19.5,1.0,39.035315,0.676998,1008.442811,0.008649,0.765545,...,1,0,0,0,0,0,0,0,0,0


In [27]:
train.to_csv('train.csv', index=False)

In [28]:
test.to_csv('test.csv', index=False)