<a href="https://colab.research.google.com/github/bobbyknit007/ML/blob/main/Regression/Uber_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [160]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from warnings import filterwarnings
filterwarnings('ignore')
from datetime import datetime
from math import radians, sin, cos, acos

In [161]:
df =pd.read_csv('/content/uber.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


Column unnamed, key looks wrong data- **remove them**
Pickup_datetime - Date is not so important may be Day of week and Time in HH:MM would be helpful
Latitude and Longitude - we need to convert it to distance

In [162]:
df.drop(['Unnamed: 0','key'], axis=1, inplace=True)

In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        200000 non-null  float64
 1   pickup_datetime    200000 non-null  object 
 2   pickup_longitude   200000 non-null  float64
 3   pickup_latitude    200000 non-null  float64
 4   dropoff_longitude  199999 non-null  float64
 5   dropoff_latitude   199999 non-null  float64
 6   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 10.7+ MB


In [164]:
df.isnull().sum()

Unnamed: 0,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,1
dropoff_latitude,1
passenger_count,0


In [165]:
df.isna().sum()

Unnamed: 0,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,1
dropoff_latitude,1
passenger_count,0


In [166]:
df.shape

(200000, 7)

In [167]:
df.dropna(axis=0, inplace=True)

In [168]:
df.shape

(199999, 7)

In [169]:

def getDateTime(datetime_str):
  return datetime.strptime(datetime_str, '%m/%d/%y %H:%M:%S').day


In [170]:
df['week_day']=pd.to_datetime(df['pickup_datetime']).dt.weekday # df['pickup_datetime'].apply(lambda datetime_str: datetime.strptime(datetime_str, '%y-%m-%d %H:%M').day)
df['pickup_time']=pd.to_datetime(df['pickup_datetime']).dt.time

In [171]:
def calculatedistance(row):
  try:
    mlat = radians(float(row['pickup_latitude']))
    mlon = radians(float(row['pickup_longitude']))
    plat = radians(float(row['dropoff_latitude']))
    plon = radians(float(row['dropoff_longitude']))
    #print(row.index)
    return 6371.01 * acos( (sin(mlat)*sin(plat)) + (cos(mlat) * cos(plat) * (cos(mlon - plon))) )
  except:
    print('Errow while processing row {}',row[0])
    return None




In [172]:
df['distance_km']=df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']].apply(lambda row : calculatedistance(row) , axis=1)

Errow while processing row {} -73.95333862304686
Errow while processing row {} -73.964526
Errow while processing row {} -74.005702
Errow while processing row {} -73.989707
Errow while processing row {} -73.785954
Errow while processing row {} -74.00710500000001
Errow while processing row {} -74.03131866455078
Errow while processing row {} -73.97630500000001
Errow while processing row {} -73.981447
Errow while processing row {} -73.83461
Errow while processing row {} -73.911513
Errow while processing row {} -74.02305
Errow while processing row {} -73.995436
Errow while processing row {} -73.972443
Errow while processing row {} -73.949382
Errow while processing row {} -73.96036
Errow while processing row {} -73.911513
Errow while processing row {} -73.863175
Errow while processing row {} -73.984897
Errow while processing row {} -74.36973
Errow while processing row {} -73.981877
Errow while processing row {} -74.00558000000001
Errow while processing row {} -73.997148
Errow while processin

In [173]:
df.head(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,week_day,pickup_time,distance_km
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,3,19:52:06,1.683325
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,4,20:04:56,2.457594
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,0,21:45:00,5.036385
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,4,08:22:21,1.661686
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,3,17:47:00,4.475457


In [174]:
df.isnull().sum()

Unnamed: 0,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,0
dropoff_latitude,0
passenger_count,0
week_day,0
pickup_time,0
distance_km,51


In [175]:
df.isna().sum()

Unnamed: 0,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,0
dropoff_latitude,0
passenger_count,0
week_day,0
pickup_time,0
distance_km,51


In [176]:
df.shape

(199999, 10)

In [177]:
df.dropna(axis=0, inplace=True)

In [178]:
df.shape

(199948, 10)

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199948 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        199948 non-null  float64
 1   pickup_datetime    199948 non-null  object 
 2   pickup_longitude   199948 non-null  float64
 3   pickup_latitude    199948 non-null  float64
 4   dropoff_longitude  199948 non-null  float64
 5   dropoff_latitude   199948 non-null  float64
 6   passenger_count    199948 non-null  int64  
 7   week_day           199948 non-null  int32  
 8   pickup_time        199948 non-null  object 
 9   distance_km        199948 non-null  float64
dtypes: float64(6), int32(1), int64(1), object(2)
memory usage: 16.0+ MB
