In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import haversine as hs

In [None]:
# Load the raw trip records; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='1_uber.csv')
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

In [None]:
# Discard the three columns that are not used as model inputs later in the
# notebook, then report the resulting (rows, columns) shape.
df = df.drop(columns=['Unnamed: 0', 'key', 'pickup_datetime'])
df.shape

In [None]:
# Show the dtype pandas inferred for each remaining column.
df.dtypes

In [None]:
# Count missing values per column.
# No rows are dropped here: a bare `df.dropna()` returns a new frame and
# leaves `df` untouched, so calling it without assigning the result has no
# effect. The missing drop-off coordinates are imputed in the next cell.
df.isnull().sum()

In [None]:
# Impute missing drop-off coordinates with each column's median.
# The same statistic is used for both coordinates: the median is insensitive
# to extreme values, which have not been treated yet at this point (outlier
# clipping happens later in the notebook), whereas a mean would be pulled
# towards them.
df = df.fillna({
    'dropoff_longitude': df['dropoff_longitude'].median(),
    'dropoff_latitude': df['dropoff_latitude'].median(),
})
# Confirm that no missing values remain.
df.isnull().sum()

In [None]:
# Interactive box plot of the fare distribution, used to eyeball outliers.
fig = px.box(data_frame=df, y='fare_amount')
fig.show()

In [None]:
# Summary statistics, restricted to the fare and passenger-count columns.
df.describe().loc[:, ['fare_amount', 'passenger_count']]

In [9]:
def remove_outlier(df1, col):
    """Clip the values of one column to its Tukey (1.5 * IQR) fences.

    Despite the name, no rows are removed: values below
    ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to that bound.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing ``col``. The column is overwritten in place.
    col : str
        Name of a numeric column in ``df1``.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object, with ``col`` clipped.
    """
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # Write back to the frame that was passed in, not to a module-level
    # ``df``; otherwise the function only works when ``df1`` happens to be
    # the global ``df`` and silently edits the wrong frame in other cases.
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1

In [10]:
def treat_outliers_all(df1, col_list):
    """Run ``remove_outlier`` on every column named in ``col_list``.

    Columns are processed in iteration order, each call receiving the frame
    returned by the previous one. The frame returned by the last call is
    returned; if ``col_list`` is empty, ``df1`` is returned as is.
    """
    treated = df1
    for column_name in col_list:
        treated = remove_outlier(treated, column_name)
    return treated

In [None]:
# Clip every column to its IQR fences, then draw one box plot per column to
# verify that no points remain beyond the whiskers.
df = treat_outliers_all(df1=df, col_list=df.columns)
df.plot(kind='box', subplots=True, figsize=(15, 20), layout=(7, 2))

In [12]:
# Great-circle distance between pickup and drop-off for every trip.
# hs.haversine takes two (latitude, longitude) tuples.
#
# The rows are walked positionally with zip() rather than looked up with
# df[col][pos]: that form is a *label* lookup, so it only works while the
# index is exactly 0..n-1 (it raises KeyError or pairs the wrong rows once
# any row has been filtered out), and it performs four Series lookups per
# row. Assigning the resulting list back to a column is positional as well,
# so the distances stay aligned with their rows.
travel_dist = [
    hs.haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude'],
    )
]
df['dist_travel_km'] = travel_dist

In [None]:
# Keep only trips whose computed distance lies in the closed range
# [1, 130]; everything outside it is discarded.
df = df[(df['dist_travel_km'] >= 1) & (df['dist_travel_km'] <= 130)]
print('Remaining observations in the dataset:', df.shape)

In [None]:
# Visual check for remaining missing values: the heatmap is drawn from the
# boolean null mask of the frame.
sns.heatmap(data=df.isnull())

In [None]:
# Pairwise correlation between all columns, printed as a table and drawn as
# an annotated heatmap. The matrix is computed once and reused for the plot
# instead of calling df.corr() a second time.
corr = df.corr()
print(corr)
sns.heatmap(corr, annot=True)

In [16]:
# Model inputs: the four trip coordinates, the passenger count and the
# computed trip distance.
x = df.loc[:, [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'dist_travel_km',
]]
# Regression target: the fare.
y = df.loc[:, 'fare_amount']

In [17]:
# Hold out 33% of the rows for evaluation. The shuffle is seeded so that the
# split, and therefore every metric reported below, is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

In [None]:
# Fit an ordinary least-squares model on the training split (fit() returns
# the estimator itself), then show the learned intercept and coefficients.
regression = LinearRegression().fit(X_train, y_train)
(regression.intercept_, regression.coef_)

In [None]:
# Score the linear model on the held-out split: R^2, mean squared error and
# its square root (RMSE), displayed in that order.
prediction = regression.predict(X_test)
MSE = mean_squared_error(y_true=y_test, y_pred=prediction)
RMSE = np.sqrt(MSE)
r2 = r2_score(y_true=y_test, y_pred=prediction)
(r2, MSE, RMSE)

In [None]:
# Train a 100-tree random forest on the same split and score it with the
# same three metrics as the linear model. The forest draws bootstrap samples
# and feature subsets at random, so it is seeded to make the reported scores
# reproducible from run to run.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
R2_Random = r2_score(y_test, y_pred)
MSE_Random = mean_squared_error(y_test, y_pred)
RMSE_Random = np.sqrt(MSE_Random)
R2_Random, MSE_Random, RMSE_Random

In [None]:
# Marker printed once every cell above has run to completion.
print('Analysis complete!')