# Importing  Dataset

In [1]:
import pandas as pd
df = pd.read_csv(r"./hyderabad_eta_data.csv")
df

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,distance_km,traffic_density,weather_condition,day_of_week,hour_of_day,ETA
0,17.312362,78.355540,17.278512,78.501811,17.587880,3,rainy,3,17,26.113020
1,17.485214,78.462570,17.274094,78.539004,24.357538,8,clear,5,14,58.715248
2,17.419598,78.561884,17.471876,78.375140,23.044667,7,rainy,6,8,59.316409
3,17.379598,78.519667,17.274864,78.487462,5.463097,3,clear,2,17,35.630253
4,17.246806,78.541968,17.281585,78.471524,5.328235,6,clear,1,2,56.713490
...,...,...,...,...,...,...,...,...,...,...
995,17.227475,78.497087,17.459589,78.432632,27.024522,9,clear,2,4,18.585042
996,17.475194,78.586984,17.247182,78.400320,4.462049,1,rainy,5,14,42.688225
997,17.241046,78.320687,17.292936,78.418372,10.507442,1,clear,5,12,11.273635
998,17.485071,78.317116,17.287014,78.458982,24.656616,4,rainy,5,12,13.180450


# Data Preprocessing

In [2]:
df.isna().sum()

start_lat            0
start_lng            0
end_lat              0
end_lng              0
distance_km          0
traffic_density      0
weather_condition    0
day_of_week          0
hour_of_day          0
ETA                  0
dtype: int64

In [3]:
df.corr(numeric_only = True)

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,distance_km,traffic_density,day_of_week,hour_of_day,ETA
start_lat,1.0,0.02931,0.014518,-0.029424,0.034785,-0.024754,0.055406,-0.014557,-0.030701
start_lng,0.02931,1.0,0.027262,-0.005791,0.032911,0.000297,-0.048353,0.014242,-0.007978
end_lat,0.014518,0.027262,1.0,-0.01356,-0.008858,-0.014043,-0.057856,-0.006245,0.061752
end_lng,-0.029424,-0.005791,-0.01356,1.0,-0.044812,0.034494,0.017789,-0.012494,0.009425
distance_km,0.034785,0.032911,-0.008858,-0.044812,1.0,-0.031475,0.066093,0.022646,-0.008686
traffic_density,-0.024754,0.000297,-0.014043,0.034494,-0.031475,1.0,-0.010457,0.018572,-0.009196
day_of_week,0.055406,-0.048353,-0.057856,0.017789,0.066093,-0.010457,1.0,0.02001,0.07228
hour_of_day,-0.014557,0.014242,-0.006245,-0.012494,0.022646,0.018572,0.02001,1.0,0.009233
ETA,-0.030701,-0.007978,0.061752,0.009425,-0.008686,-0.009196,0.07228,0.009233,1.0


# Data Cleaning

# Data Cleaning

In [4]:
df['weather_condition'].unique()
df['weather_condition'] = df['weather_condition'].map({ "rainy":1, "foggy":2, "clear":3})
df.weather_condition

0      1
1      3
2      1
3      3
4      3
      ..
995    3
996    1
997    3
998    1
999    1
Name: weather_condition, Length: 1000, dtype: int64

In [5]:
# Drop distance_km & traffice_density
df.drop([ 'distance_km','traffic_density'], axis=1, inplace=True)

# Model Building

#### splitting Input and Output

In [6]:
df.head()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,weather_condition,day_of_week,hour_of_day,ETA
0,17.312362,78.35554,17.278512,78.501811,1,3,17,26.11302
1,17.485214,78.46257,17.274094,78.539004,3,5,14,58.715248
2,17.419598,78.561884,17.471876,78.37514,1,6,8,59.316409
3,17.379598,78.519667,17.274864,78.487462,3,2,17,35.630253
4,17.246806,78.541968,17.281585,78.471524,3,1,2,56.71349


In [7]:
X = df.drop('ETA', axis = 1)
Y = df['ETA']

# import ML library

In [8]:
# !pip install scikit-learn

In [9]:
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_squared_error


X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 23)

# Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit( X_train, Y_train )
Y_prediction = lr.predict( X_test )
print(Y_prediction)
# from sklearn.metrics import mean_squared_error
# mean_squared_error( Y_test, Y_prediction )

[35.21595854 33.02259368 33.23323621 35.32015914 34.52174725 37.18360561
 34.76853566 37.42759515 35.94783413 33.89382638 31.77979905 31.98226443
 33.35400739 38.59005958 35.60943341 34.55938218 34.9608906  33.59108683
 35.02515788 37.7759728  35.57152734 37.33364482 34.02596659 35.35879381
 36.42478122 37.17376285 33.51925236 32.85886322 37.76658648 35.14154156
 33.52350009 35.12470128 34.6029801  35.67168465 36.14790621 30.96980794
 37.2102776  36.92854578 35.39734186 33.18814766 35.53370351 34.03745729
 36.41944701 36.17630529 36.06905447 36.03455393 34.62992298 32.35807927
 37.6125646  35.96227019 33.49331688 34.2842563  36.03461527 37.50303008
 34.25829037 35.39600299 35.12422487 36.22609826 34.643424   35.57983651
 36.64943378 34.77052255 32.37662979 33.96877887 38.83802121 35.78480185
 36.52704233 35.32974498 35.51195838 34.37450457 38.94123039 37.74860246
 37.13864589 34.70333737 35.79591134 35.66408214 37.03939713 31.93887644
 36.69530703 34.5477756  33.96303879 34.9170527  36

# Pickle the Model

In [11]:
import pickle 

In [12]:
pickle.dump(lr, open("estimator.pkl", 'wb'))

In [13]:
model = pickle.load( open("estimator.pkl","rb"))

# Evalution

In [14]:
from sklearn.metrics import mean_squared_error
mean_squared_error( Y_test, Y_prediction )

227.07258113670778

# KNN Regression

In [15]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit( X_train, Y_train )
Y_prediction = knn.predict( X_test )
# print(Y_prediction)
mean_squared_error( Y_test, Y_prediction )

257.4764670296822

# Decision Tree

In [16]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit( X_train, Y_train )
Y_prediction = dt.predict( X_test )
# print(Y_prediction)
mean_squared_error( Y_test, Y_prediction )

427.92975111470497

# SVR

In [17]:
from sklearn.svm import SVR
svr = SVR()
svr.fit( X_train, Y_train )
Y_prediction = svr.predict( X_test )
# print(Y_prediction)
mean_squared_error( Y_test, Y_prediction )

226.5874233263822

# Chetha

In [18]:
import pandas as pd
import numpy as np

# Provided data
data = {
    'start_lat': [17.31236204, 17.48521429, 17.41959818, 17.37959755],
    'start_lng': [78.35553988, 78.46257028, 78.56188375, 78.51966747],
    'end_lat': [17.27851171, 17.27409364, 17.47187637, 17.27486386],
    'end_lng': [78.5018109, 78.53900442, 78.37514037, 78.48746223],
    'distance_km': [17.58788047, 24.35753755, 23.04466696, 5.463097236],
    'traffic_density': [3, 8, 7, 3],
    'weather_condition': ['rainy', 'clear', 'rainy', 'clear'],
    'day_of_week': [3, 5, 6, 2],
    'hour_of_day': [17, 14, 8, 17],
    'ETA': [26.11301969, 58.71524816, 59.31640868, 35.63025265]
}

# Generate additional data
additional_data = {
    'start_lat': [17.41236204, 17.49521429, 17.42959818, 17.38959755, 17.45921429],
    'start_lng': [78.36553988, 78.47257028, 78.57188375, 78.52966747, 78.58257028],
    'end_lat': [17.28851171, 17.28409364, 17.48187637, 17.28486386, 17.29409364],
    'end_lng': [78.5118109, 78.54900442, 78.38514037, 78.49746223, 78.50900442],
    'distance_km': [18.58788047, 25.35753755, 24.04466696, 6.463097236, 27.35753755],
    'traffic_density': [4, 9, 6, 2, 7],
    'weather_condition': ['cloudy', 'rainy', 'clear', 'cloudy', 'clear'],
    'day_of_week': [4, 6, 1, 3, 5],
    'hour_of_day': [18, 12, 10, 15, 13],
    'ETA': [27.11301969, 60.71524816, 57.31640868, 36.63025265, 63.71524816]
}

# Combine the provided and additional data
for key in data:
    data[key].extend(additional_data[key])

# Create DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


   start_lat  start_lng    end_lat    end_lng  distance_km  traffic_density  \
0  17.312362  78.355540  17.278512  78.501811    17.587880                3   
1  17.485214  78.462570  17.274094  78.539004    24.357538                8   
2  17.419598  78.561884  17.471876  78.375140    23.044667                7   
3  17.379598  78.519667  17.274864  78.487462     5.463097                3   
4  17.412362  78.365540  17.288512  78.511811    18.587880                4   
5  17.495214  78.472570  17.284094  78.549004    25.357538                9   
6  17.429598  78.571884  17.481876  78.385140    24.044667                6   
7  17.389598  78.529667  17.284864  78.497462     6.463097                2   
8  17.459214  78.582570  17.294094  78.509004    27.357538                7   

  weather_condition  day_of_week  hour_of_day        ETA  
0             rainy            3           17  26.113020  
1             clear            5           14  58.715248  
2             rainy            6

In [19]:
start_lat = float(input("Enter the star latttude:"))
start_lang = float(input("Enter the star lagitude:"))
end_lat = float(input("Enter the dest lattttude:"))
end_lang = float(input("Enter the end langitude:"))
dist = float(input("Enter the distance:"))
density = int(input("Enter the density:"))
weather = int(input("Enter the star lat:"))
day  = int(input("Enter the day:"))
hour = int(input("Enter the hour:"))
time = lr.predict([[start_lat,start_lang ,end_lat,end_lang,dist,density,weather,day,hour]])
print("The Estimated time is",time)

ValueError: could not convert string to float: ''

: 