In [7]:
import requests
import pandas as pd
import numpy as np

# Overpass API interpreter endpoint.
overpass_url = "https://www.overpass-api.de/api/interpreter"

# Load the start/end coordinate pairs, parsing the file as ';'-delimited
# with ',' as the decimal mark.
# NOTE(review): the preview below shows dot decimals and a later cell
# re-converts the coordinate columns with pd.to_numeric -- confirm that
# decimal=',' actually matches the file.
csv_file = "lat_lon_data.csv"
data = pd.read_csv(
    csv_file,
    sep=';',
    decimal=',',
)

# Preview the first rows.
data.head()

Unnamed: 0,id_titik_mulai,id_titik_akhir,start_latitude,start_longitude,end_latitude,end_longitude
0,21390008,1425033102,51.434928,-0.161176,51.434975,-0.16109
1,1677092762,579493410,51.62399,-0.176398,51.623811,-0.176424
2,26486694,1930267566,51.45253,-0.152437,51.452749,-0.151896
3,1111592522,3775231113,51.465851,-0.154188,51.465654,-0.155334
4,5940503398,5940503394,51.530289,-0.228343,51.530197,-0.227949


In [8]:
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Implements the haversine formula. All coordinates are given in decimal
    degrees. The arguments may be scalars, or NumPy arrays / pandas Series of
    broadcastable shape, in which case the distance is computed element-wise.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371.0, the radius of the Earth in kilometers.

    Returns
    -------
    scalar or array-like
        Distance along the surface of the sphere, in the unit of ``radius``.
    """
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)

    a = np.sin(dlat / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) evaluate to NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [12]:
# Make sure the coordinate columns are numeric before doing arithmetic on
# them; pd.to_numeric raises if a value cannot be parsed.
coord_cols = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
for col in coord_cols:
    data[col] = pd.to_numeric(data[col])

# haversine is built from NumPy ufuncs, so it can take whole columns at once.
# This avoids a Python-level call per row and also works on an empty frame,
# where a row-wise DataFrame.apply would not return a Series.
data['distance_km'] = haversine(
    data['start_latitude'],
    data['start_longitude'],
    data['end_latitude'],
    data['end_longitude'],
)
data['distance_meter'] = data['distance_km'] * 1000

In [13]:
# Display the frame, now including the distance_km and distance_meter columns.
data

Unnamed: 0,id_titik_mulai,id_titik_akhir,start_latitude,start_longitude,end_latitude,end_longitude,distance_km,distance_meter
0,21390008,1425033102,51.434928,-0.161176,51.434975,-0.161090,0.007914,7.914363
1,1677092762,579493410,51.623990,-0.176398,51.623811,-0.176424,0.019985,19.984653
2,26486694,1930267566,51.452530,-0.152437,51.452749,-0.151896,0.044685,44.684593
3,1111592522,3775231113,51.465851,-0.154188,51.465654,-0.155334,0.082340,82.340453
4,5940503398,5940503394,51.530289,-0.228343,51.530197,-0.227949,0.029118,29.117877
...,...,...,...,...,...,...,...,...
929,260381462,119606068,51.446690,-0.112805,51.446963,-0.112949,0.031974,31.973691
930,1579000949,5827426460,51.544321,0.015001,51.544286,0.014860,0.010526,10.526110
931,130193170,5827239128,51.543784,0.012916,51.543698,0.012592,0.024337,24.337467
932,119606068,873734072,51.446963,-0.112949,51.447102,-0.113009,0.016016,16.016372


In [14]:
# Write the frame to disk without the index, using ';' as the field
# separator and ',' as the decimal mark.
data.to_csv('distance data.csv', index=False, sep=';', decimal=',')

In [15]:
# Drop outliers in the 'distance_meter' column using the 1.5 * IQR rule.
distances = data['distance_meter']

# First and third quartiles of the distances
Q1 = distances.quantile(0.25)
Q3 = distances.quantile(0.75)

# Interquartile range: the spread of the middle half of the values
IQR = Q3 - Q1

# Anything outside [lower_bound, upper_bound] is treated as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose distance lies within the bounds (ends inclusive)
within_bounds = distances.between(lower_bound, upper_bound)
data = data[within_bounds]


In [16]:
# Display the frame after the IQR-based outlier filter.
data

Unnamed: 0,id_titik_mulai,id_titik_akhir,start_latitude,start_longitude,end_latitude,end_longitude,distance_km,distance_meter
0,21390008,1425033102,51.434928,-0.161176,51.434975,-0.161090,0.007914,7.914363
1,1677092762,579493410,51.623990,-0.176398,51.623811,-0.176424,0.019985,19.984653
2,26486694,1930267566,51.452530,-0.152437,51.452749,-0.151896,0.044685,44.684593
3,1111592522,3775231113,51.465851,-0.154188,51.465654,-0.155334,0.082340,82.340453
4,5940503398,5940503394,51.530289,-0.228343,51.530197,-0.227949,0.029118,29.117877
...,...,...,...,...,...,...,...,...
929,260381462,119606068,51.446690,-0.112805,51.446963,-0.112949,0.031974,31.973691
930,1579000949,5827426460,51.544321,0.015001,51.544286,0.014860,0.010526,10.526110
931,130193170,5827239128,51.543784,0.012916,51.543698,0.012592,0.024337,24.337467
932,119606068,873734072,51.446963,-0.112949,51.447102,-0.113009,0.016016,16.016372
