## Jams Level Prediction

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit


In [2]:
jams = pd.read_csv('./../raw/aggregate_median_jams_Kota Bogor.csv')
jams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 14 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 102322 non-null  int64  
 1   time                       102322 non-null  object 
 2   kemendagri_kabupaten_kode  102322 non-null  float64
 3   kemendagri_kabupaten_nama  102322 non-null  object 
 4   street                     100529 non-null  object 
 5   level                      102322 non-null  int64  
 6   median_length              102322 non-null  float64
 7   median_delay               102322 non-null  float64
 8   median_speed_kmh           102322 non-null  float64
 9   total_records              102322 non-null  int64  
 10  id                         102322 non-null  int64  
 11  date                       102322 non-null  object 
 12  median_level               102322 non-null  float64
 13  geometry                   10

In [3]:
jams.sample(2)

Unnamed: 0.1,Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,date,median_level,geometry
46892,46892,2022-08-03 10:00:00.000,32.71,KOTA BOGOR,N9 KS Tubun,3,539.0,176.0,7.65,16,35269728,2022-08-03,3.0,"MULTILINESTRING ((106.81065 -6.564263, 106.810..."
4083,4083,2022-07-08 17:00:00.000,32.71,KOTA BOGOR,Flyover Martadinata,3,387.0,107.0,9.75,3,33599516,2022-07-08,3.0,"MULTILINESTRING ((106.794855 -6.581581, 106.79..."


In [4]:
# drop kolom yang tidak dibutuhkan
jams.drop(columns=['Unnamed: 0', 'id', 'kemendagri_kabupaten_kode', 'kemendagri_kabupaten_nama',
                        'median_level'], inplace=True)
jams.shape

(102322, 9)

### Checking duplicate & missing values

In [5]:
# cek data duplikat
jams[jams.duplicated]


Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,total_records,date,geometry


In [6]:
# cek missing values
jams.isna().sum()

time                   0
street              1793
level                  0
median_length          0
median_delay           0
median_speed_kmh       0
total_records          0
date                   0
geometry               0
dtype: int64

In [7]:
1793 / jams.shape[0] * 100

1.7523113308965814

> Kita akan menerapkan metode KNN untuk mengisi nilai kosong pada kolom street dengan memanfaatkan informatsi latitude longitude yang didapatkan dari proses feature engineering

### Feature Engineering

Adding some potential feature:
- hour
- week
- day
- isWeekend (True/False)
- latitude
- longitude

In [8]:
# Ekstrak nilai time dan jam nya
jams['time'] = jams['time'].astype('datetime64')
jams['hour'] = jams.time.dt.hour
jams['time'] = jams.time.dt.time

# Ekstrak hari dan statusnya sebagai weekend atau weekday
jams['date'] = jams['date'].astype('datetime64')
jams['week'] = jams['date'].dt.to_period('W').apply(lambda x: x.start_time)
jams['day'] = jams.date.dt.day_name()
jams['date'] = jams.date.dt.date
jams['isWeekend'] = np.where(jams.day.isin(['Saturday', 'Sunday']), 1, 0)

In [9]:
# Perbaiki order dari hari
day_ordered = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
jams['day'] = jams.day.astype('category')
jams['day'] = jams.day.cat.set_categories(day_ordered, ordered=True)

In [10]:
jams.level.value_counts()

2    35460
3    31038
1    19931
4    14403
5     1490
Name: level, dtype: int64

In [11]:
def extract_long_lat(s):
    add_idx = 2 if s[0]=='M' else 1
    first_idx = s.find('((') if add_idx==2 else s.find('(')
    last_idx = s.find(')')

    long_lat = [float(x)  for pairs in s[first_idx+add_idx : last_idx].split(', ') for x in pairs.split(' ')]
    longitude, latitude = [], []
    for idx, x in enumerate(long_lat):
        if idx%2==0:
            latitude.append(x)
        else:
            longitude.append(x)
            

    return np.mean(longitude), np.mean(latitude)

In [12]:
jams['geometry'] = jams.geometry.apply(extract_long_lat)
jams['longitude'] = jams.geometry.apply(lambda x: x[0])
jams['latitude'] = jams.geometry.apply(lambda x: x[1])
jams.drop(columns=['geometry'], inplace=True)
jams.sample(3)


Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,total_records,date,hour,week,day,isWeekend,longitude,latitude
48754,12:00:00,N6 Jalan Raya Baru,2,975.0,104.0,19.56,2,2022-08-04,12,2022-08-01,Thursday,0,-6.553497,106.776649
68070,12:00:00,N9 Jalan Raya Tajur,3,604.0,88.0,13.77,1,2022-08-15,12,2022-08-15,Monday,0,-6.623267,106.818561
47381,14:00:00,TB M Falak,1,1072.0,76.5,19.165,14,2022-08-03,14,2022-08-01,Wednesday,0,-6.58345,106.775803


### Handling Missing Values

In [13]:
miss_idx = pd.isnull(jams.street)

imputer = KNeighborsClassifier(n_neighbors=3)
imputer.fit(jams.loc[~miss_idx, ['longitude', 'latitude']], jams.loc[~miss_idx, 'street'])
jams.loc[miss_idx, 'street']= imputer.predict(jams.loc[miss_idx, ['longitude', 'latitude']])
jams.isna().sum()

time                0
street              0
level               0
median_length       0
median_delay        0
median_speed_kmh    0
total_records       0
date                0
hour                0
week                0
day                 0
isWeekend           0
longitude           0
latitude            0
dtype: int64

### Rearrange Jams Level
- Low: level 1 and 2
- Medium: level 3
- High: level 4 and 5

In [14]:
jams['level'] = np.where(
    jams.level <= 2, 1,
    np.where(jams.level==3, 2, 3)
)

### EDA

In [15]:
px.bar(
    jams.groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams for each days'
)

In [16]:
px.bar(
    jams[jams.level==2].groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams Level Medium'
)

In [17]:
px.bar(
    jams[jams.level==3].groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams Level High'
)

> Kemacetan paling sering terjadi pada hari sabtu. Selain sabtu kemacetan agak lumayan sering terjadi pada hari jumat dan minggu.

In [18]:
px.bar(
    jams.groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams for each hours'
)

In [19]:
px.bar(
    jams[jams.level==2].groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams Level Medium'
)

In [20]:
px.bar(
    jams[jams.level==3].groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams Level High'
)

> Dari barchart diatas saya mengambil kesimpulan bahwa kemacetan mencapai puncaknya pada jam 17. Dan jam sibuknya berkisar antara jam 13 - 17.

#### Create Rush Hour Feature

In [21]:
# Membuat feature tambahan untuk rush hour
jams['isRushHour'] = np.where((jams.hour>=13) & (jams.hour<=17), 1, 0)
jams.isRushHour.value_counts()

0    61653
1    40669
Name: isRushHour, dtype: int64

In [22]:
jams.level.value_counts() / jams.shape[0] * 100

1    54.134008
2    30.333653
3    15.532339
Name: level, dtype: float64

> Imbalance Dataset Problem

### Train-Test Split Dataset

In [23]:
col_feature = list(set(jams.columns)-set('level'))

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=123)

for train_idx, test_idx in sss.split(jams[col_feature], jams['level']):
    jams['is_train'] = jams.index.isin(train_idx).astype(int)

jams.is_train.value_counts() / jams.shape[0] * 100

1    69.999609
0    30.000391
Name: is_train, dtype: float64

In [24]:
jams.to_csv('./../dataset/jams_bogor.csv', index=False)

jams_loaded = pd.read_csv('./../dataset/jams_bogor.csv')
jams_loaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   time              102322 non-null  object 
 1   street            102322 non-null  object 
 2   level             102322 non-null  int64  
 3   median_length     102322 non-null  float64
 4   median_delay      102322 non-null  float64
 5   median_speed_kmh  102322 non-null  float64
 6   total_records     102322 non-null  int64  
 7   date              102322 non-null  object 
 8   hour              102322 non-null  int64  
 9   week              102322 non-null  object 
 10  day               102322 non-null  object 
 11  isWeekend         102322 non-null  int64  
 12  longitude         102322 non-null  float64
 13  latitude          102322 non-null  float64
 14  isRushHour        102322 non-null  int64  
 15  is_train          102322 non-null  int64  
dtypes: float64(5), int64