## Jams Level Prediction

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit


In [2]:
jams = pd.read_csv('./../raw/aggregate_median_jams_Kota Bogor.csv')
jams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 14 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 102322 non-null  int64  
 1   time                       102322 non-null  object 
 2   kemendagri_kabupaten_kode  102322 non-null  float64
 3   kemendagri_kabupaten_nama  102322 non-null  object 
 4   street                     100529 non-null  object 
 5   level                      102322 non-null  int64  
 6   median_length              102322 non-null  float64
 7   median_delay               102322 non-null  float64
 8   median_speed_kmh           102322 non-null  float64
 9   total_records              102322 non-null  int64  
 10  id                         102322 non-null  int64  
 11  date                       102322 non-null  object 
 12  median_level               102322 non-null  float64
 13  geometry                   10

In [3]:
jams.sample(2)

Unnamed: 0.1,Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,median_length,median_delay,median_speed_kmh,total_records,id,date,median_level,geometry
10658,10658,2022-07-13 11:00:00.000,32.71,KOTA BOGOR,Pahlawan,3,542.0,95.0,13.09,39,33894404,2022-07-13,3.0,"MULTILINESTRING ((106.794985 -6.607735, 106.79..."
4326,4326,2022-07-08 18:00:00.000,32.71,KOTA BOGOR,Tumenggung Wiradireja,1,753.0,104.0,11.06,2,33599765,2022-07-08,1.0,"LINESTRING (106.822489 -6.588903, 106.82315 -6..."


In [4]:
# drop kolom yang tidak dibutuhkan
jams.drop(columns=['Unnamed: 0', 'id', 'kemendagri_kabupaten_kode', 'kemendagri_kabupaten_nama',
                        'median_level'], inplace=True)
jams.shape

(102322, 9)

### Checking duplicate & missing values

In [5]:
# cek data duplikat
jams[jams.duplicated]


Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,total_records,date,geometry


In [6]:
# cek missing values
jams.isna().sum()

time                   0
street              1793
level                  0
median_length          0
median_delay           0
median_speed_kmh       0
total_records          0
date                   0
geometry               0
dtype: int64

In [7]:
1793 / jams.shape[0] * 100

1.7523113308965814

> Kita akan menerapkan metode KNN untuk mengisi nilai kosong pada kolom street dengan memanfaatkan informatsi latitude longitude yang didapatkan dari proses feature engineering

### Feature Engineering

Adding some potential feature:
- hour
- week
- day
- isWeekend (True/False)
- month
- latitude
- longitude

In [8]:
# Ekstrak nilai time dan jam nya
jams['time'] = jams['time'].astype('datetime64')
jams['hour'] = jams.time.dt.hour
jams['time'] = jams.time.dt.time

# Ekstrak hari dan statusnya sebagai weekend atau weekday
jams['date'] = jams['date'].astype('datetime64')
jams['week'] = jams['date'].dt.isocalendar().week
jams['day'] = jams.date.dt.day_name()
jams['month'] = jams.date.dt.month
jams['date'] = jams.date.dt.date
jams['isWeekend'] = np.where(jams.day.isin(['Saturday', 'Sunday']), 1, 0)

In [9]:
# Perbaiki order dari hari
day_ordered = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
jams['day'] = jams.day.astype('category')
jams['day'] = jams.day.cat.set_categories(day_ordered, ordered=True)

In [10]:
jams.level.value_counts()

2    35460
3    31038
1    19931
4    14403
5     1490
Name: level, dtype: int64

In [11]:
def extract_long_lat(s):
    add_idx = 2 if s[0]=='M' else 1
    first_idx = s.find('((') if add_idx==2 else s.find('(')
    last_idx = s.find(')')

    long_lat = [float(x)  for pairs in s[first_idx+add_idx : last_idx].split(', ') for x in pairs.split(' ')]
    longitude, latitude = [], []
    for idx, x in enumerate(long_lat):
        if idx%2==0:
            latitude.append(x)
        else:
            longitude.append(x)
            

    return np.mean(longitude), np.mean(latitude)

In [12]:
jams['geometry'] = jams.geometry.apply(extract_long_lat)
jams['longitude'] = jams.geometry.apply(lambda x: x[0])
jams['latitude'] = jams.geometry.apply(lambda x: x[1])
jams.drop(columns=['geometry'], inplace=True)
jams.sample(3)


Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,total_records,date,hour,week,day,month,isWeekend,longitude,latitude
99660,17:00:00,N6 Jalan Raya Baru,2,934.0,85.0,21.35,3,2022-09-03,17,35,Saturday,9,1,-6.554413,106.777002
14619,19:00:00,Ir Haji Juanda,4,196.0,67.0,5.435,2,2022-07-15,19,28,Friday,7,0,-6.60416,106.796635
87306,18:00:00,Jalan Empang,4,346.0,183.0,5.42,3,2022-08-27,18,34,Saturday,8,1,-6.605964,106.795666


### Handling Missing Values

In [13]:
miss_idx = pd.isnull(jams.street)

imputer = KNeighborsClassifier(n_neighbors=3)
imputer.fit(jams.loc[~miss_idx, ['longitude', 'latitude']], jams.loc[~miss_idx, 'street'])
jams.loc[miss_idx, 'street']= imputer.predict(jams.loc[miss_idx, ['longitude', 'latitude']])
jams.isna().sum()

time                0
street              0
level               0
median_length       0
median_delay        0
median_speed_kmh    0
total_records       0
date                0
hour                0
week                0
day                 0
month               0
isWeekend           0
longitude           0
latitude            0
dtype: int64

### Rearrange Jams Level
- Low: level 1 and 2
- Medium: level 3
- High: level 4 and 5

In [14]:
jams['level'] = np.where(
    jams.level <= 2, 1,
    np.where(jams.level==3, 2, 3)
)

### EDA

In [15]:
px.bar(
    jams.groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams for each days'
)

In [16]:
px.bar(
    jams[jams.level==2].groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams Level Medium'
)

In [17]:
px.bar(
    jams[jams.level==3].groupby('day')['level'].count().reset_index(),
    x='day',
    y='level',
    title='Total Jams Level High'
)

> Kemacetan paling sering terjadi pada hari sabtu. Selain sabtu kemacetan agak lumayan sering terjadi pada hari jumat dan minggu.

In [18]:
px.bar(
    jams.groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams for each hours'
)

In [19]:
px.bar(
    jams[jams.level==2].groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams Level Medium'
)

In [20]:
px.bar(
    jams[jams.level==3].groupby('hour')['level'].count().reset_index(),
    x='hour',
    y='level',
    title='Total Jams Level High'
)

> Dari barchart diatas saya mengambil kesimpulan bahwa kemacetan mencapai puncaknya pada jam 17. Dan jam sibuknya berkisar antara jam 13 - 17.

#### Create Rush Hour Feature

In [21]:
# Membuat feature tambahan untuk rush hour
jams['isRushHour'] = np.where((jams.hour>=13) & (jams.hour<=17), 1, 0)
jams.isRushHour.value_counts()

0    61653
1    40669
Name: isRushHour, dtype: int64

In [22]:
jams.level.value_counts() / jams.shape[0] * 100

1    54.134008
2    30.333653
3    15.532339
Name: level, dtype: float64

> Imbalance Dataset Problem

### Data Transformation

transform
- hour
- week
- day
- month

In [23]:
# Create the theta values
jams['hour_theta'] = jams.hour * 2 * np.pi / 24
jams['day'] = jams.date.astype('datetime64').dt.day
jams['day_theta'] = jams.day * 2 * np.pi / 7
jams['week_theta'] = jams.week * 2 * np.pi / 52
jams['month_theta'] = jams.month * 2 * np.pi / 12

# Create sine and cosine dari theta yang telah sebelumnya
jams['hour_sin'] = np.sin(jams.hour_theta)
jams['hour_cos'] = np.cos(jams.hour_theta)
jams['day_sin'] = np.sin(jams.day_theta)
jams['day_cos'] = np.cos(jams.day_theta)
jams['week_sin'] = np.sin(jams.week_theta)
jams['week_cos'] = np.cos(jams.week_theta)
jams['month_sin'] = np.sin(jams.month_theta)
jams['month_cos'] = np.cos(jams.month_theta)

### Drop feature

In [24]:
drop_features = [
    'street', 'time', 'date', 'hour', 'week', 'day', 'month', 'hour_theta', 
    'day_theta', 'week_theta', 'month_theta'
    ]
jams.drop(columns=drop_features, inplace=True)
jams.columns

Index(['level', 'median_length', 'median_delay', 'median_speed_kmh',
       'total_records', 'isWeekend', 'longitude', 'latitude', 'isRushHour',
       'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'week_sin', 'week_cos',
       'month_sin', 'month_cos'],
      dtype='object')

In [27]:
jams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level             102322 non-null  int32  
 1   median_length     102322 non-null  float64
 2   median_delay      102322 non-null  float64
 3   median_speed_kmh  102322 non-null  float64
 4   total_records     102322 non-null  int64  
 5   isWeekend         102322 non-null  int32  
 6   longitude         102322 non-null  float64
 7   latitude          102322 non-null  float64
 8   isRushHour        102322 non-null  int32  
 9   hour_sin          102322 non-null  float64
 10  hour_cos          102322 non-null  float64
 11  day_sin           102322 non-null  float64
 12  day_cos           102322 non-null  float64
 13  week_sin          102322 non-null  Float64
 14  week_cos          102322 non-null  Float64
 15  month_sin         102322 non-null  float64
 16  month_cos         10

### Train-Test Split Dataset

In [28]:
col_feature = list(set(jams.columns)-set('level'))

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=123)

for train_idx, test_idx in sss.split(jams[col_feature], jams['level']):
    jams['is_train'] = jams.index.isin(train_idx).astype(int)

jams.is_train.value_counts() / jams.shape[0] * 100

1    69.999609
0    30.000391
Name: is_train, dtype: float64

In [29]:
# jams.to_csv('./../dataset/jams_bogor.csv', index=False)

jams_loaded = pd.read_csv('./../dataset/jams_bogor.csv')
jams_loaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level             102322 non-null  int64  
 1   median_length     102322 non-null  float64
 2   median_delay      102322 non-null  float64
 3   median_speed_kmh  102322 non-null  float64
 4   total_records     102322 non-null  int64  
 5   isWeekend         102322 non-null  int64  
 6   longitude         102322 non-null  float64
 7   latitude          102322 non-null  float64
 8   isRushHour        102322 non-null  int64  
 9   hour_sin          102322 non-null  float64
 10  hour_cos          102322 non-null  float64
 11  day_sin           102322 non-null  float64
 12  day_cos           102322 non-null  float64
 13  week_sin          102322 non-null  float64
 14  week_cos          102322 non-null  float64
 15  month_sin         102322 non-null  float64
 16  month_cos         10