# Predict Traffic Jam Level in Kota Bandung using Machine Learning Modelling

<div align="center">
<img src="https://i.imgur.com/P5PeqCs.jpg">
</div>

# Initialization

In [None]:
!pip install geopandas



In [None]:
!pip install rfpimp



In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import geopandas
import json
from shapely.geometry import LineString, MultiLineString

df = pd.read_csv('drive/MyDrive/Traffic_Jams.csv')
df.head(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,avg_length,avg_delay,avg_speed_kmh,total_records,id,date,geometry
0,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Van Deventer,3,176.0,106.0,4.92,1,11847,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.61..."
1,2019-05-31 23:00:00,32.73,KOTA BANDUNG,HOS Cokroaminoto,3,307.0,144.0,6.13,1,14892,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.59..."
2,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Sukaati,2,207.0,60.0,6.11,1,11803,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.62..."
3,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Lama,3,220.0,161.5,3.931,10,11941,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[..."
4,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Raya,3,1453.4839,475.6129,8.523226,31,11942,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[..."


In [None]:
list(df.columns)

['time',
 'kemendagri_kabupaten_kode',
 'kemendagri_kabupaten_nama',
 'street',
 'level',
 'avg_length',
 'avg_delay',
 'avg_speed_kmh',
 'total_records',
 'id',
 'date',
 'geometry']

In [None]:
df.shape

(136807, 12)

# Exploratory Data Analysis

## Data Wrangling

In [None]:
df.isnull().mean()


# No missing values except for the street feature. No treatment needed for this missing value because street feature will be
# replaced by Sub Wilayah Kota in the next section.

time                         0.000000
kemendagri_kabupaten_kode    0.000000
kemendagri_kabupaten_nama    0.000000
street                       0.007916
level                        0.000000
avg_length                   0.000000
avg_delay                    0.000000
avg_speed_kmh                0.000000
total_records                0.000000
id                           0.000000
date                         0.000000
geometry                     0.000000
dtype: float64

## Data Transformation: Time to Day and Hour

In [None]:
# Day and Hour will be more representative and helpful for the model compared to Time as it might reveal the seasonal pattern.

df['hour'] = df['time'].apply(lambda x : x[11:13]) # Get hours (categorical var)
df['day'] = df['time'].apply(lambda x : datetime.strptime(x[:10], '%Y-%m-%d').strftime('%A')) # Get day
df.head(10)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,avg_length,avg_delay,avg_speed_kmh,total_records,id,date,geometry,hour,day
0,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Van Deventer,3,176.0,106.0,4.92,1,11847,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.61...",16,Friday
1,2019-05-31 23:00:00,32.73,KOTA BANDUNG,HOS Cokroaminoto,3,307.0,144.0,6.13,1,14892,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.59...",23,Friday
2,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Sukaati,2,207.0,60.0,6.11,1,11803,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[107.62...",16,Friday
3,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Lama,3,220.0,161.5,3.931,10,11941,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",17,Friday
4,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Raya,3,1453.4839,475.6129,8.523226,31,11942,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",17,Friday
5,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Raya,4,1266.6395,822.4186,4.809651,86,11943,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",17,Friday
6,2019-05-31 18:00:00,32.73,KOTA BANDUNG,Citayam Raya,2,299.0,71.0,8.17,1,12843,2019-05-31,"{""type"": ""LineString"", ""coordinates"": [[106.80...",18,Friday
7,2019-05-31 18:00:00,32.73,KOTA BANDUNG,Gatot Subroto,2,1217.0602,100.216866,19.438795,83,12882,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",18,Friday
8,2019-05-31 18:00:00,32.73,KOTA BANDUNG,Gatot Subroto,3,770.7159,130.03409,12.962614,88,12883,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",18,Friday
9,2019-05-31 18:00:00,32.73,KOTA BANDUNG,Jalan Jurang,2,625.0,123.5,9.976666,6,12934,2019-05-31,"{""type"": ""MultiLineString"", ""coordinates"": [[[...",18,Friday


## Data Transformation: Geometry to Sub Wilayah Kota (SWK)

In [None]:
# Geometry feature will be translated to Sub Wilayah Kota (SWK) / Sub Area of City. To enable the process, I use the external data from the government site

url = "http://data.bandung.go.id/dataset/fbec5a0e-2efe-4d37-99ee-7a55d32beb20/resource/0a9aefc2-c849-4fb5-b393-b1687dbe8d70/download/3273-kota-bandung-level-kewilayahan.json"
gdf_swk = geopandas.read_file(url)
gdf_swk

Unnamed: 0,id_wilayah,nama_wilayah,geometry
0,5F142D26,Ujungberung,"POLYGON ((107.71456 -6.89047, 107.71457 -6.890..."
1,CC7B08F0,Tegallega,"POLYGON ((107.56940 -6.91107, 107.56964 -6.911..."
2,BB4C0B50,Kordon,"POLYGON ((107.66345 -6.93120, 107.66400 -6.931..."
3,6CE3653D,Karees,"POLYGON ((107.65666 -6.90218, 107.65700 -6.902..."
4,932659F2,Cibeunying,"POLYGON ((107.60411 -6.84220, 107.60407 -6.842..."
5,7D00F8E4,Bojonagara,"POLYGON ((107.59529 -6.84090, 107.59532 -6.840..."
6,0A471A31,Gedebage,"POLYGON ((107.68166 -6.93780, 107.68166 -6.937..."
7,B3D12B15,Arcamanik,"POLYGON ((107.67346 -6.89192, 107.67349 -6.891..."


In [None]:
# Standardize geometry feature so it will be processed

df['geometry'] = df['geometry'].apply(lambda x : (json.loads(x)['type'] == 'LineString' and LineString(json.loads(x)['coordinates']) or 
                                                  json.loads(x)['type'] == 'MultiLineString' and MultiLineString(json.loads(x)['coordinates'])))
df.head(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,avg_length,avg_delay,avg_speed_kmh,total_records,id,date,geometry,hour,day
0,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Van Deventer,3,176.0,106.0,4.92,1,11847,2019-05-31,"LINESTRING (107.614892 -6.917784, 107.615278 -...",16,Friday
1,2019-05-31 23:00:00,32.73,KOTA BANDUNG,HOS Cokroaminoto,3,307.0,144.0,6.13,1,14892,2019-05-31,"LINESTRING (107.597441 -6.901219, 107.597453 -...",23,Friday
2,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Sukaati,2,207.0,60.0,6.11,1,11803,2019-05-31,"LINESTRING (107.62308 -6.96115, 107.623086 -6....",16,Friday
3,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Lama,3,220.0,161.5,3.931,10,11941,2019-05-31,"(LINESTRING (107.5924 -6.948026, 107.592741 -6...",17,Friday
4,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Raya,3,1453.4839,475.6129,8.523226,31,11942,2019-05-31,"(LINESTRING (107.593294 -6.953205, 107.593288 ...",17,Friday


In [None]:
# Use centroid of every lines in geometry column to represent street

d = {'street': list(df['street']), 'geometry': list(df['geometry'])}
gdf = geopandas.GeoDataFrame(d, crs="EPSG:3395")
gdf['centroid'] = gdf.centroid
gdf.head(5)

Unnamed: 0,street,geometry,centroid
0,Van Deventer,"LINESTRING (107.61489 -6.91778, 107.61528 -6.9...",POINT (107.61555 -6.91734)
1,HOS Cokroaminoto,"LINESTRING (107.59744 -6.90122, 107.59745 -6.9...",POINT (107.59748 -6.90261)
2,Sukaati,"LINESTRING (107.62308 -6.96115, 107.62309 -6.9...",POINT (107.62304 -6.96022)
3,Cibaduyut Lama,"MULTILINESTRING ((107.59240 -6.94803, 107.5927...",POINT (107.59335 -6.94835)
4,Cibaduyut Raya,"MULTILINESTRING ((107.59329 -6.95320, 107.5932...",POINT (107.59361 -6.95480)


In [None]:
# Label every centroid with respective SWK

swk_list = list(gdf_swk['nama_wilayah'])
for i in range(len(swk_list)):
  gdf["wilayah_"+swk_list[i]] = gdf["centroid"].within(gdf_swk['geometry'][i]).astype('int')

def get_wilayah(row):
  for c in gdf.iloc[:, 3:].columns:
    if row[c]==1:
      return c[8:]

gdf["wilayah"] = gdf.iloc[:, 3:].apply(get_wilayah, axis=1)
gdf.head(5)

Unnamed: 0,street,geometry,centroid,wilayah_Ujungberung,wilayah_Tegallega,wilayah_Kordon,wilayah_Karees,wilayah_Cibeunying,wilayah_Bojonagara,wilayah_Gedebage,wilayah_Arcamanik,wilayah
0,Van Deventer,"LINESTRING (107.61489 -6.91778, 107.61528 -6.9...",POINT (107.61555 -6.91734),0,0,0,0,1,0,0,0,Cibeunying
1,HOS Cokroaminoto,"LINESTRING (107.59744 -6.90122, 107.59745 -6.9...",POINT (107.59748 -6.90261),0,0,0,0,0,1,0,0,Bojonagara
2,Sukaati,"LINESTRING (107.62308 -6.96115, 107.62309 -6.9...",POINT (107.62304 -6.96022),0,0,1,0,0,0,0,0,Kordon
3,Cibaduyut Lama,"MULTILINESTRING ((107.59240 -6.94803, 107.5927...",POINT (107.59335 -6.94835),0,1,0,0,0,0,0,0,Tegallega
4,Cibaduyut Raya,"MULTILINESTRING ((107.59329 -6.95320, 107.5932...",POINT (107.59361 -6.95480),0,1,0,0,0,0,0,0,Tegallega


In [None]:
# If the wilayah is not empty, then the street is validated as the part of Kota Bandung according to this site:
# http://data.bandung.go.id/dataset/data-spasial-sub-wilayah-kota-bandung/resource/0a9aefc2-c849-4fb5-b393-b1687dbe8d70?view_id=c0d828a7-0aae-4fe8-ae70-591c8ea1f1de

for col in gdf.iloc[:, 3:].columns:
  df[col] = gdf[col]

df = df.dropna() # excluding all streets outside Kota Bandung
df.head(5)

Unnamed: 0,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,level,avg_length,avg_delay,avg_speed_kmh,total_records,id,date,geometry,hour,day,wilayah_Ujungberung,wilayah_Tegallega,wilayah_Kordon,wilayah_Karees,wilayah_Cibeunying,wilayah_Bojonagara,wilayah_Gedebage,wilayah_Arcamanik,wilayah
0,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Van Deventer,3,176.0,106.0,4.92,1,11847,2019-05-31,"LINESTRING (107.614892 -6.917784, 107.615278 -...",16,Friday,0,0,0,0,1,0,0,0,Cibeunying
1,2019-05-31 23:00:00,32.73,KOTA BANDUNG,HOS Cokroaminoto,3,307.0,144.0,6.13,1,14892,2019-05-31,"LINESTRING (107.597441 -6.901219, 107.597453 -...",23,Friday,0,0,0,0,0,1,0,0,Bojonagara
2,2019-05-31 16:00:00,32.73,KOTA BANDUNG,Sukaati,2,207.0,60.0,6.11,1,11803,2019-05-31,"LINESTRING (107.62308 -6.96115, 107.623086 -6....",16,Friday,0,0,1,0,0,0,0,0,Kordon
3,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Lama,3,220.0,161.5,3.931,10,11941,2019-05-31,"(LINESTRING (107.5924 -6.948026, 107.592741 -6...",17,Friday,0,1,0,0,0,0,0,0,Tegallega
4,2019-05-31 17:00:00,32.73,KOTA BANDUNG,Cibaduyut Raya,3,1453.4839,475.6129,8.523226,31,11942,2019-05-31,"(LINESTRING (107.593294 -6.953205, 107.593288 ...",17,Friday,0,1,0,0,0,0,0,0,Tegallega


## What time has the highest traffic jam level?

This is the result of traffic jam observation counts by hour:

In [None]:
df[df['level'] == 5].groupby(['hour']).count()['level'].sort_values(ascending=False).head(5)

hour
20    11
17    10
23     9
21     9
19     9
Name: level, dtype: int64

In [None]:
df[df['level'] == 4].groupby(['hour']).count()['level'].sort_values(ascending=False).head(5)

hour
17    1686
16    1471
14    1350
11    1299
13    1224
Name: level, dtype: int64

In [None]:
df[df['level'] == 3].groupby(['hour']).count()['level'].sort_values(ascending=False).head(5)

hour
17    2980
16    2697
14    2496
13    2455
11    2325
Name: level, dtype: int64

In [None]:
df[df['level'] == 2].groupby(['hour']).count()['level'].sort_values(ascending=False).head(5)

hour
17    2644
16    2433
14    2247
13    2232
18    2190
Name: level, dtype: int64

In [None]:
df[df['level'] == 1].groupby(['hour']).count()['level'].sort_values(ascending=False).head(5)

hour
17    1016
18     957
16     939
19     931
14     857
Name: level, dtype: int64

### Insights
* 5 PM is on the top 5 for all traffic jam level, not recommended to drive around that time.
* There is the same group of hour in the traffic jam level 3 and 4, avoid driving at 2-5 PM and also 11 AM.

## What time and day has the highest traffic jam level?

This is the result of traffic jam observation counts by hour and day:

In [None]:
df[df['level'] == 5].groupby(['day', 'hour']).count()['level'].sort_values(ascending=False).head(5)

day       hour
Saturday  17      4
          20      3
Sunday    21      2
          18      2
Saturday  16      2
Name: level, dtype: int64

In [None]:
df[df['level'] == 4].groupby(['day', 'hour']).count()['level'].sort_values(ascending=False).head(5)

day       hour
Friday    17      333
Saturday  17      269
          16      264
Friday    16      259
Saturday  14      259
Name: level, dtype: int64

In [None]:
df[df['level'] == 3].groupby(['day', 'hour']).count()['level'].sort_values(ascending=False).head(5)

day       hour
Friday    17      520
Monday    17      470
Saturday  16      466
          17      452
          14      429
Name: level, dtype: int64

In [None]:
df[df['level'] == 2].groupby(['day', 'hour']).count()['level'].sort_values(ascending=False).head(5)

day       hour
Friday    17      424
Saturday  17      418
Monday    16      401
          17      397
Saturday  16      389
Name: level, dtype: int64

In [None]:
df[df['level'] == 1].groupby(['day', 'hour']).count()['level'].sort_values(ascending=False).head(5)

day       hour
Saturday  17      184
          19      162
          16      158
Friday    18      151
Saturday  18      151
Name: level, dtype: int64

### Insights

* Top 5 hours on traffic jam level 5 occured on the weekend at 4-6 PM and 8-9 PM, avoid driving at those times.
* Friday 5 PM has the highest occurences of traffic jam level 2-4.
* On the traffic jam level 1, highest occurences is on Saturday 4-7 PM, indicating that the traffic jam level tends to be decreased in the evening.

## Further Analysis: SWK

The previous analyses will be expanded to further analysis that includes SWK. This is the result:

In [None]:
df[df['level'] == 5].groupby(['day', 'hour', 'wilayah']).count()['level'].sort_values(ascending=False).head(5)

day       hour  wilayah   
Saturday  17    Cibeunying    3
Sunday    20    Cibeunying    2
          16    Cibeunying    2
Saturday  16    Cibeunying    2
          18    Cibeunying    2
Name: level, dtype: int64

In [None]:
df[df['level'] == 4].groupby(['day', 'hour', 'wilayah']).count()['level'].sort_values(ascending=False).head(5)

day       hour  wilayah   
Friday    17    Cibeunying    96
Saturday  14    Cibeunying    93
          20    Cibeunying    88
          19    Cibeunying    88
Friday    17    Bojonagara    87
Name: level, dtype: int64

In [None]:
df[df['level'] == 3].groupby(['day', 'hour', 'wilayah']).count()['level'].sort_values(ascending=False).head(5)

day       hour  wilayah   
Friday    17    Cibeunying    166
Saturday  14    Cibeunying    152
          19    Cibeunying    148
          13    Cibeunying    147
Monday    14    Cibeunying    147
Name: level, dtype: int64

In [None]:
df[df['level'] == 2].groupby(['day', 'hour', 'wilayah']).count()['level'].sort_values(ascending=False).head(5)

day       hour  wilayah   
Friday    17    Cibeunying    138
Saturday  19    Cibeunying    127
          14    Cibeunying    125
          20    Cibeunying    124
Monday    13    Cibeunying    122
Name: level, dtype: int64

In [None]:
df[df['level'] == 1].groupby(['day', 'hour', 'wilayah']).count()['level'].sort_values(ascending=False).head(5)

day       hour  wilayah   
Saturday  17    Cibeunying    60
Sunday    17    Cibeunying    52
Friday    18    Cibeunying    48
          17    Cibeunying    48
Saturday  13    Cibeunying    47
Name: level, dtype: int64

### Insights

* For all traffic jam levels, we can see that Cibeunying is on the top 5 except for the level 4.
* Jumat dan Sabtu pukul 5 sore merupakan waktu dengan kejadian terbanyak untuk semua tingkat kemacetan, hindari berkendara di daerah Cibeunying pada waktu-waktu tersebut.
* Friday 5 PM is has the highest occurences in traffic jam level 2-4 in Cibeunying, avoid driving in Cibeunying area at that time.

# Modelling

## Feature Selection

In [None]:
df.columns

Index(['time', 'kemendagri_kabupaten_kode', 'kemendagri_kabupaten_nama',
       'street', 'level', 'avg_length', 'avg_delay', 'avg_speed_kmh',
       'total_records', 'id', 'date', 'geometry', 'hour', 'day',
       'wilayah_Ujungberung', 'wilayah_Tegallega', 'wilayah_Kordon',
       'wilayah_Karees', 'wilayah_Cibeunying', 'wilayah_Bojonagara',
       'wilayah_Gedebage', 'wilayah_Arcamanik', 'wilayah'],
      dtype='object')

In [None]:
# Features that will not contribute the model will be dropped

df_model = df.drop(['kemendagri_kabupaten_kode', 'kemendagri_kabupaten_nama'], axis=1) # drop features with no variability
df_model = df_model.drop(['time', 'street', 'geometry', 'wilayah'], axis=1) # drop transformed features
df_model = df_model.drop(['total_records', 'id', 'date'], axis=1) # drop non-contributed features
df_model.head(5)

Unnamed: 0,level,avg_length,avg_delay,avg_speed_kmh,hour,day,wilayah_Ujungberung,wilayah_Tegallega,wilayah_Kordon,wilayah_Karees,wilayah_Cibeunying,wilayah_Bojonagara,wilayah_Gedebage,wilayah_Arcamanik
0,3,176.0,106.0,4.92,16,Friday,0,0,0,0,1,0,0,0
1,3,307.0,144.0,6.13,23,Friday,0,0,0,0,0,1,0,0
2,2,207.0,60.0,6.11,16,Friday,0,0,1,0,0,0,0,0
3,3,220.0,161.5,3.931,17,Friday,0,1,0,0,0,0,0,0
4,3,1453.4839,475.6129,8.523226,17,Friday,0,1,0,0,0,0,0,0


## Encode Categorical Features

In [None]:
# Hour and Day will be seen as categorical features, so that we need to encode those

def create_dummy_df(df, cat_cols, dummy_na):
    for col in cat_cols:
        try:
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', dummy_na=dummy_na)], axis=1)
        except:
            continue

    return df

cat_df = df_model.select_dtypes(include=['object'])
cat_cols_lst = cat_df.columns

df_model = create_dummy_df(df_model, cat_cols_lst, dummy_na=False)
df_model.head(5)

Unnamed: 0,level,avg_length,avg_delay,avg_speed_kmh,wilayah_Ujungberung,wilayah_Tegallega,wilayah_Kordon,wilayah_Karees,wilayah_Cibeunying,wilayah_Bojonagara,wilayah_Gedebage,wilayah_Arcamanik,hour_00,hour_01,hour_02,hour_03,hour_04,hour_05,hour_06,hour_07,hour_08,hour_09,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,3,176.0,106.0,4.92,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,3,307.0,144.0,6.13,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
2,2,207.0,60.0,6.11,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,3,220.0,161.5,3.931,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,3,1453.4839,475.6129,8.523226,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


## Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

y = df_model['level']
X = df_model.iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## Multiclass Classifier Model



In [None]:
# One vs Rest (OVR Classifier)

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate and Fit
ovr_log = OneVsRestClassifier(LogisticRegression()).fit(X_train, y_train)
ovr_dt = OneVsRestClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
ovr_rf = OneVsRestClassifier(RandomForestClassifier()).fit(X_train, y_train)

# Predict
y_test_pred_log = ovr_log.predict(X_test)
y_test_pred_dt = ovr_dt.predict(X_test)
y_test_pred_rf = ovr_rf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
# One vs One (OVO Classifier)

from sklearn.multiclass import OneVsOneClassifier

# Instantiate and Fit
ovo_log = OneVsOneClassifier(LogisticRegression()).fit(X_train, y_train)
ovo_dt = OneVsOneClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
ovo_rf = OneVsOneClassifier(RandomForestClassifier()).fit(X_train, y_train)

# Prediksi
y_test_pred_log_ovo = ovo_log.predict(X_test)
y_test_pred_dt_ovo = ovo_dt.predict(X_test)
y_test_pred_rf_ovo = ovo_rf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

## Model Selection

For model selection, I compare the R-squared and accuracy score. Model with the highest value for both metrics will be chosen.

In [None]:
from sklearn.metrics import r2_score, accuracy_score

# OVR

## Logistic OVR
print("R-Square Logisitc OVR: " + str(r2_score(y_test, y_test_pred_log)))
print("Accuracy Logisitc OVR: " + str(round(accuracy_score(y_test, y_test_pred_log)*100,2)) + "%")

## Decision Tree OVR
print("R-Square Decision Tree OVR: " + str(r2_score(y_test, y_test_pred_dt)))
print("Accuracy Decision Tree OVR: " + str(round(accuracy_score(y_test, y_test_pred_dt)*100,2)) + "%")

## Random Forest OVR
print("R-Square Random Forest OVR: " + str(r2_score(y_test, y_test_pred_rf)))
print("Accuracy Random Forest OVR: " + str(round(accuracy_score(y_test, y_test_pred_rf)*100,2)) + "%")

R-Square Logisitc OVR: 0.6016210820249503
Accuracy Logisitc OVR: 69.98%
R-Square Decision Tree OVR: -0.19255393546708643
Accuracy Decision Tree OVR: 70.43%
R-Square Random Forest OVR: 0.7023926602516204
Accuracy Random Forest OVR: 77.98%


In [None]:
# OVO

## Logistic OVO
print("R-Square Logisitc OVO: " + str(r2_score(y_test, y_test_pred_log_ovo)))
print("Accuracy Logisitc OVO: " + str(round(accuracy_score(y_test, y_test_pred_log_ovo)*100,2)) + "%")

## Decision Tree OVO
print("R-Square Decision Tree OVO: " + str(r2_score(y_test, y_test_pred_dt_ovo)))
print("Accuracy Decision Tree OVO: " + str(round(accuracy_score(y_test, y_test_pred_dt_ovo)*100,2)) + "%")

## Random Forest OVO
print("R-Square Random Forest OVO: " + str(r2_score(y_test, y_test_pred_rf_ovo)))
print("Accuracy Random Forest OVO: " + str(round(accuracy_score(y_test, y_test_pred_rf_ovo)*100,2)) + "%")

R-Square Logisitc OVO: 0.6696908459035691
Accuracy Logisitc OVO: 75.36%
R-Square Decision Tree OVO: 0.6466537715921394
Accuracy Decision Tree OVO: 75.64%
R-Square Random Forest OVO: 0.7075582971677457
Accuracy Random Forest OVO: 78.51%


As seen from the results, I decide to choose **Random Forest Classifier model with One vs One Classifier strategy** with the R-squared score of 0.707 and accuracy score of 78.51%.

## k-Fold Cross Validation on the Selected Model

In [None]:
# The validation is to check whether the model is overfit or not

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = ovo_rf, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 77.89 %
Standard Deviation: 0.48 %


With the accuracy difference of 0.62% from the selected model, we can conclude that the model is fitted well.



## Measure Feature Importance

In [None]:
# Permutation Feature will be implemented

from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    return r2_score(y_train, ovo_rf.predict(X_train))

perm_imp_rfpimp = permutation_importances(ovo_rf, X_train, y_train, r2)

In [None]:
perm_imp_rfpimp.sort_values

<bound method DataFrame.sort_values of                      Importance
Feature                        
avg_delay              1.140092
avg_speed_kmh          0.551702
avg_length             0.546626
wilayah_Cibeunying     0.072914
wilayah_Karees         0.072222
wilayah_Bojonagara     0.049840
wilayah_Tegallega      0.044995
day_Wednesday          0.027458
day_Saturday           0.027227
day_Friday             0.026997
day_Thursday           0.026305
day_Sunday             0.025843
day_Tuesday            0.021920
day_Monday             0.020767
hour_17                0.018921
wilayah_Kordon         0.016844
hour_11                0.016613
hour_18                0.015921
hour_14                0.015690
hour_20                0.015460
hour_15                0.013844
hour_16                0.013383
hour_12                0.012229
hour_13                0.011768
hour_10                0.011076
hour_19                0.008768
wilayah_Ujungberung    0.008537
wilayah_Arcamanik      0.008537
h

### Insights

* Average delay is the most affecting factor on traffic jam level, followed by average speed and average length.
* Vehicle presence on Cibeunying, Karees and Bojonagara become the top 3 of area effects
* Wednesday, Saturday and Friday become the top 3 days of the most affecting factor on traffic jam level.
* 5 PM becomes the most affecting hour factor on traffic jam level.

# Conclusions and Recommendations

**According to the given insights, bullet points we can take are as follows:**

* **Avoid driving at 5 PM especially on Friday and Saturday** because according to the Exploratory Data Analysism 5 PM is always on the top 5 of all traffic jam level. Other than that, 5 PM also becomes the most affecting hour feature compared to another hours.
* **Avoid driving in Cibeunying** at the time mentioned on the first point because traffic jam in Cibeunying has the most occurences for all level. Other than that, Cibeunying is the most affecting area feature compared to another areas.
* **Average delay is the most affecting factor on traffic jam level**, suggested to take alternative ways or wait to drive later if you have a quite long delay.