In [1]:
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_excel('C:/Users/nagpa/Desktop/traffic ml/trafficAnalysis/filtered_final_df3.xlsx')

In [3]:
df.head()

Unnamed: 0,VehicleSpeedAverage,VehicleSpeedVariance,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,mahala,traffic
0,6.06,115.840061,1.0548,38.039215,31,101,798.0,6.22,15,-0.5982,27.919697,17.937478,1
1,6.345,117.200816,0.9012,38.039215,31,101,797.5,6.27,15,-0.5244,26.458355,13.499702,1
2,6.585,117.559595,0.9331,90.588234,32,101,797.5,6.27,15,-0.5068,26.458355,17.52161,1
3,6.825,117.801222,0.9878,90.588234,32,101,726.0,8.63,15,-0.4882,26.004126,18.987033,1
4,7.065,117.925697,1.1316,92.941177,32,101,784.0,8.88,15,-0.6712,25.295498,17.767989,1


In [4]:
df['traffic'].value_counts()

1    7548
3    2447
2    2414
Name: traffic, dtype: int64

In [5]:
df = df.drop(columns='mahala')

In [6]:
# putting feature variables in X
X = df.drop('traffic',axis=1)
#putting target variable in Y
Y = df['traffic']

In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
smote = SMOTE(random_state=42)

In [10]:
X_smote, Y_smote = smote.fit_resample(X, Y)

In [11]:
# splitting data into train and test
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X_smote,Y_smote, train_size = 0.65,random_state=42)

In [None]:
X_train.shape, X_test.shape

## 1. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

classifier_rf = RandomForestClassifier(random_state = 42, n_jobs = 1, max_depth = 5,
                                       n_estimators=100,oob_score=True,max_features=3)

In [None]:
%%time
classifier_rf.fit(X_train.values,Y_train.values)

In [None]:
classifier_rf.oob_score_

In [None]:
# hyper parameter tuning using grid search

rf = RandomForestClassifier(random_state=42,n_jobs=1)

params = {
    'max_depth' : [2,3,5,10,20],
    'min_samples_leaf' : [5,10,20,50,100,200],
    'n_estimators' : [10,25,30,50,100,200]
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=1, verbose=1, scoring = "accuracy")

In [None]:
%%time
grid_search.fit(X_train.values,Y_train.values)

In [None]:
grid_search.best_score_

In [None]:
results_rf = pd.DataFrame(grid_search.cv_results_)
scores_rf = results_rf[['mean_test_score', 'param_n_estimators',  'param_min_samples_leaf']]
scores_rf = scores_rf.sort_values(by = 'mean_test_score',ascending = False)

In [None]:
scores_rf = scores_rf.nlargest(30,'mean_test_score')

plt.figure(figsize = (15,8))
plt.bar(range(len(scores_rf)),scores_rf['mean_test_score'],align='center')

for i, score in enumerate(scores_rf['mean_test_score']):
    plt.text(i,score,f'{score:.3f}', ha='center', va='bottom')


plt.title('Grid Search Scores')
plt.xlabel('Hyperparameter Combination Index')
plt.ylabel('Mean Test Score')
plt.xticks(range(len(scores_rf)), [f"{row.param_n_estimators}, {row.param_min_samples_leaf}"
                                 for idx, row in scores_rf.iterrows()], rotation=90)

# Adjust layout to fit labels
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
best_hyperparams = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparams)

In [None]:
rf_best = grid_search.best_estimator_
rf_best

In [None]:
Y_pred = rf_best.predict(X_test)

In [None]:
from sklearn import metrics 

In [None]:
print("ACCURACY OF THE MODEL:", metrics.accuracy_score(Y_test, Y_pred))

In [None]:
X_test

In [None]:
Y_test

In [None]:
rf_best.predict([[12.66,36,-0.2,38,44,102,790,5.5,16,-.1,20]])

In [None]:
rf_best.predict([[30,5,0,20,80,100,1500,10,25,0,5]])

In [None]:
rf_best.predict([[10,20,-1,70,100,120,2500,25,40,0.5,10]])

In [None]:
Y_smote

In [None]:
pd.Series(Y_smote).value_counts()

In [None]:
rf_best.predict([[3.3,24,-0.2,33,49,102,780,5,18,-0.07,22]])

In [None]:
rf_best.predict([[30,45,0.03,55,70,117,1900,20,31,-0.2,13]])

In [None]:
merged_test_data = pd.concat([X_test, Y_test], axis=1)

In [None]:
normal_congestion = merged_test_data[merged_test_data['traffic']==2]
low_congestion = merged_test_data[merged_test_data['traffic']==1]
high_congestion = merged_test_data[merged_test_data['traffic'] ==3]

In [None]:
rf_best.predict([[49,510,0.8,30,83,98,750,4.5,28,-0.6429,14]])

In [None]:
rf_best.predict([[22,26,-0.6,64,60,120,1800,22,24,0.5,17]])

## 2. Neural Networks

In [12]:
X_nn = np.array(X_smote)
Y_nn = np.array(Y_smote)

In [13]:
X_nn[:10], Y_nn[:10]

(array([[ 6.05999982e+00,  1.15840061e+02,  1.05480000e+00,
          3.80392151e+01,  3.10000000e+01,  1.01000000e+02,
          7.98000000e+02,  6.21999979e+00,  1.50000000e+01,
         -5.98200000e-01,  2.79196968e+01],
        [ 6.34499983e+00,  1.17200816e+02,  9.01200000e-01,
          3.80392151e+01,  3.10000000e+01,  1.01000000e+02,
          7.97500000e+02,  6.26999998e+00,  1.50000000e+01,
         -5.24400000e-01,  2.64583550e+01],
        [ 6.58499982e+00,  1.17559595e+02,  9.33100000e-01,
          9.05882340e+01,  3.20000000e+01,  1.01000000e+02,
          7.97500000e+02,  6.26999998e+00,  1.50000000e+01,
         -5.06800000e-01,  2.64583550e+01],
        [ 6.82499981e+00,  1.17801222e+02,  9.87800000e-01,
          9.05882340e+01,  3.20000000e+01,  1.01000000e+02,
          7.26000000e+02,  8.63000011e+00,  1.50000000e+01,
         -4.88200000e-01,  2.60041256e+01],
        [ 7.06499981e+00,  1.17925697e+02,  1.13160000e+00,
          9.29411774e+01,  3.20000000e+01,  

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# pip install keras

In [16]:
# pip install tensorflow

In [17]:
from keras.utils.np_utils import to_categorical

In [18]:
l_encode = LabelEncoder()
l_encode.fit(Y)
Y_nn = l_encode.transform(Y_nn)
Y_nn = to_categorical(Y_nn)
Y_nn

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [19]:
Y.shape

(12409,)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_nn,Y_nn, test_size = 0.3, random_state = 0)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((15850, 11), (6794, 11), (15850, 3), (6794, 3))

In [21]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100)

In [22]:
import os
import multiprocessing

# Get the number of CPU cores
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 16


In [28]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [400, 1800, 1600],
    'batch_size': ['auto', 32, 64, 128]
}

grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5, scoring='accuracy',n_jobs=1,verbose = 2)
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=400, solver=adam; total time=   4.2s
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=400, solver=adam; total time=   5.4s
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=400, solver=adam; total time=   6.3s
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=400, solver=adam; total time=   2.4s
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=400, solver=adam; total time=   3.0s
[CV] END activation=tanh, alpha=0.0001, batch_size=auto, hidden_layer_sizes=(50,), learning_rate=constant, max_iter=1800, solver=adam; total time=   4.3s
[CV] END activati

GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.001, 0.01],
                         'batch_size': ['auto', 32, 64, 128],
                         'hidden_layer_sizes': [(50,), (100,), (50, 50),
                                                (100, 100)],
                         'learning_rate': ['constant', 'adaptive'],
                         'max_iter': [400, 1800, 1600], 'solver': ['adam']},
             scoring='accuracy', verbose=2)

In [29]:
grid_search.best_score_

0.9495268138801262

In [30]:
grid_search.best_params_

{'activation': 'relu',
 'alpha': 0.001,
 'batch_size': 64,
 'hidden_layer_sizes': (100, 100),
 'learning_rate': 'adaptive',
 'max_iter': 1800,
 'solver': 'adam'}

In [32]:
mlp_best = grid_search.best_estimator_

In [33]:
import joblib

model_filename = 'mlp_best.pkl'
joblib.dump(mlp_best, model_filename)
print(f'Model saved to {model_filename}')

Model saved to mlp_best.pkl


In [36]:
Y_pred = mlp_best.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [37]:
accuracy_mlp = accuracy_score(Y_test, Y_pred)
print(f'Accuracy on testing data: {accuracy_mlp:.2f}')

Accuracy on testing data: 0.93


In [38]:
report = classification_report(Y_test, Y_pred)
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      2265
           1       0.91      0.94      0.93      2208
           2       0.96      0.98      0.97      2321

   micro avg       0.95      0.94      0.95      6794
   macro avg       0.95      0.94      0.95      6794
weighted avg       0.95      0.94      0.95      6794
 samples avg       0.93      0.94      0.94      6794



  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
mlp_best.predict([[22,26,-0.6,64,60,120,1800,22,24,0.5,17]])

array([[0, 1, 0]])

In [44]:
mlp_best.predict([[12.66,36,-0.2,38,44,102,790,5.5,16,-.1,20]])

array([[0, 0, 1]])