# Data Preprocessing

### Importing the Dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv("Traffic_most_frequent_road_dataset_after_prepocessed.csv")
df

Unnamed: 0,Start_Lat,Start_Lng,Time,Temperature(F),Visibility(mi),Humidity(%),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Accident
0,33.638980,-117.730350,433,58.0,10.0,84.0,Cloudy,False,False,False,False,True,False,False,False,False,False,False,False
1,47.549580,-122.313470,1030,52.0,10.0,61.0,Clear,False,False,False,False,False,False,False,False,False,False,False,True
2,33.686620,-117.766000,843,71.1,10.0,55.0,Partly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False
3,47.974910,-122.190790,1073,61.0,10.0,56.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False
4,35.363611,-119.347778,722,83.0,10.0,46.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26640,33.700624,-117.775189,41,51.0,10.0,63.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False
26641,42.858170,-123.246925,1238,47.0,0.5,100.0,Rain,False,False,False,False,False,False,False,False,False,False,False,False
26642,36.532448,-120.489860,872,79.0,10.0,42.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False
26643,47.656780,-122.322440,393,50.0,10.0,74.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False


In [3]:
X = df.iloc[:, :-1].values
X

array([[33.63898, -117.73035, 433, ..., False, False, False],
       [47.54958, -122.31347, 1030, ..., False, False, False],
       [33.68662, -117.766, 843, ..., False, False, False],
       ...,
       [36.532447999999995, -120.48986000000001, 872, ..., False, False,
        False],
       [47.65678, -122.32243999999999, 393, ..., False, False, False],
       [45.38139, -122.75278, 711, ..., False, False, False]],
      dtype=object)

In [4]:
Y = df.iloc[:, 18].values
Y

array([False,  True, False, ..., False, False, False])

### Taking Care of Missing Data

In [5]:
from sklearn.impute import SimpleImputer

s_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
s_imputer = s_imputer.fit(X[:, 0:6])
X[:, 0:6] = s_imputer.transform(X[:, 0:6])
X

array([[33.63898, -117.73035, 433.0, ..., False, False, False],
       [47.54958, -122.31347, 1030.0, ..., False, False, False],
       [33.68662, -117.766, 843.0, ..., False, False, False],
       ...,
       [36.532447999999995, -120.48986000000001, 872.0, ..., False,
        False, False],
       [47.65678, -122.32243999999999, 393.0, ..., False, False, False],
       [45.38139, -122.75278, 711.0, ..., False, False, False]],
      dtype=object)

### Encoding Categorical Data

In [6]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()

In [7]:
for i in range(6, 18):
    X[:, i] = labelencoder_X.fit_transform(X[:, i])
X

array([[33.63898, -117.73035, 433.0, ..., 0, 0, 0],
       [47.54958, -122.31347, 1030.0, ..., 0, 0, 0],
       [33.68662, -117.766, 843.0, ..., 0, 0, 0],
       ...,
       [36.532447999999995, -120.48986000000001, 872.0, ..., 0, 0, 0],
       [47.65678, -122.32243999999999, 393.0, ..., 0, 0, 0],
       [45.38139, -122.75278, 711.0, ..., 0, 0, 0]], dtype=object)

In [8]:
Y = labelencoder_X.fit_transform(Y)
Y

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

### Splitting the Dataset

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train

array([[-0.22624   , -0.33313379,  0.78413266, ..., -0.0237334 ,
         0.        , -0.06252207],
       [ 1.0328441 , -0.82708124,  0.42265523, ..., -0.0237334 ,
         0.        , -0.06252207],
       [-1.20919929,  1.41952153,  0.46158357, ..., -0.0237334 ,
         0.        , -0.06252207],
       ...,
       [-1.05050232,  1.04976223,  1.04550865, ..., -0.0237334 ,
         0.        , -0.06252207],
       [ 0.55833447, -1.09678903,  0.38650749, ..., -0.0237334 ,
         0.        , -0.06252207],
       [-0.2380859 , -0.3355861 , -0.74241433, ..., -0.0237334 ,
         0.        , -0.06252207]])

## Building Simple Linear Regression Model

### Fitting Simple Linear Regression to the Training Set

In [11]:
from sklearn.linear_model import LinearRegression
classifier = LinearRegression()
classifier.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Predicting the Test Set Result

In [12]:
Y_pred = classifier.predict(X_test)

### Accuracy

In [13]:
from sklearn.metrics import mean_absolute_error
mape = mean_absolute_error(Y_test, Y_pred) * 100
print("Mean Absolute Error")
print("Error:", mape)
print("Accuracy:", 100 - mape)

Mean Absolute Error
Error: 17.508909772201996
Accuracy: 82.491090227798


In [14]:
from sklearn.metrics import mean_squared_error
mape = mean_squared_error(Y_test, Y_pred) * 100
print("Mean Squared Error")
print("Error:", mape)
print("Accuracy:", 100 - mape)

Mean Squared Error
Error: 8.793755961822884
Accuracy: 91.20624403817712


In [15]:
from sklearn.metrics import mean_squared_root_error
from math import sqrt
mape = sqrt(mean_squared_error(Y_test, Y_pred)) * 100
print("Root Mean Squared Error")
print("Error:", mape)
print("Accuracy:", 100 - mape)

Root Mean Squared Error
Error: 29.654267756636454
Accuracy: 70.34573224336354
