In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import sklearn

In [2]:
car_crashes_df = sns.load_dataset('car_crashes')
car_crashes_df.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


In [3]:
car_crashes_df.isna().sum()

total             0
speeding          0
alcohol           0
not_distracted    0
no_previous       0
ins_premium       0
ins_losses        0
abbrev            0
dtype: int64

In [4]:
car_crashes_df.duplicated().unique()

array([False])

In [5]:
car_crashes_df.describe()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,15.790196,4.998196,4.886784,13.573176,14.004882,886.957647,134.493137
std,4.122002,2.017747,1.729133,4.508977,3.764672,178.296285,24.835922
min,5.9,1.792,1.593,1.76,5.9,641.96,82.75
25%,12.75,3.7665,3.894,10.478,11.348,768.43,114.645
50%,15.6,4.608,4.554,13.857,13.775,858.97,136.05
75%,18.5,6.439,5.604,16.14,16.755,1007.945,151.87
max,23.9,9.45,10.038,23.661,21.28,1301.52,194.78


In [6]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
def metrics(y_test,y_pred):
    MAE = mean_absolute_error(y_test,y_pred)
    MSE = mean_squared_error(y_test,y_pred)
    RSME = np.sqrt(MSE)

    MAE_percentage = MAE*100/y.mean()
    RSME_percentage = RSME*100/y.mean()

    print(f'MAE: {MAE}, percentege: {MAE_percentage}\n RSME: {RSME}, percentage: {RSME_percentage}')

In [7]:
X = car_crashes_df.drop(['total','abbrev'],axis=1)
y = car_crashes_df['total']

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

scaled_X_train = pd.DataFrame(scaled_X_train,columns=X_train.columns)
scaled_X_test = pd.DataFrame(scaled_X_test,columns=X_test.columns)

# Support vector machine

In [10]:
from sklearn.svm import SVR
svr_model = SVR()

In [11]:
from sklearn.model_selection import GridSearchCV
params = {'kernel': ['linear','rbf','poly'],'C': [0.1,1,10,100],'gamma':[0.01,0.1,0.2,0.4]}
gridCV = GridSearchCV(svr_model,params,scoring='neg_mean_squared_error')
gridCV.fit(scaled_X_train,y_train)

In [12]:
gridCV.best_estimator_

In [13]:
svr_model = SVR(kernel='linear',C=100,gamma=0.01)
svr_model.fit(scaled_X_train,y_train)
y_pred = svr_model.predict(scaled_X_test)

In [14]:
metrics(y_test,y_pred)

MAE: 0.6503029715622386, percentege: 4.118397063165799
 RSME: 0.8075991509095792, percentage: 5.1145606229217115


# Ridge regression

In [15]:
from sklearn.linear_model import RidgeCV,Ridge
ridgeCV_model = RidgeCV(alphas=np.linspace(0.01,10,1000),scoring='neg_mean_absolute_error')
ridgeCV_model.fit(scaled_X_train,y_train)
ridgeCV_model.alpha_

0.06999999999999999

In [16]:
ridge_model = Ridge(alpha=0.7)
ridge_model.fit(scaled_X_train,y_train)
y_pred = ridge_model.predict(scaled_X_test)

metrics(y_test,y_pred)

MAE: 0.5993165721981789, percentege: 3.79549797368771
 RSME: 0.7535473363263565, percentage: 4.772248125250738


# Lasso regression

In [17]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
alpha = [10,100,250,500,1000,2000]
params = {'alpha': alpha}
lasso_model = Lasso()
grid = GridSearchCV(lasso_model,params)
grid.fit(scaled_X_train,y_train)

In [18]:
grid.best_estimator_

In [19]:
lasso_model = Lasso(alpha=10)
lasso_model.fit(scaled_X_train,y_train)
y_pred = lasso_model.predict(scaled_X_test)

metrics(y_test,y_pred)

MAE: 2.5620454545454545, percentege: 16.22554553356739
 RSME: 2.9849138673426645, percentage: 18.90358962305674


# KNN

In [20]:
from sklearn.neighbors import KNeighborsRegressor

params = {'n_neighbors':[1,2,3,4,5,10,20]}
knn_model = KNeighborsRegressor()
gridCV = GridSearchCV(knn_model,params,scoring='neg_mean_squared_error')
gridCV.fit(scaled_X_train,y_train)
gridCV.best_params_

{'n_neighbors': 4}

In [21]:
knn_model = KNeighborsRegressor(n_neighbors=4)
knn_model.fit(X_train,y_train)
y_pred = knn_model.predict(scaled_X_test)
metrics(y_test,y_pred)

MAE: 2.684090909090909, percentege: 16.99846471670637
 RSME: 3.111224181747582, percentage: 19.703518349574903
