# SGD (Stochastic Gradient Descent) Model

## Importing Libraries

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

## Importing and Cleaning Data

In [83]:
# Load the raw table and discard the two columns that are not used as features.
data = pd.read_csv("data.csv")
data = data.drop(columns=['origin', 'name'])

# errors='coerce' turns any horsepower entry that cannot be parsed as a number into NaN.
data["horsepower"] = pd.to_numeric(data["horsepower"], errors='coerce')

# Fill the NaNs with the column median and assign the result back.
# The previous `data["horsepower"].fillna(..., inplace=True)` was a chained in-place
# call on a column selection; under pandas Copy-on-Write it operates on a temporary
# and leaves `data` unchanged, so the NaNs would survive into model training.
data["horsepower"] = data["horsepower"].fillna(data["horsepower"].median())
data

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,Kilometer_per_liter
0,8,307.0,130.0,3504,12.0,1970,7.652587
1,8,350.0,165.0,3693,11.5,1970,6.377156
2,8,318.0,150.0,3436,11.0,1970,7.652587
3,8,304.0,150.0,3433,12.0,1970,6.802299
4,8,302.0,140.0,3449,10.5,1970,7.227443
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,1982,11.478880
394,4,97.0,52.0,2130,24.6,1982,18.706323
395,4,135.0,84.0,2295,11.6,1982,13.604599
396,4,120.0,79.0,2625,18.6,1982,11.904024


## Profiling Data

In [84]:
from pandas_profiling import ProfileReport

# Build a profiling report for the cleaned frame and write it out as HTML.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling upstream;
# consider migrating when the environment is next updated.
prof = ProfileReport(data)
prof.to_file('output.html')

Summarize dataset: 100%|██████████| 52/52 [00:02<00:00, 24.24it/s, Completed]                                       
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 850.25it/s]


In [85]:
# Features are every column except the last; the target is the last column.
# Both are extracted as NumPy arrays.
x, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

## Splitting the dataset into the Training set and Test Set

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=0 pins the shuffle
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Standardise the columns at positions 0 through 5; any remaining column is
# passed through unchanged.
ct = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), list(range(6)))],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply the already-fitted
# transformer to the test split.
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

## Training the Model

In [88]:
from sklearn.linear_model import SGDRegressor

# SGD shuffles the training data between epochs, so an unseeded estimator gives
# different coefficients (and different metrics below) on every run. Pin the
# seed, matching the random_state=0 already used for the train/test split.
regressor = SGDRegressor(random_state=0)
regressor.fit(x_train, y_train)

## Predicting the test set results

In [89]:
# Predict the target for the held-out test features with the fitted regressor.
y_pred = regressor.predict(x_test)

## Evaluating the Model Performance

In [97]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 10-fold cross-validation on the training split. With no `scoring` argument,
# cross_val_score uses the estimator's own score method, which for a regressor
# is R^2; the fold scores are scaled to percentages.
cv_score = cross_val_score(regressor, x_train, y_train, cv=10) * 100

# The first two figures are R^2 values, not classification accuracy, so they
# are labelled as such.
print('R^2 score (test set): {}%'.format(round((r2_score(y_test, y_pred)*100),2)))
print("CV mean R^2 score: {}%".format(round(cv_score.mean(),2)))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))

Accuracy: 81.36%
CV mean score: 79.2%
Mean Squared Error: 2.1380560455133306
Mean Absolute Error: 1.169113058406857


## Residual Plot

In [91]:
# Residual = observed target minus predicted target on the test split.
residuals = y_test - y_pred

# Marker sizes and colours are purely decorative. They are drawn from a local
# seeded generator so the figure is identical on every run, and sizes start at
# 20 so no residual is rendered with zero area (the previous randint(300) could
# return 0 and hide a point).
rng = np.random.default_rng(0)
sizes = rng.integers(20, 300, size=len(y_pred))
colors = rng.integers(0, 1000, size=len(y_pred))

plt.scatter(y_pred, residuals, c=colors, cmap='Pastel1', edgecolors='gray', s=sizes)
# Zero line: points above it are under-predicted, points below are over-predicted.
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()