# SGD (Stochastic Gradient Descent) Model

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings
# Silence every warning for the rest of the session: the call passes only the
# action, so the filter applies to the default category (Warning) and all
# modules.
# NOTE(review): this also hides any convergence or deprecation warnings that
# later cells might raise — confirm that is intended.
filterwarnings('ignore')

## Importing and Cleaning Data

In [None]:
# Load the raw CSV, discard the 'origin' and 'name' columns, and force
# 'horsepower' to a numeric dtype. Entries that cannot be parsed become NaN
# (errors='coerce') so that they can be imputed afterwards.
#
# Alternatives tried for filling those NaNs directly with pandas:
#   data["horsepower"].fillna(data["horsepower"].mean())
#   data["horsepower"].fillna(data["horsepower"].median())
#   data["horsepower"].fillna(data["horsepower"].mode())
data = (
    pd.read_csv("data.csv")
    .drop(columns=["origin", "name"])
    .assign(
        horsepower=lambda frame: pd.to_numeric(frame["horsepower"], errors="coerce")
    )
)

## Using Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer
# Replace the NaNs in 'horsepower' with the column's most frequent value.
# (Swapping strategy to 'mean' gives the mean-imputation variant.)
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# The imputer works on 2-D input, hence the double brackets that select a
# one-column frame rather than a Series.
imp_mode.fit(data[["horsepower"]])
data["horsepower"] = imp_mode.transform(data[["horsepower"]])
data

## Profiling Data

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report object for `data`. The report is only bound to
# `prof` here; nothing is written to disk unless the to_file line below is
# re-enabled.
# NOTE(review): the pandas_profiling package has been renamed ydata-profiling
# upstream — confirm this import still resolves in the target environment.
prof = ProfileReport(data)
# prof.to_file(output_file='output.html')

In [None]:
# Split the frame positionally: every column except the last forms the feature
# matrix `x`, and the last column is the regression target `y`. Both are pulled
# out as plain arrays.
# NOTE(review): this relies on the target being the final column of data.csv —
# confirm the column order.
x, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

## Splitting the Dataset into the Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.2,
)

## Feature Scaling

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Standardise feature columns 0 through 5; any further columns are passed
# through untouched.
ct = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), list(range(6)))],
    remainder='passthrough',
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
# Caution: this cell overwrites x_train / x_test, so running it twice scales
# data that has already been scaled.
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

## Training the Model

In [None]:
from sklearn.linear_model import SGDRegressor
# SGD shuffles the training rows with a random number generator, so an
# unseeded estimator yields slightly different coefficients on every run.
# Pinning random_state makes the fitted model, and therefore every metric
# reported further down, reproducible under Restart & Run All. Without it the
# small differences between the imputation strategies compared at the end of
# the notebook cannot be told apart from run-to-run noise.
# The value 0 matches the seed already used for the train/test split.
regressor = SGDRegressor(random_state=0)
regressor.fit(x_train, y_train)

## Predicting the test set results

In [None]:
# Run the fitted regressor on the scaled test features; the resulting
# predictions feed the evaluation metrics and the residual plot.
y_pred = regressor.predict(X=x_test)

## Evaluating the Model Performance

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# 10-fold cross-validation on the training split, expressed in percent. No
# scoring argument is given, so the estimator's default scorer is used (R^2
# for scikit-learn regressors).
cv_score = 100 * cross_val_score(estimator=regressor, X=x_train, y=y_train, cv=10)
# "Accuracy" below is the R^2 score on the held-out test set, as a percentage.
print(f"Accuracy: {round(r2_score(y_test, y_pred) * 100, 2)}%")
print(f"CV mean score: {round(cv_score.mean(), 2)}%")
# Error metrics are reported in the units of the target (squared for MSE).
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))

## Residual Plot

In [None]:
# Residual = actual minus predicted; points scattered evenly around the zero
# line indicate no systematic over- or under-prediction.
residuals = y_test - y_pred
# Marker size and colour are purely decorative (they encode no data), so they
# are drawn from a dedicated, seeded generator: the figure is then identical on
# every run and the global NumPy random state is left untouched.
plot_rng = np.random.default_rng(0)
# The lower bound of 20 keeps every marker visible; a size of 0 would draw an
# invisible point and silently hide a residual.
sizes = plot_rng.integers(20, 300, size=len(y_pred))
colors = plot_rng.integers(1000, size=len(y_pred))
plt.scatter(y_pred, residuals, c=colors, cmap='Pastel1',edgecolors='gray', s=sizes)
# Reference line at zero error.
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.show()

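- Using mean imputation for `horsepower`, we get accuracy (test-set R² score) = 81.22%
- Using median imputation for `horsepower`, we get accuracy = 81.39%
- Using mode (most frequent value) imputation for `horsepower`, we get accuracy = 81.07%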