In [None]:
'''
Regularization is a process that helps keep our ML model from overfitting,
while taking care not to push it into underfitting.

Regularization is often required when we perform polynomial regression: the more complex the regression we create,
the greater the chance that the model overfits. When that happens, accuracy during training looks good but accuracy
during testing is compromised. A machine learning model cannot give 100% accuracy, but its predictions should be
close to the actual values, which is why we must take care while training our model.

Regularization is a process where we add some bias (a penalty term) at training time so that the model does not overfit.
In short, whenever we do regression analysis, we can use regularization to tune our best-fit line.

yhat = theta0 + theta1*x + e

The penalty can take two forms:
1) squared  (L2, used by Ridge)
2) absolute (L1, used by Lasso)


'''

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Load the automobile dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("data/Automobile_data.csv")

In [None]:
# Preview the first rows only instead of dumping the whole frame
# (consistent with the .head() previews used later in this notebook).
df.head()

In [None]:
# Summarize column dtypes and non-null counts to spot columns that need cleaning.
df.info()

In [None]:
#data cleaning process 
#missing value treatment 
# Missing entries are encoded as the string "?"; turn them into NaN so the
# column can later be cast to float.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df[col]: that is a chained in-place update on a temporary Series, which raises
# a FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df["normalized-losses"]=df["normalized-losses"].replace("?",np.nan)

In [None]:
# Inspect the column after the "?" replacement.
df["normalized-losses"]

In [None]:
# Cast normalized-losses from object to float (float64) so numeric
# operations such as mean() work on it.
df=df.astype({"normalized-losses":"float64"})

In [None]:
# Confirm the dtype of the column after the cast.
df["normalized-losses"].dtype

In [None]:
# Impute missing normalized-losses with the column mean
# (Series.mean() skips NaN by default).
losses_mean=df["normalized-losses"].mean()
# Assign back rather than fillna(..., inplace=True) on df[col]; the chained
# in-place form warns in pandas 2.x and does not update df under Copy-on-Write.
df["normalized-losses"]=df["normalized-losses"].fillna(losses_mean)

In [None]:
# Show the mean value that was used to fill the missing entries.
losses_mean

In [None]:
# Inspect the column again after mean imputation.
df["normalized-losses"]

In [None]:
# Frequency of each raw horsepower value; a "?" entry in this list would
# indicate missing data that needs the same treatment as normalized-losses.
df["horsepower"].value_counts()

In [None]:
# Clean horsepower: "?" -> NaN, cast to float, then fill NaN with the column mean.
# Every step is assigned explicitly; the original replace/fillna(..., inplace=True)
# calls on df[col] were chained in-place updates, which warn in pandas 2.x and do
# not modify df under Copy-on-Write.
horsepower=df["horsepower"].replace("?",np.nan).astype(float)
# Variable name (sic) kept unchanged in case other cells refer to it.
hoursepower_mean=horsepower.mean()
df["horsepower"]=horsepower.fillna(hoursepower_mean)

In [None]:
# Confirm the dtype of horsepower after cleaning.
df["horsepower"].dtype

In [None]:
# Re-check the value frequencies after cleaning.
df["horsepower"].value_counts()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps above.
df.info()

In [None]:
# Split the frame into numeric and non-numeric (categorical) columns.
# - "number" covers every numeric dtype, not only int64/float64, and
#   exclude="number" guarantees each column lands in exactly one of the two
#   frames (a hard-coded dtype list silently drops anything it does not name).
# - .copy() makes both frames independent of df, so writing encoded values
#   into df_cat later does not raise SettingWithCopyWarning.
df_num=df.select_dtypes(include="number").copy()
df_cat=df.select_dtypes(exclude="number").copy()

In [None]:
# Columns selected as numeric.
df_num.info()

In [None]:
# Columns selected as categorical.
# NOTE(review): any numeric-looking column listed here (e.g. one still holding
# "?" strings) will be label-encoded below rather than treated as a number - verify.
df_cat.info()

In [None]:
#A machine learning model cannot process alphanumeric (text) values directly,
#so we have to convert our categorical values into numeric values.
#The process of converting categorical values into numeric codes is known as
#label encoding.
from sklearn.preprocessing import LabelEncoder

In [None]:
# Example categorical column: frequency of each value of "make" before encoding.
df_cat["make"].value_counts()

In [None]:
# Label-encode every categorical column into integer codes.
# - A fresh LabelEncoder is fitted per column and kept in `label_encoders`
#   so codes can be mapped back later with inverse_transform.
# - The encoded frame is built new instead of assigning column-by-column into
#   df_cat, which originates from df.select_dtypes(...) and can raise
#   SettingWithCopyWarning on in-place column assignment.
# NOTE: the integer codes follow sorted category order and carry no real
# ordinal meaning; a linear model will still treat them as ordered numbers.
label_encoders={}
encoded_cols={}
for col in df_cat.columns:
    le=LabelEncoder()
    encoded_cols[col]=le.fit_transform(df_cat[col])
    label_encoders[col]=le
df_cat=pd.DataFrame(encoded_cols,index=df_cat.index)

In [None]:
# Preview the encoded categorical columns.
df_cat.head()

In [None]:
# Put the numeric and the encoded categorical columns side by side
# (column-wise concatenation, aligned on the row index).
df_new=pd.concat(objs=[df_num,df_cat],axis="columns")

In [None]:
# Preview the combined, fully numeric frame used for modelling.
df_new.head()

# Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Target: price. Predictors: every other column.
# NOTE(review): confirm in df.info() that "price" is numeric; if it had been
# read as text it would have been label-encoded with the categorical columns.
y=df_new["price"]
x=df_new.drop(columns=["price"])

In [None]:
lr=LinearRegression()
# Hold out 30% of the rows for testing. The previous test_size=0.03 kept only
# about 3% of the rows, far too few for a stable test score, and the alpha
# comparisons further down are judged on that score.
# random_state is fixed so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.3)


In [None]:
# Fit the baseline (unregularized) linear regression on the training split.
lr.fit(x_train,y_train)

In [None]:
# Report the baseline model's score on the data it was trained on and on the
# held-out data; a large gap between the two suggests overfitting.
for split_name,features,target in (("Training",x_train,y_train),("Testing",x_test,y_test)):
    print(split_name+" data score => ",lr.score(features,target))

In [None]:
# Baseline coefficients, one per column of x_train (in column order).
lr.coef_

In [None]:
from sklearn.linear_model import Lasso # L1 penalty: adds alpha * sum(|coefficient|) to the loss
from sklearn.linear_model import Ridge # L2 penalty: adds alpha * sum(coefficient^2) to the loss

In [None]:
# Ridge (L2): minimizes the squared error plus alpha * sum(theta_j^2).
# alpha=10 is the regularization strength; larger values shrink the
# coefficients more strongly towards zero.
r=Ridge(alpha=10)
r.fit(X=x_train,y=y_train)

In [None]:
# Ridge coefficients, one per column of x_train (in column order).
r.coef_

In [None]:
# Feature names, in the same order as the fitted coefficient arrays.
predictor=x_train.columns

In [None]:
# Show the feature names.
predictor 

In [None]:
# Bar chart of the Ridge coefficients, labelled by feature name.
# r.coef_ is ordered like x_train.columns, so the index must be `predictor`
# as-is. Sorting the names (predictor.sort_values()) reorders only the labels
# and attaches each coefficient to the wrong feature.
coef=pd.Series(r.coef_,index=predictor)
coef.plot(kind="bar",title="model coeffiecent")

In [None]:
# Lasso (L1): minimizes the squared error plus alpha * sum(|theta_j|).
# With a high alpha, the absolute-value penalty can drive some coefficients
# exactly to zero, which acts as automatic feature selection.

l=Lasso(alpha=1000)
l.fit(X=x_train,y=y_train)
l.coef_

In [None]:
# Feature names, in the same order as the fitted coefficient arrays.
predictor=x_train.columns

In [None]:
# Show the feature names.
predictor

In [None]:
# Bar chart of the Lasso coefficients, labelled by feature name.
# Two fixes compared with the copy-pasted Ridge cell:
# - plot l.coef_ (the Lasso model fitted above), not r.coef_ (Ridge);
# - index by `predictor` unsorted, because coef_ follows x_train.columns order
#   and sorting only the names would mislabel every bar.
coef=pd.Series(l.coef_,index=predictor)
coef.plot(kind="bar",title="Model coeffiecent")

In [None]:
# How do I find which hyperparameter (alpha) value gives a good result
# while testing my model?
'''
We can write a loop that passes different hyperparameter values and check
which hyperparameter value gives a good test score.

'''


In [None]:
# Sweep alpha = 0..49 for Ridge and print the test and train score for each.
# alpha=0 means no penalty (ordinary least squares), so it serves as a baseline row.
# `r` is rebound on every iteration and ends as the model for the last alpha.
for alpha in range(50):
    r=Ridge(alpha=alpha).fit(x_train,y_train)
    test_score=r.score(x_test,y_test)
    train_score=r.score(x_train,y_train)
    print(alpha,":","Test score :",test_score,"Train score:",train_score)

In [None]:
# Sweep larger alphas (200, 210, ..., 490) for Ridge and print the test and
# train score for each. `r` ends as the model for the last alpha.
for alpha in range(200,500,10):
    r=Ridge(alpha=alpha).fit(x_train,y_train)
    test_score=r.score(x_test,y_test)
    train_score=r.score(x_train,y_train)
    print(alpha,":","Test score :",test_score,"Train score:",train_score)

In [None]:
# Preview the training features (first rows only instead of the whole frame).
x_train.head()

In [None]:
# Training target values (price).
y_train

# Cross_val_score k-fold 

In [None]:
# Unfitted estimators to compare with k-fold cross-validation:
# Lasso with alpha=1000 and Ridge with alpha=2.
lasso,ridge=Lasso(alpha=1000),Ridge(alpha=2)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Preview the full feature matrix (first rows only instead of the whole frame).
x.head()

In [None]:
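#Ridge -> L2 penalty (squared coefficients)
#Lasso -> L1 penalty (absolute coefficients)
#Note: the variables below are named the other way round - l1_cross holds the Ridge scores and l2_cross the Lasso scores.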

In [None]:
# Full target column (price).
y

In [None]:
# 4-fold cross-validation of the Ridge model on the full data (x, y), scored
# with the estimator's default scorer.
# NOTE: despite the "l1" in the name, these are the Ridge (L2) scores.
l1_cross=cross_val_score(ridge,x,y,cv=4)

In [None]:
# Per-fold scores for Ridge.
l1_cross

In [None]:
# Average the per-fold Ridge scores into a single number.
final_l1_cross_score=l1_cross.mean()

In [None]:
# Mean cross-validated score for Ridge.
final_l1_cross_score

In [None]:
# 4-fold cross-validation of the Lasso model on the full data (x, y), scored
# with the estimator's default scorer.
# NOTE: despite the "l2" in the name, these are the Lasso (L1) scores.
l2_cross=cross_val_score(lasso,x,y,cv=4)

In [None]:
# Per-fold scores for Lasso.
l2_cross

In [None]:
# Average the per-fold Lasso scores into a single number.
final_l2_cross_score=l2_cross.mean()

In [None]:
# Mean cross-validated score for Lasso.
final_l2_cross_score

In [None]:
''' So we can conclude that Lasso gives a better score than Ridge here.
After 4-fold cross-validation (4 different train/validation splits of the data),
the Lasso model's average cross-validated score is about
64% '''