## **فراخوانی کتابخانه های مورد نیاز**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
import seaborn as sns
%matplotlib inline

### خواندن اطلاعات از فایل ###

In [None]:
# Read the raw table from the CSV file in the working directory.
csv_path = "used_cars.csv"
data = pd.read_csv(csv_path)
data

**تبدیل اطلاعات به صورت دیتا فریم**

In [None]:
# Wrap the loaded table in a DataFrame and preview its first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

### ارزیابی کلی داده‌ها ###

In [None]:
# Structural overview of the table: sample rows, shape, column names and dtypes.
print(df.head())
print(df.shape)
print(df.columns)
print(df.dtypes)

# Duplicate rows: the total count first, then the per-row boolean flags.
print(df.duplicated().sum())
print(df.duplicated())

print(df.nunique())
# info() writes its summary to stdout itself and returns None, so it is called
# directly; printing the bare attribute would only show the bound-method repr.
df.info()
print(df.describe())
print(df.isnull().sum())
print(df.corr(numeric_only=True))

# Frequency table of each inspected column, printed once per column.
for column in ['fuel', 'engine', 'max_power']:
    print(df[column].value_counts())

### حذف رشته ها از مقادیر عددی ###

In [None]:
# Columns that are cleaned of non-numeric text before being converted to numbers.
columns = ['mileage', 'engine', 'max_power']
# Remove every character that is neither a digit nor a decimal point. The '.'
# must survive: deleting it would fuse the integer and fractional digits of a
# value (for example '23.4' would become '234').
df[columns] = df[columns].replace(r'[^0-9.]+', '', regex=True)
df[columns]

### تبدیل داده های رشته ای و چند حالته به داده های عددی(LABEL ENCODING) ###

In [None]:
def labelEncoder(data, columns):
    """Replace each listed column of ``data`` with integer labels, in place.

    One LabelEncoder instance is refitted on every column in turn; the codes
    it returns are shifted up by one and written back into ``data``.

    Args:
        data: Table holding the columns to encode; it is modified in place.
        columns: Iterable of column names to encode.

    Returns:
        The same ``data`` object that was passed in.
    """
    encoder = LabelEncoder()
    for name in columns:
        codes = encoder.fit_transform(data[name])
        data[name] = codes + 1
    return data


# Encode the listed columns as integer labels.
categorical_columns = ['fuel', 'seller_type', 'transmission', 'owner']
df = labelEncoder(df, categorical_columns)
df


### حذف ستونهای بدون استفاده ###

In [None]:
# Drop the columns that are not used further, one at a time.
for unused in ('name', 'torque'):
    df = df.drop(columns=[unused])
df

### تبدیل داده ها به تایپ عددی ###

In [None]:
# Convert the cleaned text columns to a numeric dtype.
for numeric_column in ('mileage', 'engine', 'max_power'):
    df[numeric_column] = pd.to_numeric(df[numeric_column])

### پر کردن مقادیر خالی (null) ستون‌ها ###

In [None]:
# Fill the missing values of each column with that column's mean.
# The filled Series is assigned back to the frame on purpose: calling
# fillna(..., inplace=True) on df[col] is chained assignment, which pandas
# warns about and which leaves df untouched once Copy-on-Write is active.
for column in ['mileage', 'engine', 'max_power', 'seats']:
    df[column] = df[column].fillna(df[column].mean())
df['max_power']

### نرمال سازی داده ها ###

In [None]:
def min_max_scaler(data, columns):
    """Scale ``data`` with MinMaxScaler and return the result as a DataFrame.

    When ``data`` is a DataFrame that contains every name in ``columns``, the
    columns are selected by name before scaling, so each label in the result
    is guaranteed to sit on its own values regardless of the column order in
    ``data``. Otherwise the input is scaled as given and ``columns`` is applied
    positionally as the new labels.

    Args:
        data: DataFrame or 2-D array-like of numeric values.
        columns: Column names for the returned DataFrame, in output order.

    Returns:
        A new DataFrame labelled with ``columns``. If ``data`` is a DataFrame
        its row index is kept so rows stay aligned with the input.

    Raises:
        ValueError: If the number of scaled columns does not match
            ``len(columns)`` (raised by the DataFrame constructor).
    """
    columns = list(columns)
    index = None
    if isinstance(data, pd.DataFrame):
        index = data.index
        # Select by name when possible so labels cannot drift from their data.
        if all(name in data.columns for name in columns):
            data = data[columns]
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)
    return pd.DataFrame(scaled, columns=columns, index=index)


# Rescale the table and label the result with the listed column names.
scaled_columns = [
    'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
    'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats',
]
df = min_max_scaler(df, scaled_columns)
df

### نمایش جدول همبستگی برای نشان دادن میزان همبستگی بین ستونها ###

In [None]:
# Annotated heatmap of the pairwise column correlations. The mask is built
# from the upper triangle of the correlation matrix and passed to seaborn.
corr = df.corr()
mask = np.triu(corr)
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    square=True,
)
plt.show()

### ساخت، آموزش و آزمون مدل به روش رگرسیون خطی تک متغیره ###

In [None]:
# Single-feature linear regression: selling_price modelled from engine alone.
X = df[['engine']]
y = df['selling_price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics; r2_1 is reused by the model-comparison chart.
mse_1 = mean_squared_error(y_test, y_pred)
r2_1 = r2_score(y_test, y_pred)

print(f"Mean Squared Error:{mse_1}")
print(f"R_squared:{r2_1}")

### ساخت، آموزش و آزمون مدل به روش رگرسیون خطی چند متغیره ###

In [None]:
# Multi-feature linear regression on selling_price.
# Each predictor is listed exactly once: repeating a name would select that
# column twice and hand the model two identical, perfectly collinear inputs.
features = [
    'engine', 'year', 'km_driven', 'fuel', 'seller_type',
    'transmission', 'owner', 'mileage', 'max_power', 'seats',
]
X = df[features]
y = df[['selling_price']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics; r2_2 is reused by the model-comparison chart.
r2_2 = r2_score(y_test, y_pred)
mse_2 = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error:{mse_2}")
print(f"R_squared:{r2_2}")

### ساخت، آموزش و آزمون مدل به روش رگرسیون چند جمله ای با درجه 2 ### 

In [None]:
# Degree-2 polynomial regression on selling_price.
# Each predictor is listed exactly once: a repeated name would select the
# column twice and multiply the number of redundant polynomial terms.
features = [
    'engine', 'year', 'km_driven', 'fuel', 'seller_type',
    'transmission', 'owner', 'mileage', 'max_power', 'seats',
]
X = df[features]
y = df[['selling_price']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
model = LinearRegression()
pf = PolynomialFeatures(degree=2)
# Fit the expansion on the training split only, then apply that same fitted
# transformer to the test split.
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)
model.fit(X_train_poly, y_train)
# Score on the held-out split; RM2 is reused by the model-comparison chart.
RM2 = model.score(X_test_poly, y_test)
print(RM2)

### ساخت، آموزش و آزمون مدل به روش رگرسیون چند جمله ای با درجه 3 ###

In [None]:
# Degree-3 polynomial regression on selling_price.
# Each predictor is listed exactly once: a repeated name would select the
# column twice and multiply the number of redundant polynomial terms.
features = [
    'engine', 'year', 'km_driven', 'fuel', 'seller_type',
    'transmission', 'owner', 'mileage', 'max_power', 'seats',
]
X = df[features]
y = df[['selling_price']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
model = LinearRegression()
pf = PolynomialFeatures(degree=3)
# Fit the expansion on the training split only, then apply that same fitted
# transformer to the test split.
X_train_poly = pf.fit_transform(X_train)
X_test_poly = pf.transform(X_test)
model.fit(X_train_poly, y_train)
# Score on the held-out split; RM3 is reused by the model-comparison chart.
RM3 = model.score(X_test_poly, y_test)
print(RM3)

### مدلسازی به روش pipeline  ###

In [None]:
# Pipeline model: standardise, project onto 3 principal components, then fit a
# linear regression on selling_price.
# Each predictor is listed exactly once: a repeated name would select the
# column twice, so that column would enter the scaler and PCA as two inputs.
features = [
    'engine', 'year', 'km_driven', 'fuel', 'seller_type',
    'transmission', 'owner', 'mileage', 'max_power', 'seats',
]
X = df[features]
y = df[['selling_price']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=3)),
        ('regressor', LinearRegression()),
    ])
pipeline.fit(X_train, y_train)
# Score on the held-out split; PIP is reused by the model-comparison chart.
PIP = pipeline.score(X_test, y_test)
PIP

### مدلسازی به روش ridge ###

In [None]:
# Ridge regression on the current train/test split.
ridge = Ridge(alpha=0.2).fit(X_train, y_train)

# Report one coefficient per training column, read from the first row of coef_.
for col, weight in zip(X_train.columns, ridge.coef_[0]):
    print(f"Ridge model coefficients for {col}=>\t\t{weight}")

# Training-split score first, then the held-out score kept as RIDGE.
train_score = ridge.score(X_train, y_train)
print(train_score)
RIDGE = ridge.score(X_test, y_test)
print(RIDGE)


### مدلسازی به روش lasso ###

In [None]:
# Lasso regression on the current train/test split.
lasso = Lasso(alpha=0.00001).fit(X_train, y_train)

# Report one coefficient per training column.
for col, weight in zip(X_train.columns, lasso.coef_):
    print(f"Lasso model coefficients for {col}=>\t\t{weight}")

# Training-split score first, then the held-out score kept as LASSO.
train_score = lasso.score(X_train, y_train)
print(train_score)
LASSO = lasso.score(X_test, y_test)
print(LASSO)

### مدلسازی به روش  elasticNet ###

In [None]:
# ElasticNet regression on the current train/test split.
elastic = ElasticNet(alpha=0.0001, l1_ratio=0.0001)
elastic.fit(X_train, y_train)

# Training-split score first, then the held-out score kept as ELASTICNET.
train_score = elastic.score(X_train, y_train)
print(train_score)
ELASTICNET = elastic.score(X_test, y_test)
print(ELASTICNET)

### پیدا کردن بهترین hyper parameter  به روش gridSearch ###

In [None]:
# Exhaustive search over the ElasticNet alpha / l1_ratio grid, scored by R2
# with 5-fold cross-validation. `model` and `params` stay module-level because
# the randomized and Bayes search cells read them.
model = ElasticNet()
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}

gridSearch = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    scoring='r2',
)
gridSearch.fit(X_train, y_train)

best_grid_params = gridSearch.best_params_
best_grid_score = gridSearch.best_score_
print(f"Best parameters:{best_grid_params}")
print(f"Best R2 score:{best_grid_score}")

### پیدا کردن بهترین hyper parameter  به روش randomized search ###

In [None]:
# Randomized search: evaluates 30 sampled alpha / l1_ratio combinations with
# 5-fold cross-validation, scored by R2.
# random_state is fixed so the sampled combinations, and therefore the
# reported best parameters, are the same on every run.
randomsearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring='r2',
    cv=5,
    n_iter=30,
    verbose=3,
    random_state=42,
)
randomsearch.fit(X_train, y_train)
print(f"Best parameters:{randomsearch.best_params_}")
print(f"Best R2 score:{randomsearch.best_score_}")

### پیدا کردن بهترین hyper parameter به روش Bayesian search ###

In [None]:
# Bayesian search: 20 iterations over the alpha / l1_ratio search space with
# 5-fold cross-validation, scored by R2.
# random_state is fixed so repeated runs explore the same candidates and
# report the same best parameters.
opt = BayesSearchCV(
    estimator=model,
    search_spaces=params,
    n_iter=20,
    cv=5,
    scoring='r2',
    verbose=3,
    random_state=42,
)
opt.fit(X_train, y_train)
print(f"Best parameters:{opt.best_params_}")
print(f"Best R2 score:{opt.best_score_}")

### پیدا کردن بهترین hyper parameter  به روش cross validation ###

In [None]:
# Manual hyper-parameter search: build every alpha / l1_ratio combination,
# score each ElasticNet with 5-fold cross-validated R2 and keep the best one.
alphas = [1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
l1_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
params_dict = [
    {'alpha': alpha, 'l1_ratio': l1_ratio}
    for alpha in alphas
    for l1_ratio in l1_ratios
]

# Loop variables are named differently from the module-level `model` and
# `params` so it is clear that those are not being reused here.
models = [
    ElasticNet(alpha=candidate['alpha'], l1_ratio=candidate['l1_ratio'])
    for candidate in params_dict
]
CV_score = [
    cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2').mean()
    for estimator in models
]
best_model_index = np.argmax(CV_score)
best_model_params = params_dict[best_model_index]
# Report the winning parameter combination itself, not its position in the list.
print("Best parameter: ", best_model_params)
print("Best R2 score: ", CV_score[best_model_index])

### مقایسه نتایج بدست آمده از مدلهای مختلف از طریق رسم نمودار ###

In [None]:
# Bar chart comparing the score stored for each model, in the order the
# models were built.
model_scores = {
    'linear regretion1': r2_1,
    'linear regretion N': r2_2,
    'linear regretion multi2': RM2,
    'linear regretion multi3': RM3,
    'pipeline': PIP,
    'Ridg': RIDGE,
    'Lasso': LASSO,
    'ElasticNet': ELASTICNET,
}
A = list(model_scores.keys())
B = list(model_scores.values())

plt.figure(figsize=(10, 8), tight_layout=True)
plt.bar(A, B)
# Rotate the x tick labels so the model names do not overlap.
plt.tick_params(axis='x', rotation=55)
plt.title('COMPAIRE OF MODELS')
plt.xlabel('MODELS')
plt.ylabel('SCORES')
plt.show()