**@author:** Dogan Can Demirbilek

**@brief:** Just trying different ML algorithms to see the results, without deep analysis or preprocessing.

* Clustering can be used, and a different model can be tried for each cluster, since the dataset contains luxury, middle-class and cheap cars all together
* Models can be tuned by using hyperparameter optimization methods
* Deeper cleaning of data can be performed
* Different models like deep learning models can be tried

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.stats import norm, skew
from scipy import stats

In [None]:
# Load the spreadsheet; its first column becomes the DataFrame index.
data = pd.read_excel('../data/data_merged_filtered.xlsx',index_col=0)
# Preview the first rows.
data.head()

In [None]:
# Show the column names available in the loaded data.
data.columns

In [None]:
# Columns removed from the frame before modelling
# ("silinecekler" is Turkish for "the ones to be deleted").
silinecekler = [
    'il', 'model', 'motor_hacmi_cc', 'cekis', 'garanti', 'plaka_uyruk',
    'kimden', 'takas', 'durumu', 'seri', 'renk', 'submodel1', 'submodel2',
]
# Drop them in one call, mutating the existing frame in place.
data.drop(columns=silinecekler, inplace=True)

In [None]:
# Print dtypes and non-null counts for the remaining columns.
data.info()

In [None]:
# Render floats with two decimals in pandas output from here on.
pd.options.display.float_format = "{:.2f}".format
# Summary statistics (pandas' default `describe()` output).
data.describe()

In [None]:
# Plot the raw price ('fiyat') distribution against a fitted normal curve,
# followed by a QQ-plot.
# NOTE(review): seaborn has deprecated `distplot`; it is kept here so the
# figure stays the same. Consider migrating to `histplot`/`displot`.
plt.figure(figsize=(12,8))
sns.distplot(data['fiyat'] , fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(data['fiyat'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: '\m' and '\s' are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning on recent Python versions. The
# resulting text (and the rendered LaTeX) is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Price distribution')

# Get also the QQ-plot
fig = plt.figure(figsize=(12,8))
res = stats.probplot(data['fiyat'], plot=plt)
plt.show()

In [None]:
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
Log_transformed_price = np.log1p(data["fiyat"])

# Check the new distribution
# NOTE(review): seaborn has deprecated `distplot`; it is kept here so the
# figure stays the same. Consider migrating to `histplot`/`displot`.
plt.figure(figsize =(12,8))
sns.distplot(Log_transformed_price , fit=norm)

# Get the fitted parameters used by the function.
# Fit on the log-transformed series, i.e. the same data that is plotted above;
# fitting on the raw prices would report mu/sigma of a different distribution.
(mu, sigma) = norm.fit(Log_transformed_price)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: '\m' and '\s' are not valid Python escape sequences, so a plain
# literal triggers an invalid-escape warning on recent Python versions.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Price distribution')

# Get also the QQ-plot
fig = plt.figure(figsize=(12,8))
res = stats.probplot(Log_transformed_price, plot=plt)
plt.show()

In [None]:
# Log-transform the numeric columns whose skewness exceeds 0.75.

# Every column whose dtype is not "object" is treated as numeric.
numeric_feats = data.columns[(data.dtypes != "object").values]

# Per-column skewness, ignoring missing values.
skewness = data[numeric_feats].apply(lambda col: skew(col.dropna()))

# Names of the columns above the skewness threshold.
skewed_feats = skewness[skewness > 0.75].index

# Replace those columns with log(1 + x).
data[skewed_feats] = np.log1p(data[skewed_feats])

In [None]:
# One-hot encode the categorical columns with pandas' default settings.
data = pd.get_dummies(data)

In [None]:
# Preview the frame after encoding.
data.head()

In [None]:
# Target: the price column.
y = data['fiyat']

# Inputs: every remaining column, in the frame's column order.
features = [column for column in data.columns if column != 'fiyat']
X = data[features]

# Preview the design matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, ElasticNet , Lasso
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_squared_error
# Signature: sklearn.metrics.mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')

def mse_cv(y_pred_model):
    """Return the mean squared error of test-set predictions after np.exp.

    Both the module-level ``y_test`` and ``y_pred_model`` are passed through
    ``np.exp`` before being compared.

    NOTE(review): if the target was transformed with ``np.log1p`` earlier in
    the notebook, the exact inverse is ``np.expm1``. Using ``np.exp`` shifts
    both sides by the same +1, which cancels in the squared difference, so the
    returned value is not affected.

    Parameters
    ----------
    y_pred_model : array-like
        Predictions for the rows of ``X_test``, on the same scale as ``y_test``.

    Returns
    -------
    The value returned by ``sklearn.metrics.mean_squared_error``.
    """
    actual = np.exp(y_test)
    predicted = np.exp(y_pred_model)
    return mean_squared_error(y_true=actual, y_pred=predicted)

In [None]:
# Fit an XGBoost regressor, constructed with no arguments, on the training split.
xgboost_Model = XGBRegressor()
xgboost_Model.fit(X_train,y_train)

In [None]:
# Evaluate the XGBoost model on the held-out split.
y_pred_xgboost = xgboost_Model.predict(X_test)
print('Mean Squared Error of this model is : ',mse_cv(y_pred_xgboost))
# R2 is computed on the same held-out rows as the MSE above. Scoring on the
# training rows would measure fit to data the model has already seen.
# Note: score() compares against y_test as stored, whereas mse_cv applies
# np.exp to both sides first, so the two metrics are on different scales.
print('R2 of this model is : ', xgboost_Model.score(X_test,y_test))

In [None]:
# Fit a random forest regressor, constructed with no arguments, on the training split.
# NOTE(review): no random_state is passed, so results may differ between runs.
RandomForest_Model = RandomForestRegressor()
RandomForest_Model.fit(X_train,y_train)

In [None]:
# Evaluate the random forest on the held-out split.
y_pred_randomforest = RandomForest_Model.predict(X_test)
print('Mean Squared Error of this model is : ',mse_cv(y_pred_randomforest))
# R2 is computed on the same held-out rows as the MSE above. Scoring on the
# training rows would measure fit to data the model has already seen.
# Note: score() compares against y_test as stored, whereas mse_cv applies
# np.exp to both sides first, so the two metrics are on different scales.
print('R2 of this model is : ', RandomForest_Model.score(X_test,y_test))

In [None]:
# Candidate regularisation strengths for ridge regression.
alphas = [
    0.0000005, 0.00001, 0.0001, 0.05, 0.1, 0.3, 1,
    3, 5, 10, 15, 30, 50, 75,
]

# Fit one model per candidate and print its test-set MSE, in the order listed above.
# NOTE(review): the alphas are compared on the test split itself.
for alpha in alphas:
    Ridge_Model = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred_ridge = Ridge_Model.predict(X_test)
    print(mse_cv(y_pred_ridge))

In [None]:
# Refit ridge regression with alpha = 1 and evaluate it on the held-out split.
Best_Ridge = Ridge(alpha = 1)
Best_Ridge.fit(X_train,y_train)
y_pred_ridge = Best_Ridge.predict(X_test)
print('Mean Squared Error of this model is : ' ,mse_cv(y_pred_ridge))
# R2 is computed on the same held-out rows as the MSE above. Scoring on the
# training rows would measure fit to data the model has already seen.
# Note: score() compares against y_test as stored, whereas mse_cv applies
# np.exp to both sides first, so the two metrics are on different scales.
print('R2 of this model is : ',Best_Ridge.score(X_test,y_test))

In [None]:
# Fit lasso regression with alpha = 1 and evaluate it on the held-out split.
lass_model = Lasso(alpha = 1)
lass_model.fit(X_train,y_train)
y_pred_lasso = lass_model.predict(X_test)
print('Mean Squared Error of this model is : ' ,mse_cv(y_pred_lasso))
# R2 is computed on the same held-out rows as the MSE above. Scoring on the
# training rows would measure fit to data the model has already seen.
# Note: score() compares against y_test as stored, whereas mse_cv applies
# np.exp to both sides first, so the two metrics are on different scales.
print('R2 of this model is : ',lass_model.score(X_test,y_test))

In [None]:
# Fit an elastic net with alpha = 0.0001 and an iteration cap of 5000.
Elastic_Model = ElasticNet(alpha = 0.0001,max_iter=5000)
Elastic_Model.fit(X_train,y_train)
y_pred_elastic = Elastic_Model.predict(X_test)
# Print the MSE that mse_cv computes for the test-set predictions.
print(mse_cv(y_pred_elastic))

In [None]:
# Convert each model's predictions and the true targets back to the price
# scale and wrap them in named Series, ready to be placed side by side.
# np.expm1 is the exact inverse of the np.log1p transform applied to the
# skewed numeric columns earlier; np.exp would leave every value too high by 1.
# NOTE(review): this assumes 'fiyat' was among the log1p-transformed columns.
y_pred_xgboost_series = pd.Series(np.expm1(y_pred_xgboost),name='XGBoost')
y_pred_randomforest_series = pd.Series(np.expm1(y_pred_randomforest),name='Random Forest')
y_pred_ridge_series = pd.Series(np.expm1(y_pred_ridge),name='Ridge')
y_pred_lasso_series = pd.Series(np.expm1(y_pred_lasso),name='Lasso')
y_pred_elastic_series = pd.Series(np.expm1(y_pred_elastic),name='Elastic')
# .values drops y_test's original index so this Series gets a fresh 0..n-1
# index, like the prediction Series above.
y_real = pd.Series(np.expm1(y_test).values,name='Y_Real')

In [None]:
# Column order for the comparison table: true values first, then each model.
prediction_list = [y_real,y_pred_xgboost_series,y_pred_randomforest_series,y_pred_ridge_series,
                   y_pred_lasso_series,y_pred_elastic_series]

In [None]:
# Place the Series side by side as columns of one DataFrame.
all_predictions = pd.concat(prediction_list,axis=1)

In [None]:
# Print dtypes and non-null counts of the comparison table.
all_predictions.info()

In [None]:
# Preview the first rows of the comparison table.
all_predictions.head()

In [None]:
# Show the column names of the encoded frame.
data.columns

In [None]:
# Ask the user to describe one car. Each categorical answer is prefixed with
# '<column>_' so that it can be matched against the one-hot column names;
# numeric answers are converted to int.
Marka = 'marka_' + input("Lütfen arabanın markasını giriniz: ")          # brand
Yıl = int(input("Lütfen arabanın üretim yılını giriniz: "))              # production year
Yakıt = 'yakit_' + input("Lütfen arabanın yakıt tipini giriniz: ")       # fuel type
Vites = 'vites_' + input("Lütfen arabanın vites türünü giriniz: ")       # gearbox type
Kilometre =int(input("Lütfen arabanın kilometresini giriniz: "))         # mileage
# The prefix needs its trailing underscore like the others; without it the
# built name can never equal a 'kasa_tipi_<value>' column.
Kasa_Tipi = 'kasa_tipi_' + input("Lütfen arabanın kasa tipini giriniz: ")  # body type
Motor_Gücü = int(input("Lütfen arabaın motor gücünü giriniz: "))         # engine power


# Collected answers, consumed by the prediction cell.
user_input = [Marka,Yıl,Yakıt,Vites,Kilometre,Kasa_Tipi, Motor_Gücü]

In [None]:
# Build a single-row feature frame for the car described by the user and
# predict its price with the fitted XGBoost model.

# Start from an all-zero row with exactly the training feature columns.
# The width is derived from `features` rather than hard-coded, so it stays
# correct if the set of encoded columns changes.
empthy_df = pd.DataFrame(data=np.zeros((1, len(features))), columns=features)

# One-hot encode the categorical answers: set the matching dummy column to 1.
# Numeric answers in user_input are skipped here and assigned below.
unmatched = []
for answer in user_input:
    if not isinstance(answer, str):
        continue
    if answer in empthy_df.columns:
        empthy_df[answer] = 1
    else:
        unmatched.append(answer)
if unmatched:
    # An unmatched category would otherwise be silently encoded as all zeros.
    print('Warning: no matching feature column for:', unmatched)

# Numeric features are always set, whether or not any category matched.
empthy_df['yil'] = Yıl
empthy_df['km'] = Kilometre
empthy_df['motor_gucu_hp'] = Motor_Gücü

# Apply the same log1p transform that was applied to the skewed numeric
# training columns (`skewed_feats`), so the input is on the scale the model
# was fitted on. Columns that are not model inputs (e.g. the target) are skipped.
cols_to_transform = [col for col in skewed_feats if col in empthy_df.columns]
if cols_to_transform:
    empthy_df[cols_to_transform] = np.log1p(empthy_df[cols_to_transform])

prediction = xgboost_Model.predict(empthy_df)
# Undo the log1p transform of the target to display the price.
np.expm1(prediction)