___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

%matplotlib inline
#%matplotlib notebook

# Default size (width, height) applied to every matplotlib figure.
plt.rcParams["figure.figsize"] = (10, 6)

# pandas display: floats rendered with five decimals, up to 500 columns/rows shown.
pd.options.display.float_format = lambda value: '%.5f' % value
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [None]:
# df = pd.read_csv("getdummied_autoscout_F1382-Heagle.txt")
# df.head()
# df.to_csv('getdummied_autoscout.csv', index=None)

In [None]:
# df = pd.read_table('final_scout_dummy_F1329 - allen', delimiter = ',')
# df.head()

In [None]:
# Load the listings and one-hot encode the categorical (object/category) columns.
# drop_first=True drops one level per category so the dummy columns are not
# perfectly collinear.
df = pd.read_csv("sample_file.csv")
df = pd.get_dummies(df, drop_first=True)

# A cell only renders its final expression, so the preview has to come last;
# placed before the encoding step its result was silently discarded.
df.head()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Single flag: True if at least one cell anywhere in the frame is missing.
df.isnull().any().any()

In [None]:
# Absolute number of missing values per column.
# The previous `* 100` with no division produced a figure that was neither a
# count nor a percentage; the percentage view is computed in its own cell.
df.isnull().sum()

In [None]:
# Share of missing values per column, in percent of the row count.
df.isna().sum().mul(100).div(len(df))

In [None]:
# df.drop("nr_of_doors", axis=1, inplace=True)

In [None]:
# df.isnull().sum()*100

In [None]:
# One column label per line; easier to scan than the truncated Index repr.
for column_name in df.columns:
    print(column_name)

In [None]:
# Cast boolean indicator columns to 0/1 integers with an explicit dtype conversion.
# The former in-place `replace([True, False], [1, 0])` depended on pandas silently
# downcasting the replaced columns back to a numeric dtype; newer pandas deprecates
# that downcast, which can leave object-dtype columns behind.
# NOTE(review): only bool-dtype columns are touched; confirm no object column
# holds True/False values that the old replace() would also have converted.
bool_columns = df.select_dtypes(include="bool").columns
df = df.astype({column: "int64" for column in bool_columns})

## Data Pre-Processing

### Train | Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Target is the listing price; every remaining column is a feature.
y = df["price"]
X = df.drop(columns="price")

In [None]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Min-max scaler; with default arguments it maps each feature to the [0, 1] range.
scaler = MinMaxScaler()

In [None]:
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits so no test-set statistics leak in.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Frequency table of every column, printed one after another.
for column_name in df.columns:
    column_frequencies = df[column_name].value_counts()
    print(column_frequencies)

## Modelling and Model Performance

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [None]:
def eval_metrics(actual, pred):
    """Print R2, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # MSE is computed once and reused for the RMSE.
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, pred)
    score = r2_score(actual, pred)
    # Argument layout is kept as-is so the printed text is unchanged.
    print("r2_score:", score, "\n", "mae:", mae, "\n", "mse:", mse, "\n", "rmse:", rmse)

In [None]:
def train_val(y_train, y_train_pred, y_test, y_pred):
    """Build a side-by-side table of regression metrics for train and test splits.

    Parameters
    ----------
    y_train, y_train_pred : array-like
        True and predicted targets for the training split.
    y_test, y_pred : array-like
        True and predicted targets for the test split.

    Returns
    -------
    pandas.DataFrame
        Columns ``train_set`` and ``test_set``; rows ``R2``, ``mae``, ``mse``
        and ``rmse``.
    """

    def _split_metrics(actual, predicted):
        # One metric dict per split; MSE is computed once and reused for RMSE.
        mse = mean_squared_error(actual, predicted)
        return {
            "R2": r2_score(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
            "mse": mse,
            "rmse": np.sqrt(mse),
        }

    scores = {
        "train_set": _split_metrics(y_train, y_train_pred),
        "test_set": _split_metrics(y_test, y_pred),
    }
    return pd.DataFrame(scores)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares, trained on the min-max scaled features.
ln_model = LinearRegression().fit(X_train_scaled, y_train)

# Predict both splits so train and test error can be compared.
y_train_pred = ln_model.predict(X_train_scaled)
y_pred = ln_model.predict(X_test_scaled)

# Hold-out scores, kept for the model comparison chart.
ln_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
ln_r2 = r2_score(y_test, y_pred)

train_val(y_train, y_train_pred, y_test, y_pred)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyper-parameters, trained on the unscaled features.
rf_model = RandomForestRegressor(random_state=101).fit(X_train, y_train)

# Predict both splits so train and test error can be compared.
y_train_pred = rf_model.predict(X_train)
y_pred = rf_model.predict(X_test)

# Hold-out scores, kept for the model comparison chart.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rf_r2 = r2_score(y_test, y_pred)

train_val(y_train, y_train_pred, y_test, y_pred)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# AdaBoost with default hyper-parameters, trained on the unscaled features.
ada_model = AdaBoostRegressor(random_state=101).fit(X_train, y_train)

# Predict both splits so train and test error can be compared.
y_train_pred = ada_model.predict(X_train)
y_pred = ada_model.predict(X_test)

# Hold-out scores, kept for the model comparison chart.
ada_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
ada_r2 = r2_score(y_test, y_pred)

train_val(y_train, y_train_pred, y_test, y_pred)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default hyper-parameters, trained on the unscaled features.
gb_model = GradientBoostingRegressor(random_state=101).fit(X_train, y_train)

# Predict both splits so train and test error can be compared.
y_train_pred = gb_model.predict(X_train)
y_pred = gb_model.predict(X_test)

# Hold-out scores, kept for the model comparison chart.
gb_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
gb_r2 = r2_score(y_test, y_pred)

train_val(y_train, y_train_pred, y_test, y_pred)

## XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost with default hyper-parameters, trained on the unscaled features.
xgb_model = XGBRegressor(random_state=101).fit(X_train, y_train)

# Predict both splits so train and test error can be compared.
y_train_pred = xgb_model.predict(X_train)
y_pred = xgb_model.predict(X_test)

# Hold-out scores, kept for the model comparison chart.
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
xgb_r2 = r2_score(y_test, y_pred)

train_val(y_train, y_train_pred, y_test, y_pred)

## Model Comparison

In [None]:
# One row per model: (display name, test-set R2, test-set RMSE).
compare = pd.DataFrame(
    [
        ("LinReg", ln_r2, ln_rmse),
        ("RF", rf_r2, rf_rmse),
        ("ADABoost", ada_r2, ada_rmse),
        ("GBoosting", gb_r2, gb_rmse),
        ("XGBoost", xgb_r2, xgb_rmse),
    ],
    columns=["Model", "r2_score", "RMSE"],
)

def labels(ax):
    """Write each bar's length next to it on a horizontal bar chart.

    For every patch in ``ax.patches`` the bar width is printed with five
    decimals, left-aligned at the bar's end and vertically centred on the bar.
    """
    for bar in ax.patches:
        bar_length = bar.get_width()
        # Vertical midpoint of the bar: bottom edge plus half its height.
        bar_mid_y = bar.get_y() + bar.get_height() / 2
        ax.text(bar_length, bar_mid_y, '{:1.5f}'.format(bar_length), ha='left', va='center')
    
# Two stacked panels on one 14x10 figure: R2 on top, RMSE below.
fig, (r2_axis, rmse_axis) = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))

# Top panel: highest R2 first.
compare = compare.sort_values(by="r2_score", ascending=False)
ax = sns.barplot(x="r2_score", y="Model", data=compare, palette="Blues_d", ax=r2_axis)
labels(ax)

# Bottom panel: lowest RMSE first.
compare = compare.sort_values(by="RMSE", ascending=True)
ax = sns.barplot(x="RMSE", y="Model", data=compare, palette="Blues_d", ax=rmse_axis)
labels(ax)

plt.show()

# Finding the Best Model with PyCaret

In [None]:
# Explicit imports instead of `import *`, so PyCaret cannot silently shadow
# names defined earlier in the notebook. The list also covers the helpers
# referenced by the commented-out tuning cells, so they still work if re-enabled.
from pycaret.regression import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    plot_model,
    setup,
    tune_model,
)

# Initialise the PyCaret experiment on the full encoded frame with `price` as target;
# session_id fixes the random seed of the experiment.
s = setup(data=df, target='price', session_id=123)

In [None]:
# Model training and selection: compare_models() evaluates PyCaret's candidate
# regressors on the experiment configured by setup() and returns the top-ranked one.

best_model = compare_models()

In [None]:
# Show PyCaret's evaluation view (diagnostic plots) for the selected model.
evaluate_model(best_model)

In [None]:
# xgboost = create_model("xgboost")

In [None]:
# tuned_xgboost = tune_model(xgboost)

In [None]:
# plot_model(tuned_xgboost)

In [None]:
# plot_model(tuned_xgboost, plot="error")

In [None]:
# final_model = finalize_model(tuned_xgboost)
# print(final_model)

___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___