In [None]:
import numpy as np
import pandas as pd 

In [None]:
# Load the training split from the local dataset folder.
train = pd.read_csv(filepath_or_buffer="dataset/train.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarize column dtypes and non-null counts of the training frame.
train.info()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio 

In [None]:
# Make "notebook" the default plotly renderer for figures shown in this notebook.
pio.renderers.default = "notebook"

In [None]:
def outliers(df, dt):
    """Replace IQR outliers of one column with that column's mean, in place.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of ``df[dt]``. The replacement
    mean is computed over the whole column, outliers included. A short summary
    (column name, number of replaced items, outlier ratio) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Only column ``dt`` is changed.
    dt : str
        Name of a numeric column of ``df``.

    Returns
    -------
    None
    """
    column = df[dt]
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    print("Column:", dt)
    upper_val = Q3 + 1.5 * IQR
    lower_val = Q1 - 1.5 * IQR
    is_outlier = (column > upper_val) | (column < lower_val)
    count = int(is_outlier.sum())
    if count:
        # Write the mean only into the flagged cells of this one column.
        # The earlier ``df.replace(<row labels>, mean)`` call instead replaced
        # every cell in the whole frame whose *value* equalled an outlier's
        # row label, leaving the outliers themselves untouched.
        # ``mask`` upcasts an integer column to float when needed.
        df[dt] = column.mask(is_outlier, column.mean())
    print("Count of Item Replace:", count)
    # Guard the ratio so an empty column does not raise ZeroDivisionError.
    print("Outliers ratio:", (count / len(column)) if len(column) else 0.0)

In [None]:
def null_values(df):
    """Print a per-column summary of missing values in ``df``.

    The printed table has one row per column of ``df`` and two columns:
    ``Total`` (number of null entries) and ``%`` (percentage of null entries,
    rounded to one decimal place). Each is sorted in descending order before
    the two are joined side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    is_missing = df.isnull()
    totals = is_missing.sum().sort_values(ascending=False)
    share = is_missing.sum() / is_missing.count() * 100
    share_sorted = share.round(1).sort_values(ascending=False)
    print(pd.concat([totals, share_sorted], axis=1, keys=['Total', '%']))

In [None]:
# Print the missing-value summary for the training frame.
null_values(train)

In [None]:
# Fill gaps with the previous row's value, then back-fill whatever is still
# missing at the top of a column.
train = train.ffill().bfill()

In [None]:
# Drop the row identifier and four named feature columns.
# ``columns=`` already names the axis, so no separate ``axis`` argument is needed.
train = train.drop(
    columns=[
        "Id",
        "PoolQC",
        "Fence",
        "Alley",
        "MiscFeature",
    ]
)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
def cat_to_num(df,dt):
    """Ordinal-encode column ``dt`` of ``df`` in place, then treat its outliers.

    A fresh ``OrdinalEncoder`` is fitted on the column at every call, the
    resulting codes are stored back as ``int64``, and ``outliers`` is then run
    on the encoded column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    dt : str
        Name of the column to encode.

    Returns
    -------
    None
    """
    # fit_transform returns an (n_rows, 1) array; flatten it to one column.
    encoded = OrdinalEncoder().fit_transform(df[[dt]])
    df[dt] = pd.Series(encoded.ravel(), index=df.index).astype("int64")
    outliers(df, dt)

In [None]:
# Encode every object-dtype column of the training frame as integers.
# ``np.object`` was only an alias for the builtin ``object``; it was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
for item in train.loc[:, train.dtypes == object].columns:
    cat_to_num(train, item)

In [None]:
# Cast every column to 64-bit integers.
# NOTE(review): columns that received a fractional mean from ``outliers`` are
# presumably truncated by this cast; confirm that is acceptable.
train = train.astype(np.int64)

## Correlation

In [None]:
# Pairwise Pearson correlation between all (now numeric) columns.
corr = train.corr(method="pearson")

In [None]:
# Correlation of every column with the target.
corr.loc[:, "SalePrice"]

In [None]:
# Columns whose correlation with SalePrice is negative.
# NOTE(review): a negative correlation can still be predictive; confirm that
# discarding these features is intended.
columns_toremove = corr.index[corr["SalePrice"].lt(0)].tolist()

In [None]:
# Remove the negatively correlated columns from the training frame.
train = train.drop(columns=columns_toremove)

In [None]:
# Heatmap of the correlation matrix computed earlier in ``corr``.
trace = go.Heatmap(
    x=corr.index.values,
    y=corr.columns.values,
    z=corr.values,
)
traces = [trace]
layout = go.Layout(
    autosize=False,
    width=1050,
    height=900,
    title=" Correlation",
)
fig_go = go.Figure(data=traces, layout=layout)
fig_go.show()

In [None]:
# (rows, columns) of the training frame after the column drops above.
train.shape

In [None]:
# Repeats the shape check from the previous cell.
train.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Standardize every remaining column of the training frame.
# NOTE(review): the target column is scaled here as well and the scaler is
# fitted before the train/test split; confirm both are intended.
numeric_coloumns = train.columns.tolist()
pipeline = ColumnTransformer(
    transformers=[("Standred", StandardScaler(), numeric_coloumns)]
)
scaled_data = pd.DataFrame(
    data=pipeline.fit_transform(train),
    columns=train.columns.tolist(),
)

In [None]:
# The target is taken from the unscaled frame; the scaled copy of it is
# removed from the feature matrix.
label = train.loc[:, "SalePrice"]
scaled_data = scaled_data.drop(columns=["SalePrice"])

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    scaled_data,
    label,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with 100 trees and a fixed seed.
regressor = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
regressor.fit(train_x, train_y)

In [None]:
# Predict the held-out rows with the baseline forest.
y_pred = regressor.predict(test_x)

In [None]:
# Display the baseline predictions.
y_pred

In [None]:
from sklearn import metrics
# Hold-out errors of the baseline forest; the MSE is computed once and reused
# for the RMSE.
_baseline_mse = metrics.mean_squared_error(test_y, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(test_y, y_pred))
print('Mean Squared Error:', _baseline_mse)
print('Root Mean Squared Error:', np.sqrt(_baseline_mse))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# ``max_features='auto'`` was deprecated in scikit-learn 1.1 and removed in
# 1.3. For a regressor it meant "use all features", which ``1.0`` expresses
# in every version.
estimator = RandomForestRegressor(max_features=1.0, oob_score=True, random_state=42)
param_grid = { 
        "n_estimators"      : [200, 400, 700],
        "min_samples_split" : [2,4,8,10],
            }

# 10-fold cross-validated search over the grid above, run on 6 workers.
grid = GridSearchCV(estimator, param_grid, cv=10, n_jobs=6)

grid.fit(train_x, train_y)

In [None]:
# Report the winning estimator, its cross-validated score and its parameters.
print(" Results from Grid Search ")
for _heading, _value in (
    ("\n The best estimator across ALL searched params:\n", grid.best_estimator_),
    ("\n The best score across ALL searched params:\n", grid.best_score_),
    ("\n The best parameters across ALL searched params:\n", grid.best_params_),
):
    print(_heading, _value)

In [None]:
# Final forest used for the submission predictions.
# NOTE(review): n_estimators=1000 is not one of the grid-search candidates and
# the search result is not reused here; confirm this is intentional.
model = RandomForestRegressor(
    random_state=42,
    oob_score=True,
    n_estimators=1000,
)
model.fit(train_x, train_y)

In [None]:
# Predict the held-out rows with the final forest.
model_pred = model.predict(test_x)

In [None]:
# Hold-out errors of the final forest; the MSE is computed once and reused
# for the RMSE.
_model_mse = metrics.mean_squared_error(test_y, model_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(test_y, model_pred))
print('Mean Squared Error:', _model_mse)
print('Root Mean Squared Error:', np.sqrt(_model_mse))

## Test

In [None]:
# Load the unlabeled test split from the local dataset folder.
test = pd.read_csv(filepath_or_buffer="dataset/test.csv")

In [None]:
# Summarize column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Drop the same four feature columns that were removed from the training frame
# ("Id" is kept for now).
test = test.drop(
    columns=[
        "PoolQC",
        "Fence",
        "Alley",
        "MiscFeature",
    ]
)

In [None]:
# Remove the columns that were dropped from training for negative correlation.
test = test.drop(columns=columns_toremove)

In [None]:
# Fill gaps with the previous row's value, then back-fill whatever is still
# missing at the top of a column.
test = test.ffill().bfill()

In [None]:
# Encode every object-dtype column of the test frame as integers.
# ``np.object`` was only an alias for the builtin ``object``; it was deprecated
# in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
# NOTE(review): ``cat_to_num`` fits a new encoder on each call, so the integer
# codes produced here are not guaranteed to match those used for training.
for item in test.loc[:, test.dtypes == object].columns:
    cat_to_num(test, item)

In [None]:
# Cast every column of the test frame to 64-bit integers.
test = test.astype(np.int64)

In [None]:
# Keep the row identifiers aside and remove them from the feature frame.
test_id = test.loc[:, "Id"].astype("int32")
test = test.drop(columns=["Id"])

In [None]:
# ``model`` was fitted on standardized features, so the test frame must go
# through the same fitted scaler before prediction. Passing raw values puts
# them on a different scale from the one the trees were trained on.
feature_columns = list(train_x.columns)
missing_features = [col for col in feature_columns if col not in test.columns]
if missing_features:
    raise ValueError(f"test data is missing model features: {missing_features}")

# ``pipeline`` expects every column in ``numeric_coloumns``. ``reindex``
# supplies a placeholder 0 for any expected column the test frame lacks (the
# target). StandardScaler scales each column independently, so the placeholder
# cannot influence the feature columns and is discarded afterwards.
scaler_input = test.reindex(columns=numeric_coloumns, fill_value=0)
test_scaled = pd.DataFrame(
    pipeline.transform(scaler_input),
    columns=numeric_coloumns,
    index=test.index,
)[feature_columns]

pred = model.predict(test_scaled)

In [None]:
# Display the predictions for the test frame.
pred