<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Saransk_House_Predict_with_LGBMRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pycaret

In [None]:
from pycaret.regression import *

In [None]:
!pip install kaggle


from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  


!mkdir -p ~/.kaggle/
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d mrdaniilak/russia-real-estate-2021

In [None]:
! unzip russia-real-estate-2021.zip

In [None]:
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

In [None]:
# Read the semicolon-delimited CSV from the Colab working directory.
df = pd.read_csv(filepath_or_buffer='/content/input_data.csv', delimiter=';')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Keep only the rows whose id_region equals 13 (the subset this notebook models).
is_region_13 = df['id_region'] == 13
saransk_df = df.loc[is_region_13].copy()
saransk_df.head()

In [None]:
# Renumber the rows 0..n-1 so the index is contiguous after the region filter.
n_rows = saransk_df.shape[0]
saransk_df.index = np.arange(n_rows)

In [None]:
# Preview the filtered rows with their new index.
saransk_df.head()

In [None]:
# Count the missing values in each column.
saransk_df.isnull().sum()

In [None]:
# Columns removed before modelling; each label appears exactly once.
columns_to_drop = ['house_id', 'date', 'postal_code', 'street_id', 'id_region']
saransk_df = saransk_df.drop(columns=columns_to_drop)

# Discard every row that still has a missing value in any remaining column.
saransk_df = saransk_df.dropna()

In [None]:
# Re-count the missing values per column after the column drop and dropna().
saransk_df.isnull().sum()

In [None]:
# Print the column dtypes and non-null counts.
saransk_df.info()

In [None]:
# Summary statistics of the columns, used to spot implausible values.
saransk_df.describe()

In [None]:
# Price distribution as it stands at this point in the notebook.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot/displot
# are the replacements) — migrate once the installed seaborn version is confirmed.
sns.set_style("white")
sns.set_color_codes(palette='deep')

f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(saransk_df['price'], color="b", ax=ax)

# Cosmetics: no vertical grid lines, labelled axes, trimmed spines.
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="Price", title="Price distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Remove listings priced above 1,000,000,000.
over_cap = saransk_df.index[saransk_df['price'] > 1_000_000_000]
saransk_df = saransk_df.drop(index=over_cap)

In [None]:
# Remove listings with a non-positive price. The target is log-transformed
# later in the notebook, and np.log yields -inf for 0 and NaN for negative
# values, so both cases have to go (not only price == 0).
non_positive_price = saransk_df.index[saransk_df['price'] <= 0]
saransk_df = saransk_df.drop(index=non_positive_price)

In [None]:
# Price distribution again, after the price filters in the preceding cells.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot/displot
# are the replacements) — migrate once the installed seaborn version is confirmed.
sns.set_style("white")
sns.set_color_codes(palette='deep')

f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(saransk_df['price'], color="b", ax=ax)

# Cosmetics: no vertical grid lines, labelled axes, trimmed spines.
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="Price", title="Price distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Summary statistics after the price filters.
saransk_df.describe()

In [None]:
# Remove rows with a negative kitchen_area, then re-check the summary statistics.
negative_kitchen = saransk_df.index[saransk_df['kitchen_area'] < 0]
saransk_df = saransk_df.drop(index=negative_kitchen)
saransk_df.describe()

In [None]:
# Pairwise column correlations, drawn as an annotated heatmap (one decimal place).
corr_matrix = saransk_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.1f', cmap='inferno')

In [None]:
# Preview the cleaned frame before encoding.
saransk_df.head()

In [None]:
# Columns to expand into one indicator column per distinct value.
features_cols = ['level', 'levels', 'rooms', 'building_type', 'object_type']
proba = pd.get_dummies(data=saransk_df, columns=features_cols)

In [None]:
# Preview the one-hot encoded frame.
proba.head()

In [None]:
# Separate the target (price) from the remaining columns, which become the features.
y = proba['price']
X = proba.drop(columns='price')

In [None]:
# Model the natural log of the price. The value is derived from `proba.price`
# instead of re-assigning y from itself, so re-running this cell cannot apply
# the log twice.
y = np.log(proba.price)

In [None]:
# Scale the features with RobustScaler.
# NOTE(review): the scaler is fitted on the full X before the train/test split
# that follows, so test rows influence the scaling statistics — consider
# fitting on the training split only.
rc = RobustScaler()
X = rc.fit(X).transform(X)

In [None]:
# Display the scaled feature matrix.
X

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: linear regression fitted on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Linear-model predictions on the training rows (used by the metrics cell).
y_pred = lm.predict(X=X_train)

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
from math import sqrt

# Evaluate the linear model on both splits. The values printed as "Accuracy"
# come from lm.score / r2_score.
lm_test_pred = lm.predict(X_test)  # computed once, reused for RMSE and MAE

print('The Accuracy  on the training dataset is: ', lm.score(X_train, y_train))
print('The Accuracy n2  on the training dataset is: ', r2_score(y_train, y_pred))

print("")
# Score on the held-out split
print('The Accuracy  on the testing dataset is: ', lm.score(X_test, y_test))

print("")
# Root mean squared error (RMSE)
train_rmse = sqrt(mean_squared_error(y_train, y_pred))
test_rmse = sqrt(mean_squared_error(y_test, lm_test_pred))
print('The RMSE  on the training dataset is: ', train_rmse)
print('The RMSE  on the testing dataset is: ', test_rmse)

print("")
# Mean absolute error (MAE)
print('The MAE  on the training dataset is: ', mean_absolute_error(y_train, y_pred))
print('The MAE  on the testing dataset is: ', mean_absolute_error(y_test, lm_test_pred))

In [None]:
# LightGBM hyper-parameters gathered in one mapping and unpacked into the
# estimator; the three seed-like settings are fixed for repeatable training.
lgbm_params = dict(
    objective='regression',
    num_leaves=6,
    learning_rate=0.01,
    n_estimators=7000,
    max_bin=200,
    bagging_fraction=0.8,
    bagging_freq=4,
    bagging_seed=8,
    feature_fraction=0.2,
    feature_fraction_seed=8,
    min_sum_hessian_in_leaf=11,
    verbose=-1,
    random_state=42,
)
lightgbm = LGBMRegressor(**lgbm_params)

In [None]:
# Train the LightGBM regressor on the training split.
lightgbm.fit(X=X_train, y=y_train)

In [None]:
# LightGBM predictions on the training rows (used by the metrics and scatter cells).
y_pred_lr = lightgbm.predict(X=X_train)

In [None]:
# Evaluate the LightGBM model on both splits. The values printed as "Accuracy"
# come from lightgbm.score / r2_score.
# Every test-set metric uses LightGBM's own predictions; the test RMSE
# previously evaluated the linear model `lm` by mistake.
lgbm_test_pred = lightgbm.predict(X_test)  # computed once, reused for RMSE and MAE

print('The Accuracy  on the training dataset is: ', lightgbm.score(X_train, y_train))
print('The Accuracy n2  on the training dataset is: ', r2_score(y_train, y_pred_lr))

print("")
# Score on the held-out split
print('The Accuracy  on the testing dataset is: ', lightgbm.score(X_test, y_test))

print("")
# Root mean squared error (RMSE)
print('The RMSE  on the training dataset is: ', sqrt(mean_squared_error(y_train, y_pred_lr)))
print('The RMSE  on the testing dataset is: ', sqrt(mean_squared_error(y_test, lgbm_test_pred)))

print("")
# Mean absolute error (MAE)
print('The MAE  on the training dataset is: ', mean_absolute_error(y_train, y_pred_lr))
print('The MAE  on the testing dataset is: ', mean_absolute_error(y_test, lgbm_test_pred))

In [None]:
# Scatter of actual vs. predicted training targets for the LightGBM model,
# with a dashed red diagonal marking perfect predictions.
plt.figure(figsize=(15,10))

plt.scatter(y_train, y_pred_lr, c='green')

# Line style and colour are passed as keywords; the earlier 'k--' format string
# also requested black, which conflicted with the red colour keyword.
diagonal = [y_train.min(), y_train.max()]
plt.plot(diagonal, diagonal, linestyle='--', color='red', lw=3)

plt.xlabel('Actuals')
plt.ylabel('Predicted Values')
plt.title('Actuals Vs Predicted Values')
plt.show()

In [None]:
# Initialise the PyCaret regression experiment on the cleaned frame.
# - target: the raw `price` column of saransk_df
# - ignore_features: columns excluded from modelling
# - silent=True: skip the interactive confirmation (PyCaret 2.x argument)
# - session_id: fixed seed so the model comparison, blending and tuning that
#   follow are reproducible across runs
model = setup(data=saransk_df,
              target='price',
              numeric_imputation='mean',
              ignore_features=['geo_lat', 'geo_lon', 'object_type'],
              normalize=True,
              silent=True,
              session_id=42)

In [None]:
# Compare PyCaret's candidate regressors and keep the top two.
# (n_select=2 presumably returns a list, since `best` is later passed as
# estimator_list — confirm against the PyCaret docs.)
best = compare_models(n_select=2)

In [None]:
# Display the selected models.
best

In [None]:
# Combine the selected models into a single blended estimator.
blender = blend_models(estimator_list=best)

In [None]:
# Tune the blended estimator.
# NOTE(review): the result is named `tuned_lgb`, but the object being tuned is
# the blender, not a single LightGBM model.
tuned_lgb = tune_model(blender)

In [None]:
# Look at a few rows as a reference for building a hand-written sample.
saransk_df.head()


In [None]:
# A single hand-written listing to price with the trained model.
test = [dict(
    level=2,
    levels=16,
    rooms=3,
    area=76,
    kitchen_area=10,
    building_type=4,
)]

In [None]:
# Turn the sample record into a one-row DataFrame for predict_model.
test = pd.DataFrame(data=test)

In [None]:
# Display the sample row.
test

In [None]:
# Score the sample row with the tuned estimator.
predictions = predict_model(estimator=tuned_lgb, data=test)

In [None]:
# Display the prediction output.
predictions

In [None]:
# Predicted value from the `Label` column, rounded to zero decimals.
predictions['Label'].round(0)

In [None]:
# Render the predictions with the `Label` column shown to two decimal places
# (affects only this styled view, not the underlying data).
predictions.style.format(formatter={'Label': '{0:.2f}'})

In [None]:
# Display the unformatted predictions frame again.
predictions