In [1]:
import pandas as pd
import numpy as np
import googletrans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OrdinalEncoder , LabelEncoder 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
%matplotlib inline
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
import xgboost as xgb

# Draw all matplotlib figures with the built-in "dark_background" style sheet.
plt.style.use("dark_background")
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

from utils.security import security_unique_values
import utils.data_prepare as dp 
import utils.learning as l 

In [2]:
# Load the raw training set.
train = pd.read_hdf('data/property.train.h5')

# Frame-level feature engineering. Every step wraps a helper from
# utils.data_prepare in dp.DFTransform so the helpers can be chained in a
# scikit-learn Pipeline and re-applied unchanged to the test set later on.
feature_engineering = Pipeline(steps=[
    ('get date', dp.DFTransform(lambda df: dp.date(df))),
    ('get metro name from breadcrumbs', dp.DFTransform(lambda df: dp.metro(df))),
    ('get unique values from Security:', dp.DFTransform(lambda df: dp.security(df))),
    ('make one hot encoding on security',
     dp.DFTransform(lambda df: dp.one_hot_encoding(df, 'security_clean', security_unique_values))),
])

# Keep the engineered frame under its own name instead of overwriting `df`
# twice in one cell; `df` is bound only once, to the encoded matrix below.
train_features = feature_engineering.fit_transform(train)

# Columns that must not be used as model inputs ('price' is the target).
black_list = ['geo_block', 'owner', 'price']
feats = [col for col in train_features.columns if col not in black_list]

# Turn every selected column into integer codes:
#   1. replace missing values with the literal string "nan" so that they form
#      their own category,
#   2. ordinal-encode the resulting values.
# 'ignore' is not a documented handle_unknown option for OrdinalEncoder (only
# 'error' and 'use_encoded_value' are), and current scikit-learn rejects it.
# Unseen categories in the test set are therefore mapped to the explicit
# sentinel -1 instead of being silently merged with an existing code.
converting_data_to_numbers = Pipeline(steps=[
    ('fill_na', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="nan")),
    ('factorize', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

data_transform = ColumnTransformer(
    transformers=[
        ('converting_data_to_numbers', converting_data_to_numbers, feats),
    ])

pipeline = Pipeline(steps=[('data_transform', data_transform)])

# Encoded training matrix; the next cell wraps it back into a DataFrame.
df = pipeline.fit_transform(train_features[feats])

In [3]:
# Re-attach the feature names to the encoded matrix so the result can be
# inspected (and later passed to the model) as a labelled DataFrame.
df = pd.DataFrame(data=df, columns=feats)
df

Unnamed: 0,date,Building type:,Object type:,Ad type:,Commission agent:,Construction phase:,Housing class:,Elevator:,Bathroom type:,Balcony type:,Mortgage possible:,The view from the window:,Garbage chute:,Repair:,Fridge:,Phone:,Furniture:,Free layout:,It is possible to bargain:,Floor covering:,Room type:,Internet:,Kitchen furniture:,TV:,Washing machine:,Foundation type:,Overlap type:,Type of the building:,Playground:,Class:,metro,security_ohe,access control system_ohe,alarm system_ohe,checkpoint_ohe,closed area_ohe,concierge_ohe,fenced area_ohe,fire system_ohe,high-quality and safe playgrounds._ohe,intercom_ohe,nan_ohe,parking_ohe,provided_ohe,round the clock protected area_ohe,round the clock security_ohe,secure area_ohe,the area with landscaping and the use of small architectural forms_ohe,video surveillance_ohe
0,11.0,1.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,19.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2.0,4.0,2.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,6.0,3.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,3.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,7.0,3.0,2.0,2.0,1.0,7.0,4.0,1.0,1.0,2.0,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,3.0,1.0,3.0,1.0,7.0,4.0,0.0,1.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45689,1.0,5.0,2.0,2.0,1.0,0.0,1.0,1.0,1.0,3.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45690,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,3.0,3.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
45691,8.0,5.0,0.0,3.0,1.0,7.0,4.0,0.0,1.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45692,1.0,3.0,1.0,0.0,1.0,6.0,1.0,0.0,1.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,2.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def train_and_predict(model, X, y):
    """Fit ``model`` on ``(X, y)`` and score it on that very same data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : feature matrix used both for fitting and for prediction.
    y : target values.

    Returns
    -------
    The mean absolute error between ``y`` and the model's predictions for
    ``X``. Because the model is evaluated on the data it was fitted on, this
    is an in-sample (training) error, not a hold-out estimate.
    """
    model.fit(X, y)
    fitted_values = model.predict(X)
    return mean_absolute_error(y_true=y, y_pred=fitted_values)

# Plain NumPy views of the data handed to the `l.*` helpers below:
# `y` is the untransformed price column, `X` the encoded feature matrix.
y = train['price'].values
X = df.values

In [5]:
# Gradient-boosted regressor shared by the evaluation and submission cells.
model = xgb.XGBRegressor(
    max_depth=15,
    learning_rate=0.2,
    n_estimators=70,
    random_state=0,
)
# Cross-validation run, currently disabled:
# l.run_cv(model, X, y, target_log=True)

In [17]:
# Learning-curve plot via the project helper in utils.learning.
# NOTE(review): this cell's execution count (17) is out of sequence with its
# neighbours and its saved output is a KeyboardInterrupt, so the plot was never
# completed in the recorded run — presumably the call is slow; confirm before
# including it in a Restart & Run All.
l.plot_learning_curve(model,X, y, target_log=True)

KeyboardInterrupt: 

### Upload Colab

In [6]:
# Train on the natural log of the price; predictions must be passed through
# np.exp to get back to the original price scale.
log_price = np.log(train['price']).values
model.fit(df, log_price)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=15,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=70, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# Load the test set and push it through the transformers that were already
# fitted on the training data (transform only, no re-fitting).
test = pd.read_hdf('data/property.test.h5')
df_test = feature_engineering.transform(test)

# The model was fitted on a DataFrame whose columns are `feats`; wrap the
# encoded test matrix the same way so predict() sees identical feature names
# instead of an unnamed ndarray.
df_test = pd.DataFrame(pipeline.transform(df_test[feats]), columns=feats)

# The model predicts log(price); invert the transform to get prices.
# np.exp never returns a negative number, so no clipping at zero is needed.
y_pred = np.exp(model.predict(df_test))

# Write the submission file: one row per listing with its predicted price.
test['price'] = y_pred
test[['id', 'price']].to_csv('output/model_Xgboost_all.csv', index=False)