In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
from tqdm import tqdm
import helper as h

import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error
import re

In [None]:
# Read both splits from HDF5; "price_value" is parsed on the training split only.
df_train = pd.read_hdf("../input/df.train.h5")
df_test = pd.read_hdf("../input/df.test.h5")

# Convert each raw price entry with the project helper h.parse_price.
df_train["price_value"] = df_train["price_value"].map(h.parse_price)

print(df_train.shape, df_test.shape)

In [None]:
# Build the single working frame from both splits via the project helper.
# NOTE(review): the final cell separates the splits again by whether
# "price_value" is null, so rows coming from df_test are presumably left
# without a price here — confirm in helper.get_df.
df = h.get_df(df_train, df_test)

In [None]:
# Collect every column whose name contains "_cat" and display the list.
cat_feats = [column for column in df.columns if "_cat" in column]
cat_feats

In [None]:
def _production_year_or_fallback(row):
    """Return the row's production year, or int(row["Year"]) when it is -1."""
    year = row["year_production"]
    return year if year != -1 else int(row["Year"])


def _first_range_start(matches):
    """First captured number of the first match, or -1 when nothing matched."""
    return matches[0][0] if matches else -1


def _first_range_end(matches):
    """Second captured number of the first match, or -1 when absent or empty."""
    if matches and matches[0][1] != "":
        return matches[0][1]
    return -1


# Production year as an integer; -1 is replaced by the "Year" column below.
df["year_production"] = df["Rok produkcji"].astype(int)
df["year_production_ext"] = df.apply(_production_year_or_fallback, axis=1)

# The pattern captures a digit run followed by "-" and an optional second
# digit run, giving a list of (from, to) string pairs per row.
df["version_years"] = df["Wersja"].map(lambda raw: re.findall(r'(\d+)-(\d+)?', str(raw)))

df["version_year_from"] = df["version_years"].map(_first_range_start).astype(int)
df["version_year_to"] = df["version_years"].map(_first_range_end).astype(int)

In [None]:
# Baseline run: the three year features plus every "_cat" column.
feats = [
    "year_production_ext",
    "version_year_from",
    "version_year_to",
    *cat_feats,
]
h.check_log_model(
    df,
    feats,
    xgb.XGBRegressor(max_depth=5, n_estimators=50, learning_rate=0.3, random_state=0),
)

In [None]:
# Inspect the raw column that the next cell parses into the numeric "engine" feature.
df["Pojemność skokowa"]

In [None]:
# Keep the text before the "cm3" suffix, drop embedded spaces, cast to integer.
df["engine"] = (
    df["Pojemność skokowa"]
    .map(lambda raw: str(raw).partition("cm3")[0].replace(" ", ""))
    .astype("int")
)
df["engine"].hist()

In [None]:
def reset_outlires(df, feat, prc=99, fill_value=-1):
    """Replace values at or above a percentile cut-off with a sentinel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to process.
    feat : str
        Name of the numeric column in ``df``.
    prc : float, default 99
        Percentile (0-100) of ``df[feat]`` used as the cut-off value.
    fill_value : scalar, default -1
        Value written in place of every entry that is not strictly below
        the cut-off.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``df[feat]``; ``df`` itself is
        not modified.
    """
    values = df[feat]
    cut_value = np.percentile(values, prc)

    # Vectorised form of mapping ``x if x < cut_value else fill_value`` over
    # every element: entries for which the comparison is False (NaN included)
    # receive ``fill_value``.
    return values.where(values < cut_value, fill_value)

In [None]:
# Compute the clipped engine column once, store it as a feature, and plot the
# stored column rather than calling reset_outlires a second time for the plot.
df["engine_99"] = reset_outlires(df, "engine")
df["engine_99"].hist();

In [None]:
# Inspect the raw column that the next cell parses into the numeric "horse_power" feature.
df["Moc"]  

In [None]:
# Keep the text before the "KM" suffix, drop embedded spaces, cast to integer.
df["horse_power"] = (
    df["Moc"]
    .map(lambda raw: str(raw).partition("KM")[0].replace(" ", ""))
    .astype("int")
)

In [None]:
# Distribution of the parsed horse-power values before any outlier handling.
df["horse_power"].hist()

In [None]:
# Compute the clipped horse-power column once, store it as a feature, and plot
# the stored column rather than calling reset_outlires a second time.
df["horse_power_99"] = reset_outlires(df, "horse_power")
df["horse_power_99"].hist();

In [None]:
# Year features, every "_cat" column, and the two clipped numeric features.
feats = [
    "year_production_ext",
    "version_year_from",
    "version_year_to",
    *cat_feats,
    "engine_99",
    "horse_power_99",
]

h.check_log_model(
    df,
    feats,
    xgb.XGBRegressor(max_depth=10, n_estimators=50, learning_rate=0.3, random_state=0),
)

In [None]:
# Frequency of each value in this "_cat" column, to see how it is distributed.
df['Zarejestrowany jako zabytek_cat'].value_counts()

In [None]:
# Hand-picked "_cat" columns, replacing the automatically collected list.
# 'abs_cat' and 'elek_szyby_cat' are created from df['features'] in later
# cells, which must run before this list is used for modelling.
cat_feats = [
    'Bezwypadkowy_cat',
    'Pojemność skokowa_cat',
    'Rodzaj paliwa_cat',
    'Skrzynia biegów_cat',
    'Pierwszy właściciel_cat',
    'Przebieg_cat',
    'Moc_cat',
    'Metalik_cat',
    'Typ_cat',
    'Kolor_cat',
    'Model pojazdu_cat',
    'Serwisowany w ASO_cat',
    'Napęd_cat',
    'Rok produkcji_cat',
    'Marka pojazdu_cat',
    'Perłowy_cat',
    'Uszkodzony_cat',
    'Akryl (niemetalizowany)_cat',
    'Tuning_cat',
    'Kierownica po prawej (Anglik)_cat',
    'Matowy_cat',
    'Zarejestrowany jako zabytek_cat',
    'abs_cat',
    'elek_szyby_cat',
]
# Candidates currently left out of the list: 'Alarm', 'czujnik_deszczu'.

In [None]:
# Summary statistics of the working frame's numeric columns.
df.describe()

In [None]:
# A sample entry can be inspected with: str(df['features'][0])
# Extract ABS from `features`: 1 when 'ABS' occurs in the row's value, else 0.
df['abs_cat'] = df['features'].map(lambda entry: int('ABS' in entry)).astype(int)

In [None]:
# Flag is 1 when 'Elektryczne szyby przednie' occurs in the row's `features`
# value, else 0 (append .value_counts() to the mapped series to see the split).
df['elek_szyby_cat'] = (
    df['features']
    .map(lambda entry: int('Elektryczne szyby przednie' in entry))
    .astype(int)
)

In [None]:
 
# Flag is 1 when 'Alarm' occurs in the row's `features` value, else 0;
# the counts of both values are displayed below.
df['Alarm'] = df['features'].map(lambda entry: int('Alarm' in entry)).astype(int)
df['Alarm'].value_counts()

In [None]:
%%time

feats = ["year_production_ext", "version_year_from", "version_year_to"] + cat_feats
feats += ["engine_99", "horse_power_99"]

#10, 50, 0.3 - 7681.626360661877, 197.16127358219487
#10, 75, 0.3 - 7638.219205992039, 201.46479441831838
#10, 60, 0.3 - 7666.922779800343, 197.31304650271315
#10, 40, 0.3 - 7731.081230752386, 187.79458476764736

h.check_log_model(df, feats, xgb.XGBRegressor(max_depth=10, n_estimators=1000, learning_rate=0.1, random_state=0)) 

In [None]:
# Final feature set: year features, "_cat" columns, clipped engine size and power.
feats = [
    "year_production_ext",
    "version_year_from",
    "version_year_to",
    *cat_feats,
    "engine_99",
    "horse_power_99",
]

# Rows with a known price form the training set; rows without one are predicted.
missing_price = df["price_value"].isnull()
df_train = df[~missing_price].copy()
df_test = df[missing_price].copy()

X_train = df_train[feats]
X_test = df_test[feats]

# The model is fitted on log(price); predictions are mapped back with exp.
y_train = df_train["price_value"]
y_log_train = np.log(y_train)

model = xgb.XGBRegressor(max_depth=10, n_estimators=70, learning_rate=0.29, random_state=0)
model.fit(X_train, y_log_train)

y_log_pred = model.predict(X_test)
y_pred = np.exp(y_log_pred)

# Write the submission: one row per test id with its predicted price.
df_test["price_value"] = y_pred
submission = df_test[["id", "price_value"]]
submission.to_csv("../output/xgb_log_year_production_engine_etc.csv", index=False)