In [5]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt

from catboost import CatBoostRegressor

In [6]:
# Seed passed to train_test_split, KFold and CatBoost so runs are repeatable.
random_state = 42
# Number of K-fold splits.
n_splits = 5
# Multiplier applied to the ensembled test predictions before export (shrinks them by 2%).
magic_coef = 0.98

### Clustering by zipcode (did not improve the metrics)

In [8]:
zipcode_path = './data/zipcodes.csv'


def get_zipcodes(zipcode_path, eps_km=14, min_samples=1):
    """Load the zipcode table and label every zipcode with a DBSCAN geo-cluster.

    Parameters
    ----------
    zipcode_path : str or file-like
        CSV with at least ``zipcode``, ``latitude`` and ``longitude`` columns.
    eps_km : float, default 14
        Neighbourhood radius in kilometres; converted to radians for the
        haversine metric.
    min_samples : int, default 1
        Passed through to ``DBSCAN``. With the default of 1 every point is a
        core point, so no zipcode is labelled as noise (-1).

    Returns
    -------
    pandas.DataFrame
        One row per unique zipcode (rows containing any NaN are dropped) with
        an extra integer ``clusters`` column. The ``Unnamed: 0`` index column
        is removed when present.

    Notes
    -----
    NOTE(review): the saved output of this cell shows cluster 0 for every
    displayed row, i.e. with these defaults the points appear to chain into a
    single cluster -- confirm before using ``clusters`` as a feature.
    """
    kms_per_radian = 6371.0088  # mean Earth radius in km

    zipcodes = (
        pd.read_csv(zipcode_path)
        .drop_duplicates(subset="zipcode", keep='first')
        .dropna()
    )

    # The haversine metric expects [latitude, longitude] pairs in radians.
    coords = zipcodes[['latitude', 'longitude']].values
    db = DBSCAN(
        eps=eps_km / kms_per_radian,
        min_samples=min_samples,
        algorithm='ball_tree',
        metric='haversine',
    ).fit(np.radians(coords))

    zipcodes['clusters'] = db.labels_

    # errors='ignore': do not fail on a CSV that was saved without its index column.
    return zipcodes.drop(columns=['Unnamed: 0'], errors='ignore')


get_zipcodes(zipcode_path)

Unnamed: 0,zipcode,city,latitude,longitude,clusters
0,19348,Berge bei Perleberg,53.237460,11.870770,0
1,85309,Pörnbach,48.616700,11.466700,0
2,24790,"Osterrönfeld Heidkrug, Gemeinde Osterrönfeld",54.275360,9.737535,0
3,98646,Hildburghausen,50.439501,10.723922,0
4,27336,"Frankenfeld, Aller",52.769510,9.430780,0
...,...,...,...,...,...
17355,94469,Deggendorf,48.833300,12.966700,0
17359,86643,"Rennertshofen, Oberbayern",48.750000,11.066700,0
17360,31097,Harbarnsen,51.990910,9.958610,0
17361,38324,Kissenbrück,52.110410,10.588690,0


In [16]:
def data_preparation(path, type='train', zipcode_path='./data/zipcodes.csv'):
    """Read a raw CSV and return it with all missing values imputed.

    Parameters
    ----------
    path : str or file-like
        CSV to load (anything ``pd.read_csv`` accepts).
    type : str, default 'train'
        ``'train'`` drops the ``Unnamed: 0`` index column; any other value
        keeps it, renamed to ``Id``.
    zipcode_path : str
        Only used by the disabled zipcode-cluster experiment below.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without ``zipcode`` and with no NaN left in
        ``model``, ``gearbox``, ``type``, ``fuel``, ``damage``,
        ``insurance_price`` and ``engine_capacity``.

    Notes
    -----
    Every fill is written back with an explicit assignment. The chained
    ``data[col].fillna(..., inplace=True)`` form operates on a temporary
    Series under pandas Copy-on-Write and would leave the frame unchanged.
    """
    data = pd.read_csv(path)

#     Disabled experiment: merge zipcode clusters into the main dataframe
#     zipcodes = get_zipcodes(zipcode_path)
#     data = pd.merge(data, zipcodes, on=['zipcode'], how='left')
#     data['clusters'] = data['clusters'].fillna(-1).astype(int).astype(object)

    if type == 'train':
        data = data.drop(columns=['Unnamed: 0'])
    else:
        data = data.rename(columns={"Unnamed: 0": "Id"})

    # Drop columns that are not used as features.
    data = data.drop(columns=['zipcode'])

    # Categorical columns: a dedicated "missing" category worked better than
    # filling with the mode; gearbox is the exception and uses the mode.
    data['model'] = data['model'].fillna('empty')
    data['gearbox'] = data['gearbox'].fillna(data['gearbox'].mode()[0])
    data['type'] = data['type'].fillna('other')
    data['fuel'] = data['fuel'].fillna('empty')

#     Disabled: without this normalization the metrics were about 1% better
#     (several variations were tried).
#     def map_registration_year(year):
#         if 0 <= year <= 9:
#             return year + 2000
#         elif 9 < year < 100:
#             return year + 1900
#         else:
#             return year
#     data['registration_year'] = data['registration_year'].map(map_registration_year)

    # Numerical columns.
    data['damage'] = data['damage'].fillna(0.0)
    data['insurance_price'] = data['insurance_price'].fillna(0)

    # Engine capacity: per-model mean first, then the overall mean for models
    # that have no known capacity at all.
    model_mean_capacity = data.groupby('model')['engine_capacity'].transform('mean')
    data['engine_capacity'] = data['engine_capacity'].fillna(model_mean_capacity)
    data['engine_capacity'] = data['engine_capacity'].fillna(data['engine_capacity'].mean())

    return data


def extract_features_target(dataframe, target='price'):
    """Split a frame into (all columns except ``target``, the ``target`` column)."""
    # Boolean mask over the column labels: True for everything that is not the target.
    is_feature = dataframe.columns != target
    return dataframe.loc[:, is_feature], dataframe[target]


def align_train_test(train_data, test_data):
    """Return both frames restricted to the columns they have in common."""
    # axis=1 aligns column labels only; join='inner' keeps the intersection.
    aligned = train_data.align(test_data, join='inner', axis=1)
    return aligned[0], aligned[1]


def mape(Y_actual, Y_Predicted):
    """Mean absolute percentage error of ``Y_Predicted`` against ``Y_actual``, in percent."""
    relative_error = (Y_actual - Y_Predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100

def mape_reverse_params(Y_Predicted, Y_actual):
    """Same metric as ``mape`` but with the arguments swapped: predictions first, actuals second."""
    relative_error = (Y_actual - Y_Predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100

In [17]:
# Input CSVs; per its file name the test file presumably has no target column.
train_path = './data/train.csv'
test_path = './data/test_no_target.csv'

In [18]:
# Load and impute both files. Passing 'test' keeps the index column, renamed to "Id";
# the default 'train' mode drops it.
test_data = data_preparation(test_path, 'test')
train_data = data_preparation(train_path)

In [19]:
# Separate the 'price' column (the default target) from the remaining columns.
features, target = extract_features_target(train_data)

In [20]:
# Keep only the columns present in both frames, so the model is trained and
# applied on the same feature set.
final_train, final_test = align_train_test(train_data, test_data)
final_train

Unnamed: 0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,insurance_price
0,2.000000,bus,2006,auto,140,c4,150000,gasoline,citroen,0.0,380.0
1,4.700000,other,2016,manual,0,vito,150000,empty,mercedes_benz,0.0,0.0
2,2.200000,limousine,2010,manual,175,mondeo,125000,diesel,ford,0.0,930.0
3,2.095862,other,2000,auto,265,andere,150000,gasoline,ford,0.0,680.0
4,2.202381,convertible,3,manual,109,2_reihe,150000,gasoline,peugeot,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
49995,1.400000,limousine,2006,manual,75,golf,90000,gasoline,volkswagen,0.0,500.0
49996,1.300000,small car,4,manual,60,fiesta,150000,gasoline,ford,0.0,0.0
49997,2.956522,limousine,1996,manual,150,5er,150000,gasoline,bmw,0.0,130.0
49998,4.400000,limousine,2007,manual,122,1er,100000,diesel,bmw,0.0,500.0


In [21]:
def train_and_predict_cat(train_X, val_X, train_y, val_y, test_results=None, test_df=None):
    """Fit CatBoost on a log1p-transformed target and predict the test frame.

    Parameters
    ----------
    train_X, val_X : pandas.DataFrame
        Training and validation features; object-dtype columns are passed to
        CatBoost as categorical features.
    train_y, val_y : array-like
        Targets on the original scale. ``log1p`` is applied for fitting and
        predictions are mapped back with ``expm1``.
    test_results : list, optional
        List the test predictions are appended to. A fresh list is created
        per call when omitted, so results are never shared between calls
        through a mutable default.
    test_df : pandas.DataFrame, optional
        Frame to predict. Defaults to the notebook-level ``final_test``,
        looked up when the function is called rather than when it is defined.

    Returns
    -------
    numpy.ndarray
        Test predictions on the original scale (the same array that is
        appended to ``test_results``).

    Side effects
    ------------
    Prints the train and validation MAPE (in percent).
    """
    if test_results is None:
        test_results = []
    if test_df is None:
        test_df = final_test

    log_train_y = np.log1p(train_y)

    model = CatBoostRegressor(
        metric_period=100,
        loss_function='MAPE',
        eval_metric="MAPE",
        iterations=3000,
        one_hot_max_size=12,
#         od_type="Iter",
        depth=7,
        l2_leaf_reg=2,
        random_state=random_state
    )

    # Positional indices of the object-dtype (categorical) columns.
    categorical_features_indices = np.where(train_X.dtypes == object)[0]
    model.fit(
        train_X,
        log_train_y,
        cat_features=categorical_features_indices,
        eval_set=(val_X, np.log1p(val_y)),
        use_best_model=True
    )

    # Map predictions back from log space before scoring.
    train_pred = np.expm1(model.predict(train_X))
    val_pred = np.expm1(model.predict(val_X))

    # Score against the untouched targets rather than a log1p/expm1 round-trip.
    train_mape = mape(train_y, train_pred)
    val_mape = mape(val_y, val_pred)

    test_predict = np.expm1(model.predict(test_df))
    test_results.append(test_predict)

    print(f"train mape: {train_mape}")
    print(f"validation mape: {val_mape}")

    return test_predict

In [22]:
# train_X, val_X, train_y, val_y = train_test_split(final_train, target, random_state=random_state)
# test_results = []
# train_and_predict_cat(train_X, val_X, train_y, val_y, test_results)

0:	learn: 0.1089459	test: 0.1086357	best: 0.1086357 (0)	total: 11.8ms	remaining: 35.3s
100:	learn: 0.0351977	test: 0.0354860	best: 0.0354860 (100)	total: 960ms	remaining: 27.6s
200:	learn: 0.0308253	test: 0.0312050	best: 0.0312050 (200)	total: 1.86s	remaining: 25.9s
300:	learn: 0.0296845	test: 0.0302518	best: 0.0302518 (300)	total: 2.77s	remaining: 24.9s
400:	learn: 0.0287932	test: 0.0295474	best: 0.0295474 (400)	total: 3.62s	remaining: 23.5s
500:	learn: 0.0281154	test: 0.0290487	best: 0.0290487 (500)	total: 4.49s	remaining: 22.4s
600:	learn: 0.0275965	test: 0.0287169	best: 0.0287169 (600)	total: 5.35s	remaining: 21.3s
700:	learn: 0.0271746	test: 0.0285032	best: 0.0285032 (700)	total: 6.21s	remaining: 20.4s
800:	learn: 0.0268275	test: 0.0283404	best: 0.0283404 (800)	total: 7.06s	remaining: 19.4s
900:	learn: 0.0265428	test: 0.0282240	best: 0.0282240 (900)	total: 7.91s	remaining: 18.4s
1000:	learn: 0.0262648	test: 0.0281192	best: 0.0281192 (1000)	total: 8.77s	remaining: 17.5s
1100:	learn

In [23]:
# Fold count comes from the config cell instead of a second hard-coded literal.
kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

# One array of test predictions is appended per fold.
test_results = []
# Hold-out split: the folds below are drawn from train_X / train_y only;
# val_X / val_y are not used by the loop itself.
train_X, val_X, train_y, val_y = train_test_split(final_train, target, random_state=random_state)

for train_index, test_index in kf.split(train_X):
    # kf.split yields positional indices, hence .iloc.
    X_train, X_valid = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[test_index]

    train_and_predict_cat(X_train, X_valid, y_train, y_valid, test_results)


0:	learn: 0.1091038	test: 0.1078700	best: 0.1078700 (0)	total: 10.5ms	remaining: 31.5s
100:	learn: 0.0352061	test: 0.0352698	best: 0.0352698 (100)	total: 795ms	remaining: 22.8s
200:	learn: 0.0309178	test: 0.0310844	best: 0.0310844 (200)	total: 1.53s	remaining: 21.3s
300:	learn: 0.0296680	test: 0.0300560	best: 0.0300560 (300)	total: 2.27s	remaining: 20.4s
400:	learn: 0.0287254	test: 0.0293252	best: 0.0293252 (400)	total: 3.08s	remaining: 20s
500:	learn: 0.0280255	test: 0.0288482	best: 0.0288482 (500)	total: 3.88s	remaining: 19.3s
600:	learn: 0.0274608	test: 0.0285271	best: 0.0285271 (600)	total: 4.64s	remaining: 18.5s
700:	learn: 0.0270371	test: 0.0283385	best: 0.0283385 (700)	total: 5.39s	remaining: 17.7s
800:	learn: 0.0266840	test: 0.0281870	best: 0.0281870 (800)	total: 6.19s	remaining: 17s
900:	learn: 0.0263704	test: 0.0280648	best: 0.0280648 (900)	total: 6.96s	remaining: 16.2s
1000:	learn: 0.0260632	test: 0.0279623	best: 0.0279623 (1000)	total: 7.77s	remaining: 15.5s
1100:	learn: 0.

2600:	learn: 0.0234767	test: 0.0277531	best: 0.0277531 (2600)	total: 19.6s	remaining: 3s
2700:	learn: 0.0233914	test: 0.0277442	best: 0.0277442 (2700)	total: 20.3s	remaining: 2.25s
2800:	learn: 0.0233045	test: 0.0277404	best: 0.0277404 (2800)	total: 21s	remaining: 1.49s
2900:	learn: 0.0232251	test: 0.0277392	best: 0.0277392 (2900)	total: 21.7s	remaining: 740ms
2999:	learn: 0.0231488	test: 0.0277332	best: 0.0277332 (2999)	total: 22.4s	remaining: 0us

bestTest = 0.0277332042
bestIteration = 2999

train mape: 19.137989870808294
validation mape: 22.877232169885904
0:	learn: 0.1087549	test: 0.1094187	best: 0.1094187 (0)	total: 9.53ms	remaining: 28.6s
100:	learn: 0.0351829	test: 0.0360087	best: 0.0360087 (100)	total: 851ms	remaining: 24.4s
200:	learn: 0.0307252	test: 0.0314803	best: 0.0314803 (200)	total: 1.55s	remaining: 21.6s
300:	learn: 0.0294786	test: 0.0304375	best: 0.0304375 (300)	total: 2.36s	remaining: 21.2s
400:	learn: 0.0285786	test: 0.0298011	best: 0.0298011 (400)	total: 3.35s	rem

In [27]:
# The model tends to overestimate prices, so the predictions are scaled down by a coefficient.

# val_mape = mape(val_y, np.min(test_results, axis=0) * magic_coef)
# print(f"min validation mape: {val_mape}")

# magic_coef_for_mean = 0.94
# val_mape = mape(val_y, np.mean(test_results, axis=0) * magic_coef_for_mean)
# print(f"mean validation mape: {val_mape}")

In [1880]:
# Ensemble the folds: per test row take the smallest prediction across all
# entries of test_results, then scale it by magic_coef.
test_predict = np.min(test_results, axis=0) * magic_coef
test_predict

array([10800.67997069, 10218.01565019,  6949.71101759, ...,
        1620.51340691, 10750.86142479,   825.56392547])

In [1881]:
# Pair each test row's Id with its final prediction.
result = pd.DataFrame(data={'Id': test_data['Id'], 'Predicted': test_predict }) 
result

Unnamed: 0,Id,Predicted
0,60314,10800.679971
1,12566,10218.015650
2,17760,6949.711018
3,8876,1026.616976
4,80392,7050.308188
...,...,...
49995,93878,1194.064170
49996,99783,3222.862237
49997,57399,1620.513407
49998,97106,10750.861425


In [1882]:
# Write the predictions to disk without the DataFrame index (columns: Id, Predicted).
result.to_csv('predicted.csv', index=False)