In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.cross_validation import train_test_split as tts
from sklearn.model_selection import GridSearchCV as cv
import matplotlib.pyplot as plt



In [2]:
# Read both input files. The test frame is given an all-NaN 'loss' column so
# that it has the same columns as train and its rows can be recognised again
# once the two frames are stacked.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
test = test.assign(loss=np.nan)
# Stack train on top of test; the original row labels are kept, not renumbered.
joined = pd.concat([train, test], axis=0)


def evalerror(preds, dtrain):
    """Mean absolute error measured after undoing the log transform.

    Parameters
    ----------
    preds : array-like
        Predictions on the log scale.
    dtrain : object
        Anything exposing ``get_label()`` that returns the log-scale targets
        (presumably an xgboost ``DMatrix`` -- not confirmed by this notebook).

    Returns
    -------
    tuple
        ``('mae', value)`` where ``value`` is the mean of
        ``|exp(preds) - exp(labels)|``.
    """
    labels = dtrain.get_label()
    # Computed with numpy directly: the notebook never imports
    # mean_absolute_error, so calling it here raised a NameError.
    error = np.mean(np.abs(np.exp(preds) - np.exp(labels)))
    return 'mae', error

# Integer-encode every object-dtype column on the stacked frame so that train
# and test share one coding per column.
for column in list(train.select_dtypes(include=['object']).columns):
    # Levels that occur in only one of the two files are blanked out before
    # encoding. The level *sets* are compared rather than their sizes, so a
    # column where both files have the same number of levels but different
    # ones is filtered as well.
    remove = set(train[column].unique()) ^ set(test[column].unique())
    if remove:
        # Vectorised replacement for the former element-wise apply; mask()
        # puts NaN wherever the condition is True.
        joined[column] = joined[column].mask(joined[column].isin(remove))

    # sort=True assigns codes in sorted level order; missing values get -1.
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]

# Undo the stacking: rows with a real 'loss' are training rows, rows whose
# 'loss' is still NaN are the ones that came from the test file.
has_loss = joined['loss'].notnull()
train = joined[has_loss]
test = joined[~has_loss]

In [3]:
# Offset added to the target before taking the log; the prediction step
# subtracts the same value again after exponentiating.
shift = 200
# The models are fitted on log(loss + shift) rather than on the raw loss.
y = np.log(train['loss'] + shift)
# Kept aside for the submission file.
ids = test['id']
# 'loss' is the target and 'id' is only needed for the submission, so neither
# goes into the feature matrices. axis is passed by keyword because pandas
# deprecated the positional form in 1.3 and removed it in 2.0.
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)

In [4]:
# Preview the top of the feature matrix (head() shows five rows by default).
X.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0,1,0,1,0,0,0,0,1,0,...,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843
1,0,1,0,0,0,0,0,0,1,1,...,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496
2,0,1,0,0,1,0,0,0,1,1,...,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425
3,1,1,0,1,0,0,0,0,1,0,...,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642
4,0,1,0,1,0,0,0,0,1,1,...,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606


In [5]:
# Baseline gradient-boosting fit on every column. Its feature importances are
# what the following cell uses to pick the columns worth keeping.
n_estimators = 100
learning_rate = 0.1
# Pin the estimator's random number generator so the fitted trees -- and the
# importances read from them -- are the same on every run.
random_state = 0
model = gbr(n_estimators=n_estimators, learning_rate=learning_rate,
            random_state=random_state, verbose=1)
model = model.fit(X, y)

      Iter       Train Loss   Remaining Time 
         1           0.5003            2.88m
         2           0.4717            2.77m
         3           0.4484            2.73m
         4           0.4293            2.72m
         5           0.4131            2.70m
         6           0.3997            2.81m
         7           0.3882            2.76m
         8           0.3785            2.72m
         9           0.3701            2.65m
        10           0.3626            2.58m
        20           0.3185            2.25m
        30           0.2961            1.87m
        40           0.2812            1.57m
        50           0.2711            1.28m
        60           0.2640            1.01m
        70           0.2584           46.19s
        80           0.2542           30.80s
        90           0.2513           15.30s
       100           0.2490            0.00s


In [7]:
# Label each importance of the fitted baseline model with its column name,
# then keep the columns whose importance is strictly above 0.001.
feature_feedback = pd.Series(data=model.feature_importances_, index=X.columns)
is_informative = feature_feedback.gt(0.001)
features = feature_feedback.index[is_informative.values]
# Number of columns that survive the cut.
len(features)

53

In [None]:
# Larger, slower-learning model fitted only on the selected feature columns.
# NOTE(review): the trailing "#n" comments look like alternative values that
# were tried earlier -- unconfirmed.
n_estimators = 800
learning_rate = 0.01
max_depth = 9 #7
max_features = 17 #11
min_samples_split = 6 #6
# max_features < n_features makes every split consider a random subset of
# columns, so the seed is pinned to make the fit repeatable.
random_state = 0
model2 = gbr(n_estimators=n_estimators, learning_rate=learning_rate,
             max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split,
             random_state=random_state, verbose = 1)
model2 = model2.fit(X[features],y)

      Iter       Train Loss   Remaining Time 
         1           0.5303           53.97m
         2           0.5254           54.45m
         3           0.5208           53.15m
         4           0.5161           52.63m
         5           0.5118           53.09m
         6           0.5073           52.55m
         7           0.5030           51.77m
         8           0.4987           51.37m
         9           0.4945           51.35m
        10           0.4904           51.72m
        20           0.4528           49.02m
        30           0.4217           48.74m
        40           0.3951           47.89m
        50           0.3728           46.84m
        60           0.3541           45.89m
        70           0.3378           45.15m
        80           0.3238           44.71m
        90           0.3121           43.93m


In [None]:
# Invert the target transform: the model was fitted on log(loss + shift).
prediction = np.exp(model2.predict(X_test[features])) - shift
submission = pd.DataFrame()
submission['loss'] = prediction
# Assign the ids by position. `ids` still carries the row labels of the
# stacked frame; assigning the Series itself would align those labels against
# submission's fresh 0..n-1 index and give NaN ids wherever they do not match.
submission['id'] = ids.values
submission.to_csv('sub_v.csv', index=False)