In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
import zipcode
import scipy
import seaborn as sns
%matplotlib inline

In [2]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """
    Use of this metric is not recommended; for illustration only. 
    See other regression metrics on sklearn docs:
      http://scikit-learn.org/stable/modules/classes.html#regression-metrics
    Use like any other metric
    >>> y_true = [3, -0.5, 2, 7]; y_pred = [2.5, -0.3, 2, 8]
    >>> mean_absolute_percentage_error(y_true, y_pred)
    Out[]: 24.791666666666668
    """

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def mape(preds, dtrain): 
    
    labels = dtrain.get_label()
    
    return 'mape', np.mean(np.abs((labels - preds) / labels)) * 100
    #return 'mape', 25

In [4]:
from sklearn.metrics import make_scorer

In [5]:
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [6]:
train = pd.read_csv('Fr_train.csv', sep=';')
test = pd.read_csv('Fr_test.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
#test[test.marque.isin(list(set(test.marque.unique()).difference(set(train.marque.unique()))))].marque='no_in_train'

In [8]:
#train[train.marque.isin(list(set(train.marque.unique()).difference(set(test.marque.unique()))))].marque = 'not_in_test'

In [9]:
target = train.prime_tot_ttc
train = train.drop(['prime_tot_ttc'],axis=1)

In [10]:
data = pd.concat([train, test])

In [11]:
data = data.reset_index()

In [12]:
data['region_3'] = data.codepostal.apply(lambda x: str(x)[:3])
data['region_2'] = data.codepostal.apply(lambda x: str(x)[:2])
data['exp'] = - data.annee_permis + 2016

In [13]:
post_city  = []
post_state = []
post_wage = []
post_population = []
post_tax = []
post_decom = []

for i in data.codepostal.values:
    
    if len(str(i)) == 4:
        i = '0' + str(i)
    
    try:
        state = zipcode.isequal(str(i)).state
        post_state.append(state)
    except:
        post_state.append('nothing')
        
    try:
        city = zipcode.isequal(str(i)).city
        post_city.append(city)
    except:
        post_city.append('nothing')
        
    try:
        wage = int(zipcode.isequal(str(i)).wages)
        post_wage.append(wage)
    except:
        post_wage.append(0)
        
    try:
        pop = int(zipcode.isequal(str(i)).population)
        post_population.append(pop)
    except:
        post_population.append(0)
        
    try:
        tax = int(zipcode.isequal(str(i)).tax_returns_filed)
        post_tax.append(tax)
    except:
        post_tax.append(0)
        
    try:
        decom = int(zipcode.isequal(str(i)).decommisioned)
        post_decom.append(decom)
    except:
        post_decom.append(0)

In [14]:
#data['city'] = post_city
data['state'] = post_state
data['wage'] = post_wage
data['population'] = post_population
data['tax'] = post_tax

In [15]:
data.head()

Unnamed: 0,index,id,annee_naissance,annee_permis,marque,puis_fiscale,anc_veh,codepostal,energie_veh,kmage_annuel,...,var20,var21,var22,region_3,region_2,exp,state,wage,population,tax
0,0,1,1986.0,2006.0,RENAULT,4,1,1034,gpl,2924,...,0,0,1,103,10,10.0,MA,45589776,1787,985
1,1,2,1986.0,2006.0,RENAULT,8,2,1034,gpl,11580,...,0,0,1,103,10,10.0,MA,45589776,1787,985
2,2,3,1982.0,2001.0,RENAULT,7,2,1034,gpl,7149,...,0,0,1,103,10,15.0,MA,45589776,1787,985
3,3,4,1987.0,2006.0,DACIA,5,2,1034,gpl,6526,...,0,0,1,103,10,10.0,MA,45589776,1787,985
4,4,5,1994.0,2013.0,CITROEN,4,2,1034,gpl,2872,...,1,1,4,103,10,3.0,MA,45589776,1787,985


In [16]:
ids = data.id
data = data.drop(['codepostal', 'index','id'], axis=1)

In [17]:
data = data.fillna(data.median())

In [18]:
data_dummy = pd.get_dummies(data)

In [19]:
data_dummy.head(2)

Unnamed: 0,annee_naissance,annee_permis,puis_fiscale,anc_veh,kmage_annuel,crm,var1,var2,var3,var4,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WI,state_WV,state_WY,state_nothing
0,1986.0,2006.0,4,1,2924,68,10.0,1,372,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1986.0,2006.0,8,2,11580,50,10.0,1,372,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train = data_dummy[:len(train)]
X_test = data_dummy[len(train):]

In [21]:
print X_train.shape, X_test.shape
print len(train), len(test)

(300000, 975) (30000, 975)
300000 30000


In [22]:
#test_dummy = test_dummy.drop(list(set(test_dummy.columns).difference(set(train_dummy.columns))),axis=1)
#train_dummy = train_dummy.drop(list(set(train_dummy.columns).difference(set(test_dummy.columns))),axis=1)

In [23]:
#columns = []
#for i in train.select_dtypes(exclude=['object']).columns:
#    for j in test.select_dtypes(exclude=['object']).columns:
#        if (i==j):
#            columns.append(i)

In [31]:
def run_xgb(train, test, target, eta=0.02, max_depth=3, random_state=10):
    
    #eta = 0.1
    #max_depth = 3
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "reg:linear",
        "booster" : "gbtree",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "alpha": 0.25,
        "seed": random_state,
    }
    num_boost_round = 1000
    early_stopping_rounds = 50
    test_size = 0.15

    X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    #dtrain = xgb.DMatrix(X_train[features], label = y_train)
    #dvalid = xgb.DMatrix(X_valid[features], label = y_valid)
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    
    gbm = xgb.XGBRegressor(params)
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True, feval=mape)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid), ntree_limit=gbm.best_iteration)
    score = mean_absolute_percentage_error(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

In [32]:
#for depth in range(2,10,1):
#    pred, score = run_xgb(X_train, X_test, target, max_depth = depth)
#    print score

In [33]:
pred, score = run_xgb(X_train, X_test, target, max_depth = 15)

XGBoost params. ETA: 0.02, MAX_DEPTH: 15, SUBSAMPLE: 0.7, COLSAMPLE_BY_TREE: 0.7
('Length train:', 255000)
('Length valid:', 45000)


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-mape:97.805661	eval-mape:97.803938
[1]	train-mape:95.812923	eval-mape:95.809460
[2]	train-mape:93.855554	eval-mape:93.852282
[3]	train-mape:91.939229	eval-mape:91.935509
[4]	train-mape:90.065181	eval-mape:90.059930
[5]	train-mape:88.226205	eval-mape:88.220918
[6]	train-mape:86.425859	eval-mape:86.418933
[7]	train-mape:84.662664	eval-mape:84.654427
[8]	train-mape:82.935810	eval-mape:82.926416
[9]	train-mape:81.241751	eval-mape:81.231666
[10]	train-mape:79.504871	eval-mape:79.487240
[11]	train-mape:77.877539	eval-mape:77.859712
[12]	train-mape:76.284993	eval-mape:76.265615
[13]	train-mape:74.661171	eval-mape:74.631947
[14]	train-mape:73.136389	eval-mape:73.105580
[15]	train-mape:71.639484	eval-mape:71.606249
[16]	train-mape:70.112121	eval-mape:70.070052
[17]	train-mape:68.679035	eval-mape:68.634707
[18]	train-mape:67.271382	eval-mape:67.225033
[19]	train-mape:65.894842	eval-mape:65.846348
[20]	train-mape:64.535677	eval-

Validating...
Predict test set...
Training time: 5.91 minutes


In [34]:
result = pd.DataFrame([ids.values[len(train):],pred])
result = result.transpose()
cols = ['ID','COTIS']
result.columns = cols
result.ID = result.ID.astype(int)

In [35]:
#result.COTIS[result.region=='10'] = result.COTIS[result.region=='10'] * 1.1

In [36]:
result.head()

Unnamed: 0,ID,COTIS
0,300001,311.697113
1,300002,228.062668
2,300003,264.065125
3,300004,336.995605
4,300005,321.909851


In [37]:
result[['ID','COTIS']].to_csv('result_' + str(score) + '.csv',sep=';',index_label=False, index=False)