In [1]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing

train=pd.read_csv('train.csv', index_col=0)
test=pd.read_csv('test.csv', index_col=0)
# label encode the categorical variables
for i in range(train.shape[1]):
    if train.iloc[:,i].dtype=='O':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train.iloc[:,i]) + list(test.iloc[:,i-1]))
        train.iloc[:,i] = lbl.transform(train.iloc[:,i])
        test.iloc[:,i-1] = lbl.transform(test.iloc[:,i-1])

predictors=train.columns.values.tolist()[1:]

In [2]:
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort true values on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

In [3]:
from sklearn.decomposition import PCA
pca = PCA(n_components=16)
pca.fit(train[predictors])
np.cumsum(pca.explained_variance_ratio_)

array([ 0.64052477,  0.70485142,  0.7564948 ,  0.79856712,  0.83778807,
        0.86766186,  0.89369798,  0.91860659,  0.94218318,  0.95708022,
        0.96476384,  0.97198989,  0.9779114 ,  0.98241782,  0.98656103,
        0.98997094])

In [11]:
train_pca=pd.DataFrame(pca.transform(train[predictors]))
train_pca['Hazard']=np.array(train.iloc[:,0])

In [18]:
train_pca.iloc[:,0:16].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,Hazard
0,22.011624,2.197549,11.270368,-0.560723,-3.8572,4.611379,-5.38925,-0.374177,0.68921,0.140596,0.122384,1.126142,1.305294,-0.038265,1.539655,-1.044174,1
1,-20.554059,-4.985823,3.950644,-4.083574,11.507768,5.285352,-7.959067,4.183133,-4.442317,-4.939957,0.019306,5.375269,1.012978,0.344369,0.571722,1.589332,4
2,-13.485705,-0.326685,-5.549856,7.557988,-3.988328,3.135895,2.052321,0.086258,8.273231,-4.989459,-2.28117,2.874825,2.547815,-0.141972,1.439112,1.932318,1
3,-13.891866,13.612374,-10.400625,4.098095,8.260687,2.465724,-4.608547,-0.757423,-1.700104,4.058913,-1.559664,-0.296086,1.265823,-0.130142,1.531511,1.515075,1
4,-17.539788,10.960154,-3.601321,-2.572278,11.733213,-0.222843,0.614663,3.460032,-4.524146,-0.049827,1.227205,2.752891,1.260994,0.553635,2.64066,1.285126,1


In [19]:
from sklearn.cross_validation import train_test_split
pca_train,pca_val=train_test_split(train_pca,test_size=0.33)
from sklearn.ensemble import RandomForestRegressor
alg=RandomForestRegressor(n_estimators=100, max_features = 'sqrt', oob_score=True, verbose = 1)
alg.fit(pca_train.iloc[:,0:16],pca_train['Hazard'])

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:   20.2s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   40.5s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=1, warm_start=False)

In [20]:
predictions=alg.predict(pca_val.iloc[:,0:16])

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.0s finished


In [21]:
Gini(pca_val['Hazard'],predictions)

0.2576399915221948

In [23]:
test_pca=pd.DataFrame(pca.transform(test[predictors]))
predictions=alg.predict(test_pca)
submission = pd.DataFrame({
        "Id": test.index,
        "Hazard": predictions
    })

submission.to_csv("submission_pca_rf.csv", index=False, columns=['Id','Hazard'])

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.7s finished
