In [1]:
import pandas as pd
import numpy as np
import random
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error



In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
#Root Mean Square Logrithmic Error
def rmsle(y, y_):
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    
    return np.sqrt(np.mean(calc))

In [4]:
def gini(list_of_values):
    sorted_list = sorted(list(list_of_values))
    height, area = 0, 0
  
    for value in sorted_list:
        height += value
        area += height - value / 2.
    
    fair_area = height * len(list_of_values) / 2
    return (fair_area - area) / fair_area


def normalized_gini(y_pred, y):
    normalized_gini = gini(y_pred)/gini(y)
    return normalized_gini
    

predicted_y = np.random.randint(100, size = 1000)
desired_y = np.random.randint(100, size = 1000)

print (normalized_gini(predicted_y, desired_y))

1.02732375298


In [5]:
# Load Training Data
df_train = pd.read_csv('train.csv', index_col = 'id')
print(df_train.shape)
df_train.head()

(595212, 58)


Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [6]:
# Load Test Data
df_test = pd.read_csv('test.csv', index_col = 'id')
print(df_test.shape)
df_test.head()

(892816, 57)


Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,8,1,0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2,5,1,0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1,3,0,0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1


In [7]:
# Split dataframe by rows into n roughly equal portions and return list of them.
def splitDf(df, n) :
    splitPoints = list(map( lambda x: int(x*len(df)/n), (list(range(1,n)))))     
    splits = list(np.split(df.sample(frac=1), splitPoints))
    return splits

In [8]:
# Take splits from splitDf, and return into test set (splits[index]) and training set (the rest)
def makeTrainAndTest(splits, index) :
   # index is zero based, so range 0-9 for 10 fold split
   test = splits[index]

   leftLst = splits[:index]
   rightLst = splits[index+1:]

   train = pd.concat(leftLst+rightLst)

   return train, test

In [9]:
n = 10
splits = splitDf(df_train, n)
trainTest = []
for i in range(0,n) :
     trainTest.append(makeTrainAndTest(splits, i))

In [10]:
predictions = pd.DataFrame()
prediction = pd.DataFrame()

for i in range(0,n):
    # Split the Train DataSet into X and y
    X_train = trainTest[i][0].drop('target', axis=1) #axis=1 denotes that we are referring to a column, not a row
    y_train = trainTest[i][0].target
    X_validation=trainTest[i][1].drop('target',axis=1)
    y_validation=trainTest[i][1].target
    
    # Train Linear Regression model
    
    model_xgb = XGBRegressor()
    model_xgb.fit(X_train, y_train)
    
    # Generate Metrics on Validation Set
    y_prediction = model_xgb.predict(X_validation)
    rmsle_val = rmsle(y_validation, y_prediction)
    rmse_val = mean_squared_error(y_validation, y_prediction)**0.5
    normalized_gini_val = normalized_gini(y_prediction, y_validation)

    print('Validation Metrics')
    print('Normalized gini:', normalized_gini_val)
    print('Root Mean Squared Logarithmic Error:', rmsle_val)
    print('Root Mean Squared Error:', rmse_val)
    
    # Predict on the Test Dataset
    X_test = df_test
    y_test = model_xgb.predict(X_test)
    
    # Build the Submission Dataset
    predictions['iteration']=str(i+1)
    predictions['id'] = X_test.index
    predictions['target'] = y_test.tolist()
    prediction = prediction.append(predictions)

    print(predictions.shape)
    print(predictions.head())
    
    print(prediction.shape)
    print(prediction.head())





Validation Metrics
Normalized gini: 0.252885842572
Root Mean Squared Logarithmic Error: 0.12925234796
Root Mean Squared Error: 0.185815174344
(892816, 3)
  iteration  id    target
0       NaN   0  0.025783
1       NaN   1  0.028892
2       NaN   2  0.027697
3       NaN   3  0.014697
4       NaN   4  0.035867
(892816, 3)
  iteration  id    target
0       NaN   0  0.025783
1       NaN   1  0.028892
2       NaN   2  0.027697
3       NaN   3  0.014697
4       NaN   4  0.035867
Validation Metrics
Normalized gini: 0.251465552671
Root Mean Squared Logarithmic Error: 0.128200333496
Root Mean Squared Error: 0.184241343223
(892816, 3)
  iteration  id    target
0         2   0  0.026991
1         2   1  0.030089
2         2   2  0.027472
3         2   3  0.014866
4         2   4  0.033602
(1785632, 3)
  iteration  id    target
0       NaN   0  0.025783
1       NaN   1  0.028892
2       NaN   2  0.027697
3       NaN   3  0.014697
4       NaN   4  0.035867
Validation Metrics
Normalized gini: 0.2553

In [11]:
pd.crosstab(index=prediction['iteration'],columns="count")

col_0,count
iteration,Unnamed: 1_level_1
10,892816
2,892816
3,892816
4,892816
5,892816
6,892816
7,892816
8,892816
9,892816


In [25]:
prediction.shape

(8928160, 3)

In [26]:
prediction_kfold=prediction[['id','target']]

In [27]:
prediction_kfold.shape

(8928160, 2)

In [28]:
prediction_kfold.head

<bound method NDFrame.head of              id    target
0             0  0.025783
1             1  0.028892
2             2  0.027697
3             3  0.014697
4             4  0.035867
5             5  0.038002
6             6  0.021037
7             8  0.045813
8            10  0.060317
9            11  0.057990
10           12  0.029049
11           14  0.025430
12           15  0.048573
13           18  0.047909
14           21  0.042418
15           23  0.023887
16           24  0.022978
17           25  0.048013
18           27  0.016977
19           29  0.050812
20           30  0.036132
21           31  0.045225
22           32  0.054212
23           33  0.014394
24           37  0.026882
25           38  0.031734
26           39  0.071955
27           40  0.038063
28           41  0.030783
29           42  0.020003
...         ...       ...
892786  1487982  0.017657
892787  1487984  0.040943
892788  1487985  0.032136
892789  1487986  0.033691
892790  1487987  0.035782
892791  

In [34]:
result=prediction_kfold.groupby(['id'])['target'].mean().reset_index()

In [35]:
result.shape

(892816, 2)

In [36]:
# Save Output
import time
submission_path = 'submission_' + str(time.time()) + '.csv'
result.to_csv(submission_path, index=False)