In [7]:
## Importing Required Libraries
import subprocess
from collections import Counter

import numpy as np
import pandas as pd

from IPython.display import Image

from sklearn.datasets import make_classification
from sklearn.metrics import log_loss, accuracy_score

# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module no longer exists in current scikit-learn
# releases, so it is only used as a fallback for legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# classifiers
from sklearn.ensemble import GradientBoostingClassifier

# reproducibility: random_state passed to the classifier
seed = 104

In [8]:
#### LOADING DATA ####

def fill_na_with_median(frame):
    """Return a copy of ``frame`` whose NaNs are replaced by their column's median.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to impute; expected to hold numeric columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns. A column whose median is
        itself NaN (no observed values) is left unchanged.
    """
    # DataFrame.median() yields one median per column; fillna() matches that
    # Series to the columns by label, so each column is filled with its own median.
    return frame.fillna(frame.median())


### TRAIN DATA
# '-1' entries in the CSV are parsed as NaN (na_values) so they can be imputed.
train_data_nato_median = fill_na_with_median(pd.read_csv('train.csv', na_values='-1'))
train_data = train_data_nato_median.copy()

### TEST DATA
# Missing values are filled with the column median, same as for the train data.
# NOTE(review): the test set is imputed with its own medians rather than the
# training medians -- confirm this is intended.
test_data_nato_median = fill_na_with_median(pd.read_csv('test.csv', na_values='-1'))
test_data = test_data_nato_median.copy()

# Set the ids aside (removed from test_data) for the submission file.
test_data_id = test_data.pop('id')

In [9]:

# Categorical features are the columns whose name ends in "_cat".
# Matching on the suffix replaces the previous "character 10 is 'c'" test,
# which only worked because of the exact length of the current column names.
column_names = train_data.columns
categorical_column = column_names[column_names.str.endswith('_cat')]

## Changing categorical columns to category data type
def int_to_categorical(data, columns=None):
    """Cast the given columns of ``data`` to the pandas ``category`` dtype, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. The selected columns are overwritten in place.
    columns : iterable of str, optional
        Names of the columns to convert. When omitted, the notebook-level
        ``categorical_column`` is used, which is the original behaviour.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    if columns is None:
        # Resolved at call time so the notebook-level list can be (re)defined
        # after this function.
        columns = categorical_column
    for column in columns:
        data[column] = data[column].astype('category')

In [10]:
## Apply the same dtype conversion to both frames: every column listed in
## categorical_column becomes a pandas categorical (the frames are modified in place)
datas = [train_data, test_data]

for frame in datas:
    int_to_categorical(frame)

# Show the resulting dtypes of the test frame as the cell output
test_data.dtypes

ps_ind_01            int64
ps_ind_02_cat     category
ps_ind_03            int64
ps_ind_04_cat     category
ps_ind_05_cat     category
ps_ind_06_bin        int64
ps_ind_07_bin        int64
ps_ind_08_bin        int64
ps_ind_09_bin        int64
ps_ind_10_bin        int64
ps_ind_11_bin        int64
ps_ind_12_bin        int64
ps_ind_13_bin        int64
ps_ind_14            int64
ps_ind_15            int64
ps_ind_16_bin        int64
ps_ind_17_bin        int64
ps_ind_18_bin        int64
ps_reg_01          float64
ps_reg_02          float64
ps_reg_03          float64
ps_car_01_cat     category
ps_car_02_cat     category
ps_car_03_cat     category
ps_car_04_cat     category
ps_car_05_cat     category
ps_car_06_cat     category
ps_car_07_cat     category
ps_car_08_cat     category
ps_car_09_cat     category
ps_car_10_cat     category
ps_car_11_cat     category
ps_car_11          float64
ps_car_12          float64
ps_car_13          float64
ps_car_14          float64
ps_car_15          float64
p

In [11]:
### CREATING DUMMIES FOR CATEGORICAL VARIABLES
# One get_dummies call per frame replaces the per-column concat/drop loop:
# the encoded columns are removed and their indicator columns (named
# "<column>_<category>") are appended in the same order, without copying
# the whole frame once per categorical column.
encoded_columns = list(categorical_column)
train_data = pd.get_dummies(train_data, columns=encoded_columns)
test_data = pd.get_dummies(test_data, columns=encoded_columns)

# Give the test frame exactly the training feature layout (every training column
# except id/target, in training order). A category seen only in train becomes an
# all-zero column in test; a category seen only in test is dropped, since the
# model has no feature for it. When both frames already agree this is a no-op.
feature_columns = train_data.columns.drop(['id', 'target'], errors='ignore')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

print(train_data.shape)
print(test_data.shape)

(595212, 220)
(892816, 218)


In [12]:
#Define covariates in X and dependent variable in y
# The non-feature columns are removed by name instead of by position
# (previously iloc[:, 2:]), so the selection no longer depends on id and
# target being the first two columns of the frame.
X = train_data.drop(['id', 'target'], axis=1) ## FEATURE DATA
y = train_data['target'] ### LABEL DATA

### CHECKING DIMENSIONS
print(X.shape)
print(y.shape)

(595212, 218)
(595212,)


In [13]:
#### SPLITTING DATA INTO TRAIN AND TEST SETS
# 20% of the labelled rows are held out for evaluation; random_state fixes the
# shuffle so the split is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)
X_train, X_test, y_train, y_test = split_parts

In [14]:
## Label's Distribution
# Class counts for each side of the split, printed under a heading.
label_sets = (
    ("Train label distribution:", y_train),
    ("\nTest label distribution:", y_test),
)

for heading, labels in label_sets:
    print(heading)
    print(Counter(labels))

Train label distribution:
Counter({0: 458844, 1: 17325})

Test label distribution:
Counter({0: 114674, 1: 4369})


In [22]:
## Gradient Boosting Machine Learning Model
# Hyper-parameters are gathered in one place, then the classifier is fitted on
# the training split.
gbc_params = dict(
    max_depth=5,
    n_estimators=500,
    warm_start=True,
    random_state=seed,
)
gbc = GradientBoostingClassifier(**gbc_params)
gbc.fit(X_train, y_train)

# Predictions on the held-out split: per-class probabilities and hard labels
gbc_y_pred_prob = gbc.predict_proba(X_test)
gbc_y_pred = gbc.predict(X_test)

# Evaluation: log loss uses the probabilities, accuracy uses the hard labels
gbc_logloss = log_loss(y_test, gbc_y_pred_prob)
gbc_accuracy = accuracy_score(y_test, gbc_y_pred)

report_lines = (
    "== Gradient Boosting ==",
    "Accuracy: {0:.2f}".format(gbc_accuracy),
    "Log loss: {0:.2f}".format(gbc_logloss),
)
for report_line in report_lines:
    print(report_line)

== Gradient Boosting ==
Accuracy: 0.96
Log loss: 0.15


In [24]:
# Compare the first five held-out labels with the model's output for them.
# The previous `[:5,]` form passes a 1-tuple as the key: NumPy arrays tolerate
# it, but tuple keys are not reliably accepted by pandas Series indexing across
# versions. `.iloc[:5]` is explicitly positional for the Series, and a plain
# `[:5]` slice is used for the arrays.
print('True labels:')
print(y_test.iloc[:5])
print('\nPredicted labels:')
print(gbc_y_pred[:5])
print('\nPredicted probabilities:')
print(gbc_y_pred_prob[:5])

True labels:
240034    0
122822    0
242279    0
594193    0
146126    0
Name: target, dtype: int64

Predicted labels:
[0 0 0 0 0]

Predicted probabilities:
[[ 0.97341166  0.02658834]
 [ 0.96292129  0.03707871]
 [ 0.97509053  0.02490947]
 [ 0.98899783  0.01100217]
 [ 0.97871356  0.02128644]]


In [25]:
#### Prediction on test data ####
# Class probabilities from the fitted gradient boosting model (gbc); the "RF"
# in the variable name is kept only so existing references keep working.
y_pred_RF_prob = gbc.predict_proba(test_data)
pred_values = pd.DataFrame(y_pred_RF_prob)
print(y_pred_RF_prob[1:5])

# Build the submission from plain arrays so ids and probabilities are paired
# by position. The previous DataFrame-to-column assignment relied on both
# objects happening to share the same index.
# Column 1 of predict_proba is the probability of the second class (target == 1).
submission_simple_gradientboost = pd.DataFrame({
    'id': test_data_id.values,
    'target': y_pred_RF_prob[:, 1],
}).set_index('id')

## Write to CSV
submission_simple_gradientboost.to_csv("Simple Gradient Boost.csv")

# Preview as the cell output (a bare expression only displays when it is last).
submission_simple_gradientboost.head()

[[ 0.96990562  0.03009438]
 [ 0.97448856  0.02551144]
 [ 0.98523337  0.01476663]
 [ 0.95811327  0.04188673]]
