In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
import sys
import networkx as nx
import os
import warnings
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Experiment Graph
from execution_environment import ExecutionEnvironment as ee

# Suppress warnings
warnings.filterwarnings('ignore')
% matplotlib inline


# Resolve `graphviz_layout` from whichever Graphviz binding is installed:
# PyGraphviz is tried first and pydot second. If neither can be imported the
# cell fails with an explicit ImportError, because plot_graph() below needs it.
try:
    import pygraphviz
    from networkx.drawing.nx_agraph import graphviz_layout
except ImportError:
    try:
        import pydot
        from networkx.drawing.nx_pydot import graphviz_layout
    except ImportError:
        raise ImportError("This example needs Graphviz and either "
                          "PyGraphviz or pydot")
        
def plot_graph():
    """Draw the experiment graph held by the execution environment.

    Node colours:
        green  - the node's ``root`` attribute is truthy
        red    - ``type`` is 'Dataset' or 'Feature'
        blue   - ``type`` is 'Agg' or 'SK_Model'
        grey   - ``type`` is 'SuperNode'
        black  - anything else

    Each edge is labelled with its ``name`` attribute. The layout comes from
    Graphviz's ``twopi`` program and the figure is shown immediately.
    """

    def _colour_of(attrs):
        # The root flag takes precedence over the node type.
        if attrs['root']:
            return 'green'
        if attrs['type'] in ('Dataset', 'Feature'):
            return 'red'
        if attrs['type'] in ('Agg', 'SK_Model'):
            return 'blue'
        if attrs['type'] == 'SuperNode':
            return 'grey'
        return 'black'

    plt.figure(figsize=(12, 12))
    graph = ee.graph.graph
    layout = graphviz_layout(graph, prog='twopi', args='')

    # One colour per node, in the graph's node iteration order.
    node_colours = [_colour_of(attrs) for _, attrs in graph.nodes(data=True)]
    nx.draw(graph, pos=layout, node_color=node_colours, node_size=100)

    edge_names = {(src, dst): attrs["name"]
                  for src, dst, attrs in graph.edges(data=True)}
    nx.draw_networkx_edge_labels(graph, pos=layout, edge_labels=edge_names)
    plt.show()

In [2]:
# Relative path of the directory that holds the competition CSV files.
ROOT_DIRECTORY = '../data/home-credit-default-risk'
# List the directory contents to confirm the expected files are present.
print(os.listdir(ROOT_DIRECTORY))

['application_test.csv', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']


In [3]:
# Load the training applications through the execution environment (`ee`).
app_train = ee.load(ROOT_DIRECTORY + '/application_train.csv')
# Optional sanity checks, left disabled:
#print('Training data shape: ', app_train.shape().get())
#app_train.head().get()

In [4]:
# Load the test applications through the execution environment (`ee`).
app_test = ee.load(ROOT_DIRECTORY + '/application_test.csv')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Prefer
# the new class and fall back to the old one on older installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Columns used by this baseline model (same selection for train and test).
BASELINE_COLUMNS = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'FLAG_DOCUMENT_18']

# Median imputation followed by a logistic regression with the specified
# regularization parameter (a small C means strong regularization).
sk_imputer = Imputer(strategy = 'median')
sk_log_reg = LogisticRegression(C = 0.0001)

train = app_train[BASELINE_COLUMNS]
test = app_test[BASELINE_COLUMNS]

# Fit the imputer on the training data only
imputer = train.fit_sk_model(sk_imputer)

# Transform both training and testing data with the fitted imputer
train = imputer.transform(train)
test = imputer.transform(test)

train_labels = app_train['TARGET']
# Train on the imputed training data
log_reg = train.fit_sk_model_with_labels(sk_log_reg, train_labels)

In [None]:
# Predict class probabilities for the test data with the fitted logistic regression.
pred = log_reg.predict_proba(test)

In [None]:
# Fetch the predictions (`.get()` presumably materialises the result) and display them.
pred.get()[:]

In [5]:
# Combine the SK_ID_CURR and TARGET columns of the training data.
temp = app_train['SK_ID_CURR'].concat(app_train['TARGET'])

In [12]:
# Take a 100-row head of the training data; note this overwrites the earlier `temp`.
temp = app_train.head(100)

In [15]:
from sklearn.model_selection import KFold

# Five shuffled folds with a fixed seed so the split is reproducible.
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 50)

# Show the train / validation row indices of every fold. `print(x)` with a
# single argument prints the same thing on Python 2 and Python 3, whereas the
# statement form `print x` is a syntax error on Python 3.
for train_indices, valid_indices in k_fold.split(temp.get()):
    print(train_indices)
    print(valid_indices)

[ 0  1  2  4  5  6  7  9 10 11 12 13 14 19 20 21 22 23 24 26 27 28 29 30 31
 32 33 35 36 37 41 42 43 44 45 46 48 49 51 54 55 56 58 59 60 61 62 63 64 65
 66 67 68 70 71 72 73 74 75 76 77 78 81 82 83 84 85 86 88 89 90 91 92 93 94
 95 96 97 98 99]
[ 3  8 15 16 17 18 25 34 38 39 40 47 50 52 53 57 69 79 80 87]
[ 0  1  2  3  4  5  6  7  8  9 10 11 13 15 16 17 18 19 20 22 24 25 26 27 28
 29 30 31 33 34 35 36 38 39 40 41 43 45 46 47 48 49 50 51 52 53 55 56 57 60
 61 63 64 65 66 67 68 69 70 71 73 74 75 77 78 79 80 81 83 84 85 87 88 89 92
 93 94 96 97 99]
[12 14 21 23 32 37 42 44 54 58 59 62 72 76 82 86 90 91 95 98]
[ 0  2  3  4  5  6  8 10 11 12 14 15 16 17 18 19 20 21 22 23 25 26 29 30 31
 32 33 34 35 37 38 39 40 41 42 43 44 47 48 49 50 51 52 53 54 57 58 59 60 62
 63 64 66 67 68 69 70 71 72 73 74 76 77 78 79 80 82 83 84 85 86 87 89 90 91
 93 94 95 96 98]
[ 1  7  9 13 24 27 28 36 45 46 55 56 61 65 75 81 88 92 97 99]
[ 1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 20 21 22 23 24 25 26 27
 28

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

def model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train a light gradient boosting model on the training features and
    predict probabilities for the test features.
    
    Parameters
    --------
        features: 
            training dataset as returned by the execution environment
            (e.g. ``ee.load``). Must include the SK_ID_CURR and TARGET columns.
        test_features: 
            testing dataset of the same kind, used for making predictions
            with the model. Must include the SK_ID_CURR column.
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding.
        n_folds (int, default = 5): 
            currently unused (no cross validation is performed); kept so
            existing callers that pass it keep working.
        
    Return
    --------
        submission: 
            the test `SK_ID_CURR` column concatenated with the `TARGET`
            probabilities predicted by the model.
        feature_importances: 
            the feature importances reported by the fitted model.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.
        
    """
    
    # Extract the test ids for the submission
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = features.onehot_encode()
        test_features = test_features.onehot_encode()
        
        # Column lists are fetched once instead of on every loop iteration.
        train_columns = list(features.get().columns)
        test_columns = list(test_features.get().columns)
        train_column_set = set(train_columns)
        test_column_set = set(test_columns)

        # Align the dataframes by the columns: keep only columns present in
        # both sets, so a category seen in just one of them cannot leave the
        # two feature matrices with different widths.
        for c in train_columns:
            if c not in test_column_set:
                features = features.drop(c)
        for c in test_columns:
            if c not in train_column_set:
                test_features = test_features.drop(c)
                
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Names of the categorical (object dtype) columns to encode
        categorical_columns = list(features.select_dtypes('object').get().columns)

        for col in categorical_columns:
            # A fresh encoder per column, so that one column's fitted state
            # is never overwritten by the next column's fit.
            label_encoder = features[col].fit_sk_model(LabelEncoder())

            # Encode the train and test column with the encoder fitted on
            # the training column, then swap the encoded column in.
            transformed_feature = label_encoder.transform_col(features[col], col)
            features = features.drop(col)
            features = features.add_columns(transformed_feature, col)

            transformed_feature_test = label_encoder.transform_col(test_features[col], col)
            test_features = test_features.drop(col)
            test_features = test_features.add_columns(transformed_feature_test, col)

        # Resolved below from the final column order
        cat_indices = []
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape().get())
    print('Testing Data Shape: ', test_features.shape().get())
    
    # Extract feature names
    feature_names = list(features.get().columns)

    if encoding == 'le':
        # Record each categorical column's position in the final feature
        # matrix. Dropping and re-adding a column can change its position,
        # so the index is looked up by name after all encoding is done.
        cat_indices = [feature_names.index(col) for col in categorical_columns]

    # Create the model
    lgb_model = lgb.LGBMClassifier(n_estimators=10, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
    # Deliberately not named `model`: a local of that name would shadow this
    # function inside its own body.
    fitted_model = features.fit_sk_model_with_labels(lgb_model, labels, custom_args={'eval_metric':'auc',
                                                                                    'categorical_feature':cat_indices,
                                                                                    'verbose':200})    
        
    # Record the best iteration
    best_iteration = fitted_model.get().best_iteration_
    
    # Make predictions; index 1 selects the positive-class output of predict_proba
    # NOTE(review): presumably the second probability column — confirm against the wrapper API
    test_predictions = fitted_model.predict_proba(test_features, custom_args={'num_iteration':best_iteration})[1] 
    
    test_predictions.setname('TARGET')
    # Make the submission dataframe
    submission = test_ids.concat(test_predictions)
    
    feature_importances = fitted_model.feature_importances(feature_names)

    return submission, feature_importances

In [10]:
# Run the LightGBM pipeline with the default one-hot encoding;
# `s` receives the submission and `f` the feature importances.
s,f = model(app_train, app_test)

('Training Data Shape: ', (307511, 241))
('Testing Data Shape: ', (48744, 241))


In [13]:
# Fetch and display the feature importances returned by model().
f.get()

Unnamed: 0,feature,importance
0,CNT_CHILDREN,0
1,AMT_INCOME_TOTAL,0
2,AMT_CREDIT,2
3,AMT_ANNUITY,1
4,AMT_GOODS_PRICE,3
5,REGION_POPULATION_RELATIVE,0
6,DAYS_BIRTH,37
7,DAYS_EMPLOYED,8
8,DAYS_REGISTRATION,0
9,DAYS_ID_PUBLISH,0


In [None]:
# NOTE(review): unexecuted scratch cell. `model` is the function defined in an
# earlier cell (this assignment would rebind that name), and `features` is not
# defined at notebook scope in the cells shown — confirm the intent or remove.
model = model.fit(app_train.get(),features['TARGET'].get(), )    