In [1]:
import tensorflow as tf
import feather
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

  from ._conv import register_converters as _register_converters


In [95]:
def tensorflow_logistic_regression(train_data, target, test_df = None, training_epochs = 10, learning_rate = 0.005):
    """
    Performs the tensorflow operations for training a basic logistic regression model on the specified
    dataframe, and returns the model's predictions for the full data set.  It is intended to be used with
    the supplemental files for Kaggle's Credit Risk competition, in order to generate some new features/
    implement model stacking.
    """
    ncol = train_data.shape[1]
    
    X = tf.placeholder(tf.float32, [None, ncol], name = "X")
    Y = tf.placeholder(tf.float32, [None, 1], name = "Y")
    weights = tf.Variable(tf.zeros([ncol,1]))
    bias = tf.Variable(tf.zeros([1]))
    
    pred = tf.sigmoid(tf.add(tf.matmul(X, weights), bias))
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = pred, labels = Y))
    auc = tf.metrics.auc(labels = Y, predictions = pred, name = "auc")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    init = tf.global_variables_initializer()
    init_loc = tf.local_variables_initializer()
    auc_history = []
    
    with tf.Session() as sess:
        sess.run(init)
        sess.run(init_loc)
        
        for epoch in range(training_epochs):
            train_train, train_val, target_train, target_val = train_test_split(train_data,target)
            
            c,_ = sess.run([cost, optimizer], feed_dict = {X:train_train, Y: target_train})
            predictions = sess.run(pred, feed_dict = {X:train_val, Y: target_val})
            auc_history.append(sess.run(auc, feed_dict = {X:train_val, Y: target_val})[1])
            
        full_train_predictions = sess.run(pred, feed_dict = {X:train_data, Y:target})
    
    return full_train_predictions, auc_history

In [93]:
def log_regress_other_files(train_IDs, target, file_df):
    file_df = file_df.loc[file_df["SK_ID_CURR"].isin(train_IDs),:]
    target = target[target["SK_ID_CURR"].isin(file_df["SK_ID_CURR"])]["TARGET"]
    
    file_data = file_df.drop("SK_ID_CURR", axis = 1).values.astype("float32")
    target = target.values.reshape([len(target),1]).astype("float32")
    
    sc = StandardScaler()
    file_data = sc.fit_transform(file_data)
    
    predictions, auc_scores = tensorflow_logistic_regression(file_data, target)
    
    return predictions, auc_scores

In [6]:
train = pd.read_feather("./../Solution attempts/v09 train data.feather")
target = pd.read_feather("./../Solution attempts/v09 target.feather")["TARGET"]

In [None]:
cols_to_fill = ["TOTAL_CURRENT_CREDIT_AMT", "TOTAL_CURRENT_CREDIT_DEBT", "CREDIT_COUNT", "NUM_CREDIT_ACTIVE",
                "NUMBER_APPROVED", "NUMBER_CANCELED","NUMBER_REFUSED", "NUMBER_UNUSED", "NUMBER_APPLICATIONS",
                "ANY_OVERDUE", "NUM_LATE_CC_PAYMENTS","MAX_CREDIT_LIMIT","NUM_PREV_CC_LOANS","NUM_PAYMENTS_UNDER",
                "NUM_PAYMENTS_LATE","NUM_LATE_POS_PAYMENTS", "NUM_CREDIT_CLOSED", "MAX_DPD", "MAX_DRAWINGS_IN_MONTH",
                "DAYS_EMPLOYED", "OBS_30_CNT_SOCIAL_CIRCLE", "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE",
                "DEF_60_CNT_SOCIAL_CIRCLE"]

train[cols_to_fill] = train[cols_to_fill].fillna(0)

In [None]:
full_cols = train.apply(lambda x: sum(x.isnull()), axis = 0)
full_cols = full_cols[full_cols < 3000].index.tolist()
train = train[full_cols]
train.dropna(inplace = True)

In [None]:
kept_indices = train.index

In [None]:
train = train.values.astype('float32')
target = target[kept_indices]
target = target.values.reshape([len(target),1]).astype('float32')

In [None]:
auc_history

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(target_val, predictions)

In [None]:
roc_auc_score(target, full_train_predictions)

In [79]:
bureau_sub = pd.read_feather("bureau_sub.feather")
target = pd.read_feather("./../Solution attempts/v09 target.feather")

In [96]:
predictions, auc_scores = log_regress_other_files(train["SK_ID_CURR"], target, bureau_sub)

In [98]:
auc_scores

[0.5875257,
 0.59931827,
 0.6029048,
 0.60504097,
 0.6051328,
 0.60614395,
 0.6065419,
 0.6076418,
 0.6081058,
 0.60809624]