In [None]:
import os
from comet_ml import Experiment

# Create the Comet experiment that later cells log their figures to.
# The API key is read from the COMET_API_KEY environment variable, so no
# secret is stored in the notebook; os.environ.get yields None when the
# variable is unset and that None is passed straight to Experiment.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"), project_name="home-credit")
# Label this run as the EDA run.
experiment.set_name('home-credit-eda')

In [None]:
import pandas as pd

# Path of the training table, relative to the notebook's working directory.
filename = './application_train.csv'

# Load the comma-separated file into `df`, the frame later cells derive from.
df = pd.read_csv(filename, sep=',')
# Show the first rows as the cell output.
df.head()

In [None]:
# (rows, columns) of the raw training table.
df.shape

In [None]:
# Integer-typed columns of df, minus the row identifier and the label.
integer_df = df.select_dtypes(include=['int']).drop(columns=['SK_ID_CURR', 'TARGET'])

# Integer columns whose only observed values are 0 and 1 are really binary
# (categorical) flags, so their names are collected for separate handling.
binary_cols = [
    col for col in integer_df
    if sorted(integer_df[col].unique().tolist()) == [0, 1]
]
integer_df.shape

In [None]:
# float valued features: every column of df that is neither integer nor
# object dtype.
# NOTE(review): any other dtype present in df (e.g. bool, datetime) would
# also be selected here -- confirm the CSV only yields int/float/object.
float_df = df.select_dtypes(exclude=['int', 'object'])
float_df.shape

In [None]:
# categorical features: every column of df that is neither integer nor
# float dtype (presumably the object/string columns -- TODO confirm that no
# other dtypes occur in the CSV).
categorical_df = df.select_dtypes(exclude=['int', 'float'])
categorical_df.shape

In [None]:
# Plot target distribution
import matplotlib.pyplot as plt

feature = "TARGET"

# Bar chart of how many rows carry each TARGET value.
ax = df[feature].value_counts().plot.bar(figsize=(15, 10), color='blue')
ax.set_xlabel(feature, fontsize=14)
ax.set_ylabel("Count", fontsize=14)
# Log the plot to Comet under the feature's name.
experiment.log_figure(figure_name=feature, figure=plt)

In [None]:
# Plot categorical features
import matplotlib.pyplot as plt

# Names of every column of df that is neither integer nor float typed.
categorical_features = df.select_dtypes(exclude=['int', 'float']).columns.tolist()

# One bar chart of value frequencies per categorical column; each chart is
# logged to Comet under the column's name and then displayed.
for feature in categorical_features:
    ax = df[feature].value_counts().plot.bar(figsize=(15, 10), color='green')
    ax.set_xlabel(feature, fontsize=14)
    ax.set_ylabel("Count", fontsize=14)
    experiment.log_figure(figure_name=feature, figure=plt)
    plt.show()

In [None]:
# pca transformation function
from sklearn.decomposition import PCA
from sklearn import preprocessing

def do_pca(df, n_components=10, random_state=None):
    """Min-max scale ``df`` and project it onto its leading principal components.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Numeric feature matrix. The notebook forward/back-fills missing
        values before calling this function.
    n_components : int, default 10
        Number of principal components to keep (the previously hard-coded
        value is the default).
    random_state : int or None, default None
        Forwarded to ``PCA`` so the decomposition can be made reproducible.
        ``None`` keeps the library's default behaviour.

    Returns
    -------
    tuple
        ``(pca, components_df)`` where ``pca`` is the fitted ``PCA`` object
        and ``components_df`` is a DataFrame with one column per component,
        named ``pca-1``, ``pca-2``, ...
    """
    scaled = preprocessing.MinMaxScaler().fit_transform(df)

    pca = PCA(n_components=n_components, random_state=random_state)
    components = pca.fit_transform(scaled)

    columns = ['pca-{}'.format(i + 1) for i in range(components.shape[1])]

    # Carry the caller's row index over (when the input has one) so the result
    # stays row-aligned when it is later concatenated column-wise with other
    # frames derived from the same source.
    row_index = getattr(df, 'index', None)
    return (pca, pd.DataFrame(components, columns=columns, index=row_index))
    

In [None]:
# plot correlation matrix for float type features
import matplotlib.pyplot as plt

float_columns = float_df.columns.tolist()
float_corr = float_df.corr()

# Draw the pairwise correlation matrix as an image with a colour scale,
# then log it to Comet.
fig, ax = plt.subplots(figsize=(15, 10))
cax = ax.matshow(float_corr, interpolation='nearest')
fig.colorbar(cax)
experiment.log_figure(figure_name='Float Features, Correlation Plot', figure=plt)

In [None]:
# Select float features for pca transformation.
# Positional columns 11..52 of float_df form the block handed to PCA.
# NOTE(review): presumably a group of related columns -- confirm the
# positions against the CSV schema.
pca_candidates = float_df.iloc[:, 11:53]
# Fill gaps forward, then backward so NaNs at the top of a column are covered
# too. DataFrame.ffill()/bfill() replace fillna(method=...), which is
# deprecated in recent pandas releases.
pca_candidates = pca_candidates.ffill().bfill()
pca, X_pca = do_pca(pca_candidates)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the explained-variance ratio of each retained component,
# logged to Comet and then displayed.
plt.figure(figsize=(15, 10))
plt.bar(X_pca.columns, pca.explained_variance_ratio_)
plt.ylabel('Explained Variance')
# Axis label spelling corrected (was "Principle").
plt.xlabel('Principal Components')
experiment.log_figure(figure_name='Top 10 PCA Components', figure=plt)
plt.show()

In [None]:
# One-hot encode the categorical columns and append the 0/1 integer flags.
# integer_df must still contain binary_cols at this point, i.e. this cell
# has to run before the cell that drops them from integer_df.
X_encoded = pd.concat(
    [pd.get_dummies(categorical_df), integer_df[binary_cols]],
    axis=1,
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
    
def rf_feature_plot(df, title, target=None, top_n=15, random_state=None):
    """Fit a random forest on ``df`` and plot its most important features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; column names are used as bar labels.
    title : str
        Name under which the figure is logged to Comet.
    target : array-like or None, default None
        Labels to fit against. When ``None`` the function falls back to the
        notebook-level variable ``y`` (the original behaviour), which must
        then exist or a ``NameError`` is raised.
    top_n : int, default 15
        Number of features shown in the chart.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier`` so importances can be made
        reproducible. ``None`` keeps the unseeded behaviour.
    """
    # Prefer an explicit label vector; only fall back to the global `y` for
    # callers that still use the two-argument form.
    labels = y if target is None else target

    rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=31,
                                random_state=random_state)
    rf.fit(df, labels)

    features = list(df.columns)
    importances = rf.feature_importances_

    # Take the top_n most important features, then reverse to least -> most
    # important so the largest bar is drawn at the top of the horizontal chart.
    order = np.argsort(importances)[::-1][:top_n][::-1]
    positions = range(len(order))

    plt.figure(figsize=(20, 20))
    plt.barh(positions, importances[order], color='b', align='center')
    plt.yticks(positions, [features[i] for i in order], fontsize=14)
    plt.xlabel('Relative Importance')
    experiment.log_figure(figure_name=title, figure=plt)
    plt.show()

In [None]:
# Drop pca candidate features from float_df
# Drop categorical features from integer_df
float_columns = list(float_df.columns)

# Drop by name instead of by position and tolerate columns that are already
# gone. pca_candidates was sliced from float_df.iloc[:, 11:53], so its columns
# are the same ones the positional slice selected. Re-running this cell is now
# a no-op rather than dropping further float columns or raising KeyError on
# integer_df.
float_df = float_df.drop(columns=pca_candidates.columns, errors='ignore')
integer_df = integer_df.drop(columns=binary_cols, errors='ignore')

# Forward-fill gaps; float_df is also back-filled so NaNs at the top of a
# column are covered. ffill()/bfill() replace the deprecated
# fillna(method=...) form.
integer_df = integer_df.ffill()
float_df = float_df.ffill().bfill()

In [None]:
# rf_feature_plot fits against the notebook-level name `y`, which no earlier
# visible cell assigns. Bind it to the TARGET column (the same labels the
# LightGBM calls pass explicitly) so the notebook runs on a fresh kernel.
y = df['TARGET']

rf_feature_plot(X_encoded, 'RF Categorical Feature Importance')
rf_feature_plot(integer_df, 'RF Integer Feature Importance')
rf_feature_plot(float_df, 'RF Float Feature Importance')

In [None]:
# Preview float_df as it stands after the preceding cells.
float_df.head()

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt

def lightgbm_feature_plot(df, y, title, num_boost_round=100, max_num_features=15):
    """Train a LightGBM binary classifier and plot its feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; its column names are passed as feature names.
    y : array-like
        Binary labels, one per row of ``df``.
    title : str
        Name under which the figure is logged to Comet.
    num_boost_round : int, default 100
        Number of boosting iterations. The original code put
        ``'num_trees': 100`` in the params dict and also passed a round count
        of 10. LightGBM lets the params alias win (with a warning), so 100 was
        the effective value; it is now stated once.
    max_num_features : int, default 15
        Number of features shown in the importance plot.
    """
    train_data = lgb.Dataset(data=df, label=y, feature_name=list(df.columns))
    params = {'num_leaves': 31, 'objective': 'binary', 'metric': 'auc'}

    booster = lgb.train(params, train_data, num_boost_round=num_boost_round)
    lgb.plot_importance(booster, max_num_features=max_num_features,
                        figsize=(15, 10), height=0.5)
    experiment.log_figure(figure_name=title, figure=plt)

In [None]:
# Plot LightGBM importances for each feature family against TARGET, in the
# order float -> integer -> categorical.
for _frame, _plot_title in (
    (float_df, 'LightGBM Float Feature Importance'),
    (integer_df, 'LightGBM Integer Feature Importance'),
    (X_encoded, 'LightGBM Categorical Feature Importance'),
):
    lightgbm_feature_plot(_frame, df['TARGET'], _plot_title)

In [None]:
from sklearn import preprocessing

# Standardise the integer and float feature frames.
# The scaled arrays are wrapped with the source frame's index and column
# names so the results stay labelled and row-aligned with their inputs.
scaler = preprocessing.StandardScaler()
normalized_integer_df = pd.DataFrame(
    scaler.fit_transform(integer_df),
    index=integer_df.index,
    columns=integer_df.columns,
)

# Back-fill any remaining gaps before scaling; bfill() replaces the
# deprecated fillna(method='bfill') form.
float_df = float_df.bfill()
# The same scaler object is re-fitted here, so afterwards `scaler` holds the
# statistics of float_df only.
normalized_float_df = pd.DataFrame(
    scaler.fit_transform(float_df),
    index=float_df.index,
    columns=float_df.columns,
)

In [None]:
# Give the scaled integer frame the column names of its source frame.
normalized_integer_df.columns = integer_df.columns

In [None]:
# Give the scaled float frame the column names of its source frame.
normalized_float_df.columns = float_df.columns

In [None]:
# Assemble the final matrix column-wise: row id, scaled float features,
# scaled integer features, encoded categoricals and the PCA components.
X_final = pd.concat(
    objs=[
        df['SK_ID_CURR'],
        normalized_float_df,
        normalized_integer_df,
        X_encoded,
        X_pca,
    ],
    axis=1,
)
X_final.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; random_state fixes the split so it
# is repeatable. No `stratify` argument is passed.
X_train, X_valid, y_train, y_valid = train_test_split(X_final, df['TARGET'], test_size=0.1, random_state=42)

In [None]:
# Re-attach the label column to each split's feature frame.
train, valid = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [None]:
# (rows, columns) of the training split, label column included.
train.shape

In [None]:
# Write both splits as comma-separated files into the working directory.
for _split, _path in (
    (train, 'home-credit-train.csv'),
    (valid, 'home-credit-valid.csv'),
):
    _split.to_csv(_path, sep=',')