In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils import (
    preprocess_app_activity_data,
    preprocess_comm_data,
    read_app_activity,
    read_communications,
    read_transactions,
)

In [None]:
# Render floats with three decimals in every pandas display that follows.
# To restore the pandas default: pd.reset_option('display.float_format')
pd.options.display.float_format = '{:.3f}'.format

# Clients


In [None]:

# Dtypes applied to the raw (upper-case) CSV columns while parsing.
client_dtypes = dict(CLIENT_ID='uint64', TARGET='boolean', IS_TRAIN='boolean')

clients = (
    pd.read_csv('data/samples/CLIENTS_SAMPLE.csv', sep=',', dtype=client_dtypes)
    # Lower-case every column name.
    .rename(columns=str.lower)
    # Re-encode boolean columns (numpy `bool` or nullable `boolean`) as int8 0/1.
    .pipe(lambda frame: frame.astype({
        name: 'int8'
        for name, kind in zip(frame.columns, frame.dtypes)
        if kind in ('bool', 'boolean')
    }))
)

In [None]:
# Preview the first rows of the clients table.
clients.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the clients table.
clients.info()

In [None]:
# clients.describe()
# target / is_train were cast to 0/1 when loading, so each column mean is the
# fraction of rows equal to 1; shown as a one-column frame named 'Fraction'.
clients[['target', 'is_train']].mean().to_frame('Fraction')

# Transactions

In [None]:
# Load the transactions sample with the project helper `read_transactions`
# (imported together with the other dependencies in the first cell).
transactions = read_transactions('data/samples/TRANSACTIONS_SAMPLE.csv')


In [None]:
# Preview the first rows of the transactions table.
transactions.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the transactions table.
transactions.info()

In [None]:
# Summary statistics of the transactions columns.
transactions.describe()

In [None]:
# Average number of transaction rows per client:
# count the rows in each client_id group, then take the mean of those counts.
transactions.groupby('client_id').size().mean()

In [None]:
# Categorical data plots: one bar chart per column showing its 20 most
# frequent values, ordered by category value.

cat_cols = ['cat_c2', 'cat_c3', 'cat_c4']
ncols = 3
# Ceiling division: the number of rows follows len(cat_cols) instead of being hard-coded.
nrows = -(-len(cat_cols) // ncols)

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(8 * ncols, 5 * nrows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, cat_cols):
    # value_counts() sorts by frequency, so head(20) keeps the 20 most common values.
    counts = transactions[col].astype(int).value_counts().head(20).sort_index()
    counts.plot(kind='bar', ax=ax)
    ax.set_title(col)
    ax.set_xlabel('Category')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=0)

# Remove grid cells that have no column to show.
for ax in axes[len(cat_cols):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Categorical data counts

for col in ('cat_c2', 'cat_c3', 'cat_c4'):
    # Full frequency table of the column, ordered by category value.
    frequencies = transactions[col].astype(int).value_counts().sort_index()
    # A blank line separates consecutive tables.
    print(frequencies, end='\n\n')

In [None]:
# Histograms of each float column after dropping exact zeros and keeping only
# values between the column's 5th and 95th percentiles (both tails trimmed).
float_cols = ['float_c16', 'float_c17', 'float_c18', 'float_c20', 'float_c21']
ncols = 3
# Ceiling division: the number of rows follows len(float_cols) instead of being hard-coded.
nrows = -(-len(float_cols) // ncols)

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(8 * ncols, 5 * nrows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, float_cols):
    data = transactions[col][transactions[col] != 0]
    data_filtered = data[data.between(data.quantile(.05), data.quantile(.95))]
    # Series.hist returns the Axes it drew on (not bin counts), so nothing is kept.
    data_filtered.hist(ax=ax, bins=200)
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=0)

# Remove grid cells that have no column to show.
for ax in axes[len(float_cols):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()


In [None]:
# Pairwise relationships between the transactions float columns.
sns.pairplot(transactions[['float_c16', 'float_c17', 'float_c18', 'float_c20', 'float_c21']])

In [None]:
# Correlation matrix of the transactions float columns (DataFrame.corr with its default method).
transactions[['float_c16', 'float_c17', 'float_c18', 'float_c20', 'float_c21']].corr()

## Transactions (data insights)

In [None]:
# Claim under test: fl_c6 = 1 (True) only if float_c17 <= 0.
# The largest float_c17 among the fl_c6 == 1 rows must then be <= 0.

transactions.loc[transactions['fl_c6'] == 1, 'float_c17'].max()
# transactions[transactions['float_c17'] < 0]['fl_c6'].min()

In [None]:
# Noted relationship: int_c19 = 1 only if float_c16 <= 0 and int_c19 = -1 only if float_c16 >= 0
#
# NOTE(review): the previous check (max of float_c16 where int_c19 == -1) can only
# bound float_c16 from above, so it could not confirm the "-1 only if >= 0" half
# and never looked at the int_c19 == 1 rows at all. The table below shows the min
# and max of float_c16 for every int_c19 value, which covers both halves; confirm
# the signs in the note above against this output.

transactions.groupby('int_c19')['float_c16'].agg(['min', 'max'])

# App activity

In [None]:
# Load the app-activity sample, then run the project preprocessing step on it.
# Both helpers come from `utils` and are imported in the first cell.
app_activity = read_app_activity('data/samples/APP_ACTIVITY_SAMPLE.csv')
app_activity = preprocess_app_activity_data(app_activity)

In [None]:
# Preview the first rows of the app-activity table.
app_activity.head()

In [None]:
# Column dtypes and memory usage; show_counts=True asks for the non-null counts to be printed too.
app_activity.info(show_counts=True)

In [None]:
# Summary statistics of the app-activity columns.
app_activity.describe()

In [None]:
# Average number of app-activity rows per client:
# count the rows in each client_id group, then take the mean of those counts.
app_activity.groupby('client_id').size().mean()

In [None]:
cat_cols = ['cat_c3', 'cat_c4', 'cat_c5', 'cat_c6', 'cat_c9']
nrows = 2
ncols = 3

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(8 * ncols, 5 * nrows))
axes = axes.flatten()

# One bar chart per column: its 20 most frequent values, ordered by category value.
for panel, col in zip(axes, cat_cols):
    top_counts = app_activity[col].value_counts().head(20).sort_index()
    top_counts.plot(kind='bar', ax=panel)
    panel.set_title(col)
    panel.set_xlabel('Category')
    panel.set_ylabel('Count')
    panel.tick_params(axis='x', rotation=0)

# Drop the grid cells that were left without a column.
for spare in axes[len(cat_cols):]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()


In [None]:
# Categorical data counts

for col in cat_cols:
    # Full frequency table of the column, ordered by category value,
    # followed by a blank separator line.
    print(app_activity[col].value_counts().sort_index(), end='\n\n')

In [None]:
# Histograms of the selected app-activity float columns.
# Wider candidate list, currently disabled:
# float_cols = ['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']
float_cols = ['float_c11', 'float_c12', 'float_c14']
ncols = 3
# Ceiling division: rows follow len(float_cols). The former hard-coded nrows = 3
# built a 3x3 grid for three plots and left two thirds of the figure empty.
nrows = -(-len(float_cols) // ncols)

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axes = plt.subplots(
    nrows=nrows, ncols=ncols, figsize=(8 * ncols, 5 * nrows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, float_cols):
    # Series.hist returns the Axes it drew on (not bin counts), so nothing is kept.
    app_activity[col].hist(ax=ax, bins=200)
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=0)

# Remove grid cells that have no column to show.
for ax in axes[len(float_cols):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()


In [None]:
# Pairwise relationships between the selected app-activity float columns.
# Wider column list, currently disabled:
# sns.pairplot(app_activity[['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']])
sns.pairplot(app_activity[['float_c11', 'float_c12', 'float_c14']])

In [None]:
# Correlation matrix of the selected app-activity float columns (DataFrame.corr with its default method).
# Wider column list, currently disabled:
# app_activity[['float_c11', 'float_c12', 'float_c13', 'float_c14', 'float_c15', 'float_c16', 'float_c17']].corr()
app_activity[['float_c11', 'float_c12', 'float_c14']].corr()

# Communications

In [None]:
# Load the communications sample, then run the project preprocessing step on it.
# Both helpers come from `utils` and are imported in the first cell.
communications = read_communications('data/samples/COMMUNICATIONS_SAMPLE.csv')
communications = preprocess_comm_data(communications)

In [None]:
# Preview the first rows of the communications table.
communications.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the communications table.
communications.info()

In [None]:
# Summary statistics of the communications columns.
communications.describe()

In [None]:
# Average number of communication rows per client:
# count the rows in each client_id group, then take the mean of those counts.
communications.groupby('client_id').size().mean()

In [None]:
cat_cols = ['cat_c2', 'cat_c3', 'cat_c4', 'cat_c5']
nrows = 2
ncols = 3

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(8 * ncols, 5 * nrows))
axes = axes.flatten()

# One bar chart per column: its 20 most frequent values, ordered by category value.
for panel, col in zip(axes, cat_cols):
    top_counts = communications[col].value_counts().head(20).sort_index()
    top_counts.plot(kind='bar', ax=panel)
    panel.set_title(col)
    panel.set_xlabel('Category')
    panel.set_ylabel('Count')
    # panel.tick_params(axis='x', rotation=0)

# Drop the grid cells that were left without a column.
for spare in axes[len(cat_cols):]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()


In [None]:
# Categorical data counts

# Per-column loop, currently disabled; only cat_c5 is tabulated below.
# for col in cat_cols:
#     print(communications[col].value_counts().sort_index())
#     print()

# Frequency table of cat_c5, ordered by category value.
communications['cat_c5'].value_counts().sort_index()