In [1]:
# External imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import linear_model, tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, make_scorer



In [2]:
# Project-local helper modules
import importlib
import extraction as ex
import features as feat

# Reload both modules (extraction first, then features) so that edits made to
# their source while the kernel is alive are picked up without a restart.
ex, feat = (importlib.reload(module) for module in (ex, feat))

In [3]:
# Read data

# Small tables: loaded in full through the project readers
train = ex.read_train()
test = ex.read_test()
members = ex.read_members()

# Every user id found in the index of either the train or the test table
useful_msno = set(train.index.unique()) | set(test.index.unique())


READING TRAIN


Memory usage (MB) : 8.33381652832
Index       7.407837
is_churn    0.925980
dtype: float64


READING TEST


Memory usage (MB) : 6.92345428467
Index    6.923454
dtype: float64


READING MEMBERS


Memory usage (MB) : 135.57332325
Index                     51.646980
city                       6.455873
bd                        12.911745
gender                     6.455873
registered_via             6.455873
registration_init_time    51.646980
dtype: float64



In [4]:
# Big tables: both readers receive the set of user ids collected above,
# a cap on the number of lines and a chunk size.
CHUNK_ROWS = 10**6

transactions = ex.read_transactions(
    useful_msno=useful_msno,
    max_lines=np.inf,
    chunksize=CHUNK_ROWS,
)
user_logs = ex.read_user_logs(
    useful_msno=useful_msno,
    just_date=True,
    max_lines=10**8,
    chunksize=CHUNK_ROWS,
)


READING TRANSACTIONS

Chunk 1 of transactions read
Chunk 2 of transactions read
Chunk 3 of transactions read
Chunk 4 of transactions read
Chunk 5 of transactions read
Chunk 6 of transactions read
Chunk 7 of transactions read
Chunk 8 of transactions read
Chunk 9 of transactions read
Chunk 10 of transactions read
Chunk 11 of transactions read
Chunk 12 of transactions read
Chunk 13 of transactions read
Chunk 14 of transactions read
Chunk 15 of transactions read
Chunk 16 of transactions read
Chunk 17 of transactions read
Chunk 18 of transactions read
Chunk 19 of transactions read
Chunk 20 of transactions read
Chunk 21 of transactions read
Chunk 22 of transactions read
Chunk 23 of transactions read
Chunk 24 of transactions read

Memory usage (MB) : 669.92029953
Index                     130.716156
msno                      130.716156
payment_method_id          16.339520
payment_plan_days          32.679039
plan_list_price            32.679039
actual_amount_paid         32.679039
is_auto_re

In [5]:
# For the train set, pretend we don't know what happens in March:
# only rows dated strictly before 2017-03-01 are kept.
train_cutoff = pd.Timestamp(2017, 3, 1)
transactions_train = transactions.loc[transactions["transaction_date"] < train_cutoff]
user_logs_train = user_logs.loc[user_logs["date"] < train_cutoff]

In [6]:
# EXPLOIT MEMBERS

def exploit_members(members):
    """Build the per-member feature table.

    Parameters
    ----------
    members : pandas.DataFrame
        Members table; it must provide a ``registration_init_time`` column.

    Returns
    -------
    pandas.DataFrame
        Frame sharing the index of ``members`` with a single
        ``registration_init_time`` column that holds the output of
        ``feat.count_days`` applied to the registration dates.
    """
    return pd.DataFrame(index=members.index).assign(
        registration_init_time=feat.count_days(members["registration_init_time"])
    )

In [7]:
# EXPLOIT TRANSACTIONS

def exploit_transactions(transactions):
    """Aggregate the transaction history into one feature row per user.

    Parameters
    ----------
    transactions : pandas.DataFrame
        One row per transaction, with the columns ``msno``,
        ``transaction_date``, ``membership_expire_date``,
        ``payment_plan_days``, ``is_auto_renew``, ``is_cancel`` and
        ``actual_amount_paid``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``msno``, with the columns ``Latest_transaction``,
        ``Planned_membership_expiration``, ``Mean_transaction_duration``,
        ``Auto_renew_freq``, ``Cancel_freq`` and ``Total_price``.
    """
    # Grouping transactions
    grouped_trans = transactions.groupby("msno")

    # Latest transaction date and latest planned expiration date, both passed
    # through feat.count_days
    latest_trans = feat.count_days(grouped_trans["transaction_date"].max())
    expiration = feat.count_days(grouped_trans["membership_expire_date"].max())
    # Mean plan duration over the user's transactions
    trans_dur = grouped_trans["payment_plan_days"].mean()
    # Share of transactions flagged auto-renew / cancel
    auto_renew_freq = grouped_trans["is_auto_renew"].mean()
    cancel_freq = grouped_trans["is_cancel"].mean()
    # Total amount paid
    total_price = grouped_trans["actual_amount_paid"].sum()
    # Payment method (most frequent one) is currently disabled:
    # payment_method = grouped_trans["payment_method_id"].agg(lambda x:x.value_counts().index[0])

    # Every per-column aggregate is indexed by the group keys, so take the
    # index from one of them instead of running max() over all the columns
    # of the table just to read the keys.
    trans_msno = trans_dur.index

    transactions_data = pd.DataFrame(index=trans_msno)
    transactions_data["Latest_transaction"] = latest_trans
    transactions_data["Planned_membership_expiration"] = expiration
    transactions_data["Mean_transaction_duration"] = trans_dur
    transactions_data["Auto_renew_freq"] = auto_renew_freq
    transactions_data["Cancel_freq"] = cancel_freq
    transactions_data["Total_price"] = total_price
    # transactions_data["Payment_method"] = payment_method

    return transactions_data

In [8]:
# EXPLOIT USER LOGS

def exploit_user_logs(user_logs, windows=(10, 30, 60, 90)):
    """Aggregate the listening logs into one feature row per user.

    Parameters
    ----------
    user_logs : pandas.DataFrame
        One row per log entry, with at least the columns ``msno`` and
        ``date``.
    windows : iterable of int, optional
        For each value ``n`` a 0/1 column ``Log_last<n>`` is produced, set to
        1 when the user's ``Latest_log`` value is strictly greater than
        ``-n``. Defaults to ``(10, 30, 60, 90)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``msno``, with the columns ``Latest_log`` (output of
        ``feat.count_days`` on the user's most recent log date),
        ``Logs_count`` (number of non-null log dates) and one
        ``Log_last<n>`` column per entry of ``windows``.
    """
    # Only the "date" column is needed, so aggregate that column alone rather
    # than every column of the table.
    dates_by_user = user_logs.groupby("msno")["date"]

    # Latest log; presumably a day offset that is negative for past dates,
    # which is what the `> -n` tests below rely on -- confirm in
    # features.count_days.
    latest_logs = feat.count_days(dates_by_user.max())
    # Number of logs
    logs_count = dates_by_user.count()

    logs_data = pd.DataFrame(index=logs_count.index)
    logs_data["Latest_log"] = latest_logs
    logs_data["Logs_count"] = logs_count
    for n_days in windows:
        logs_data["Log_last{}".format(n_days)] = (latest_logs > -n_days).astype(int)

    return logs_data

In [9]:
# Build the feature tables. Transactions and logs are each processed twice:
# once on the date-restricted `*_train` table and once on the full table.
members_data = exploit_members(members)
transactions_train_data, transactions_data = (
    exploit_transactions(table) for table in (transactions_train, transactions)
)
logs_train_data, logs_data = (
    exploit_user_logs(table) for table in (user_logs_train, user_logs)
)

In [10]:
# Feature tables to join onto the train set and onto the test set, in the
# order members, transactions, user logs.
data_list_train, data_list = (
    [members_data, transactions_train_data, logs_train_data],
    [members_data, transactions_data, logs_data],
)

In [11]:
# Attach the feature tables to the train and test frames. The inner join only
# keeps the users that are present in every joined table.
train_full = train.join(other=data_list_train, how="inner")
test_full = test.join(other=data_list, how="inner")

In [12]:
# Preview the first five rows of the joined training table
train_full.head(n=5)

Unnamed: 0_level_0,is_churn,registration_init_time,Latest_transaction,Planned_membership_expiration,Mean_transaction_duration,Auto_renew_freq,Cancel_freq,Total_price,Latest_log,Logs_count,Log_last10,Log_last30,Log_last60,Log_last90
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,-1164,-1,29,10.0,0.833333,0.166667,894,-613,12,0,0,0,0
f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,-1164,-19,11,25.4,0.0,0.0,627,-11,26,0,1,1,1
zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,-1160,-30,6,30.0,0.0,0.0,2682,-21,88,0,1,1,1
K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,-1131,-13,170,27.096774,1.0,0.096774,3649,-95,65,0,0,0,0
ibIHVYBqxGwrSExE63/omeDD99M5vYB3CN2HzkEY+eM=,1,-1130,-14,13,28.55,1.0,0.05,2980,-23,39,0,1,1,1


In [20]:
# Columns handed to the model
features = [
    "Latest_transaction",
    "Planned_membership_expiration",
    "Auto_renew_freq",
    "Total_price",
]
train_filtered, test_filtered = feat.select_features(train_full, test_full, features)

# Column normalization is currently switched off:
# train_filtered, test_filtered = feat.normalize_features(train_filtered, test_filtered)

In [21]:
# Here comes the machine learning

# Conversion into arrays for scikit-learn
x = np.array(train_filtered.drop("is_churn", axis=1))
y = np.array(train_filtered["is_churn"])
xt = np.array(test_filtered)

# Model to train; the alternatives tried so far are kept for reference
# clf = linear_model.LogisticRegression()
# clf = linear_model.Ridge(alpha=0.)
# clf = tree.DecisionTreeClassifier()
clf = xgb.XGBClassifier()

clf.fit(x, y)

# `proba` records whether the model can output class probabilities; it is
# reused when building the cross-validation scorer. An explicit capability
# check is used instead of catching AttributeError around the call, so that an
# AttributeError raised *inside* a working predict_proba is not mistaken for
# "no probabilities available" and silently replaced by hard predictions.
proba = hasattr(clf, "predict_proba")
if proba:
    # Probability of belonging to class 1 (and not 0)
    yt = clf.predict_proba(xt)[:, 1]
else:
    # Otherwise predict the value directly and restrict it to [0, 1]
    yt = np.clip(clf.predict(xt), 0., 1.)

In [22]:
# Perform cross-validation
def _binary_log_loss(y_true, y_pred):
    """Mean log-loss over the classes 0 and 1, with predictions clipped at 1e-15."""
    return log_loss(
        y_true,
        y_pred,
        labels=[0, 1],
        eps=np.power(10., -15),
        normalize=True,
    )


# greater_is_better=True leaves the sign of the loss untouched, so the value
# printed below is the plain (positive) log-loss -- lower is better.
# needs_proba mirrors how the test predictions were produced.
log_loss_scorer = make_scorer(
    _binary_log_loss,
    greater_is_better=True,
    needs_proba=proba,
)
scores = cross_val_score(clf, x, y, cv=5, scoring=log_loss_scorer)
print("CV score (log-loss) : {}".format(scores.mean()))

CV score (log-loss) : 0.19114117498494826


In [None]:
# Baseline for the users we have no features for: predict the churn rate
# observed on the training set. A constant is used rather than random values
# so that the submission is reproducible and no prediction sits arbitrarily
# close to 0, which log-loss punishes heavily for actual churners.
# `.mean()` also avoids any integer-division pitfall of sum() / len().
percentage_churn = train_filtered["is_churn"].mean()
test["is_churn"] = percentage_churn
# For users on which we have more info, use the model prediction
test.loc[test_filtered.index, ["is_churn"]] = yt.reshape(-1, 1)

# Save as csv
submission = test.loc[:, ["is_churn"]]
submission.to_csv("data/submission.csv")