In [1]:
# External imports
import pandas as pd
import numpy as np

In [2]:
# External imports 2
from sklearn import linear_model, svm, ensemble
from sklearn import dummy
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, make_scorer

In [23]:
# Internal imports: project-local helper modules.
# `ex` provides the table readers and `feat` the feature helpers used below.
import importlib
import extraction as ex
import features as feat
# Reload both modules so that edits made to their source files after the
# first import are picked up without restarting the kernel.
ex = importlib.reload(ex)
feat = importlib.reload(feat)

In [4]:
# All dataframes except user logs.
# In the saved output each reader prints a banner; train/test/members also
# report their memory usage, and the transactions reader reports chunk progress.
train = ex.read_train()
test = ex.read_test()
members = ex.read_members()
# The transactions reader receives train and test plus a cap on lines read.
# NOTE(review): presumably train/test are used to keep only known users --
# confirm in extraction.py.
transactions = ex.read_transactions(train, test, max_lines=10**8)


READING TRAIN


Memory usage (MB) : 8.33381652832
Index       7.407837
is_churn    0.925980
dtype: float64


READING TEST


Memory usage (MB) : 6.92345428467
Index    6.923454
dtype: float64


READING MEMBERS


Memory usage (MB) : 135.57332325
Index                     51.646980
city                       6.455873
bd                        12.911745
gender                     6.455873
registered_via             6.455873
registration_init_time    51.646980
dtype: float64


READING TRANSACTIONS

Chunk 0 of table 1 read
Chunk 1 of table 1 read
Chunk 2 of table 1 read
Chunk 3 of table 1 read
Chunk 4 of table 1 read
Chunk 5 of table 1 read
Chunk 6 of table 1 read
Chunk 7 of table 1 read
Chunk 8 of table 1 read
Chunk 9 of table 1 read
Chunk 10 of table 1 read
Chunk 11 of table 1 read
Chunk 12 of table 1 read
Chunk 13 of table 1 read
Chunk 14 of table 1 read
Chunk 15 of table 1 read
Chunk 16 of table 1 read
Chunk 17 of table 1 read
Chunk 18 of table 1 read
Chunk 19 of table 1 read
Chunk 20 o

In [27]:
# Read user logs, chunk by chunk, with a cap on the number of lines.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# `user_logs` may be undefined in a kernel that replays this run; the cells
# that use it further down have no execution count.
user_logs = ex.read_user_logs(train, test, max_lines=10**7, chunksize=10**6)


READING USER LOGS

Chunk 0 of table 1 read
Chunk 1 of table 1 read
Chunk 2 of table 1 read
Chunk 3 of table 1 read
Chunk 4 of table 1 read
Chunk 5 of table 1 read
Chunk 6 of table 1 read
Chunk 7 of table 1 read
Chunk 8 of table 1 read
Chunk 9 of table 1 read
Chunk 10 of table 1 read
Chunk 11 of table 1 read
Chunk 12 of table 1 read
Chunk 13 of table 1 read
Chunk 14 of table 1 read
Chunk 15 of table 1 read
Chunk 16 of table 1 read
Chunk 17 of table 1 read
Chunk 18 of table 1 read
Chunk 19 of table 1 read
Chunk 20 of table 1 read
Chunk 21 of table 1 read
Chunk 22 of table 1 read
Chunk 23 of table 1 read
Chunk 24 of table 1 read
Chunk 25 of table 1 read
Chunk 26 of table 1 read
Chunk 27 of table 1 read
Chunk 28 of table 1 read
Chunk 29 of table 1 read
Chunk 30 of table 1 read
Chunk 31 of table 1 read
Chunk 32 of table 1 read
Chunk 33 of table 1 read
Chunk 34 of table 1 read
Chunk 35 of table 1 read
Chunk 36 of table 1 read
Chunk 37 of table 1 read
Chunk 38 of table 1 read
Chunk 39 of tab

KeyboardInterrupt: 

In [5]:
# Get useful users, appearing in the tables we are interested in.
# user_logs=None: user logs are not passed in, so only the members and
# transactions tables take part in the selection (the saved output lists
# exactly these two).
train_useful = feat.get_useful_users(train, members=members, transactions=transactions, user_logs=None)
test_useful = feat.get_useful_users(test, members=members, transactions=transactions, user_logs=None)

Getting useful users
- Members
- Transactions
Getting useful users
- Members
- Transactions


In [6]:
# Members table: work on a copy that no longer carries the raw registration
# date column (that column is turned into its own feature separately).
modified_members = members.copy().drop(
    labels=["registration_init_time"],
    axis="columns",
)

In [7]:
# Treat every remaining member attribute as a categorical variable.
# The columns are processed in a fixed order: city, bd, gender, registered_via.
for member_column in ("city", "bd", "gender", "registered_via"):
    modified_members = feat.categorize(modified_members, member_column)

In [8]:
# Registration date, converted with feat.count_days and stored as a
# one-column frame that shares the index of the categorized members table.
registration_days = feat.count_days(members["registration_init_time"])
registration_init = pd.DataFrame(
    registration_days,
    index=modified_members.index,
    columns=["registration_init_time"],
)

In [9]:
# Transactions grouped per user ("msno" is the user key); reused by the
# per-user aggregations that follow.
grouped_trans = transactions.groupby(by="msno")

In [10]:
# Latest transaction date and latest planned expiration date per user.
# Only the two date columns are aggregated: taking the max of every
# transaction column and discarding the rest afterwards is wasted work.
date_columns = ["transaction_date", "membership_expire_date"]
latest_trans = grouped_trans[date_columns].max()
# feat.count_days is applied to each date column in turn
latest_trans = latest_trans.apply(feat.count_days)
latest_trans.columns = ["latest_transaction_date", "planned_membership_expire_date"]

In [11]:
# Mean transaction duration per user.
# Duration of one transaction = planned expiration date - transaction date,
# converted with feat.count_days.
trans_dates = transactions.loc[:, ["msno", "membership_expire_date", "transaction_date"]]
trans_dur = feat.count_days(
    trans_dates["membership_expire_date"] - trans_dates["transaction_date"]
)
trans_dates = trans_dates.assign(mean_transaction_duration=trans_dur)
# Average only the duration column: the raw date columns are not needed in
# the result, so there is no reason to aggregate them as well.
mean_trans_dur = trans_dates.groupby("msno")[["mean_transaction_duration"]].mean()

In [12]:
# Auto-renew and cancel: per-user mean of the two flags, i.e. the fraction of
# a user's transactions with the flag set.
# The columns are selected before aggregating so that the mean is computed
# only where it is needed and does not depend on every other transaction
# column supporting a mean.
trans_caracs = grouped_trans[["is_auto_renew", "is_cancel"]].mean()
trans_caracs.columns = ["auto_renew_freq", "cancel_freq"]

In [None]:
# User logs grouped per user ("msno" is the user key); reused by the
# per-user log aggregations that follow.
grouped_logs = user_logs.groupby(by="msno")

In [None]:
# Per-user mean of every log column, with each column name prefixed by "mean_"
mean_logs = grouped_logs.mean().rename(columns=lambda name: "mean_" + name)

In [None]:
# Latest listening session per user.
# Only the "date" column is aggregated; computing the max of every log
# column and then keeping one of them is wasted work on a large table.
latest_session = grouped_logs[["date"]].max()
latest_session = latest_session.apply(feat.count_days)
latest_session.columns = ["latest_listening_session"]

In [None]:
# Number of listening sessions per user = number of non-null log dates.
# Only the "date" column is counted instead of counting every log column
# and discarding all but one.
logs_count = grouped_logs[["date"]].count()
logs_count.columns = ["number_listening_sessions"]

In [13]:
# Merge all extracted data
members_data = [modified_members, registration_init]
transactions_data = [latest_trans, mean_trans_dur, trans_caracs]
# User-log features are currently left out; to include them, use
# user_logs_data = [latest_session, logs_count, mean_logs]
user_logs_data = []

# Column order of the merged frames: user logs, then transactions, then members
feature_tables = user_logs_data + transactions_data + members_data

# Build each big dataframe with a single concat instead of growing it inside
# a loop (every intermediate concat would copy the whole frame again).
# Each feature table is first restricted/aligned to the useful users.
train_full = pd.concat(
    [train_useful] + [table.reindex(train_useful.index) for table in feature_tables],
    axis=1,
)
test_full = pd.concat(
    [test_useful] + [table.reindex(test_useful.index) for table in feature_tables],
    axis=1,
)

In [14]:
# Normalize features.
# Only the columns present in test are considered, so the target is untouched.
for c in test_full.columns:
    train_column = train_full[c]
    # Columns whose training values all lie in [0, 1] (binary indicators,
    # frequencies) are left as they are
    if train_column.max() > 1.001 or train_column.min() < -0.001:
        # Standardize in train, and in test with the scale parameters of train
        m = train_column.mean()
        s = train_column.std()
        # A constant column (s == 0) or an undefined deviation (NaN) would
        # turn the feature into inf/NaN: leave such a column unscaled
        if not s > 0:
            continue
        train_full[c] = (train_column - m) / s
        test_full[c] = (test_full[c] - m) / s

In [15]:
# Preview the first five rows of the merged training frame
train_full.head(n=5)

Unnamed: 0_level_0,is_churn,latest_transaction_date,planned_membership_expire_date,mean_transaction_duration,auto_renew_freq,cancel_freq,city_1,city_3,city_4,city_5,...,registered_via_9,registered_via_10,registered_via_11,registered_via_13,registered_via_14,registered_via_16,registered_via_17,registered_via_18,registered_via_19,registration_init_time
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,-0.384389,-0.496194,-0.170748,0.833333,0.166667,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.091752
f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,-0.070034,-0.263737,-0.10383,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.091752
zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,0.015699,0.995403,-0.088343,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.095397
8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,-10.92956,5.005281,4.532287,0.965517,0.103448,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.107241
K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,0.072854,2.835685,1.632687,1.0,0.09375,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.121819


In [18]:
# Features fed to the model
features = [
    "latest_transaction_date",
    "registration_init_time",
    "mean_transaction_duration",
    "auto_renew_freq",
    "cancel_freq",
]
# Optional: also use the one-hot registration mode columns
# registration_modes = [int(c[15:]) for c in train_full.columns if c[:14] == "registered_via"]
# features += ["registered_via_" + str(i) for i in registration_modes]

# Train keeps the target next to the features; test only has the features
train_columns = features + ["is_churn"]
train_filtered = train_full.loc[:, train_columns]
test_filtered = test_full.loc[:, features]

In [19]:
# Here comes the machine learning

# Conversion into arrays for scikit-learn
x = np.array(train_filtered.drop("is_churn", axis=1))
y = np.array(train_filtered["is_churn"])
xt = np.array(test_filtered)

# Train a logistic regression
clf = linear_model.LogisticRegression()
# clf = linear_model.Ridge(alpha=0.)
clf.fit(x, y)

# `proba` records whether the estimator can output class probabilities; the
# cross-validation scorer reuses it. An explicit capability check is used
# instead of catching AttributeError, which would also hide unrelated
# AttributeErrors raised while predicting.
proba = hasattr(clf, "predict_proba")
if proba:
    # Probability of belonging to class 1 (and not 0)
    yt = clf.predict_proba(xt)[:, 1]
else:
    # No probabilities for this estimator: predict the value directly and
    # restrict it to the interval [0, 1]
    yt = np.clip(clf.predict(xt), 0., 1.)

In [20]:
# Perform cross-validation


def binary_log_loss(y_true, y_pred):
    """Normalized log-loss over the two churn classes (0 and 1).

    The labels are passed explicitly so that a fold containing a single
    class is still scored against both classes.
    """
    return log_loss(
        y_true, y_pred, labels=[0, 1],
        eps=np.power(10., -15), normalize=True)


# Log-loss is a loss: lower is better. Declaring it with
# greater_is_better=False makes scikit-learn negate it, so that "higher
# score = better model" holds wherever this scorer is used.
log_loss_scorer = make_scorer(
    score_func=binary_log_loss,
    greater_is_better=False,
    needs_proba=proba
)
# `scores` holds one negated log-loss per fold
scores = cross_val_score(
    estimator=clf,
    X=x,
    y=y,
    cv=5,
    scoring=log_loss_scorer
)
# Flip the sign back to report the usual, positive log-loss
print("CV score (log-loss) : {}".format(-scores.mean()))

CV score (log-loss) : 0.20387236196231484


In [21]:
# Baseline for the test users we have no member/transaction information on:
# the churn rate observed in the training data. A constant prior is
# reproducible from run to run (the previous unseeded random draw was not)
# and, under log-loss, is never worse on average than noise spread below it.
percentage_churn = train_filtered["is_churn"].sum() / len(train_filtered)

# Build the submission as its own frame so that `test` is left unmodified
# and the cells above can be re-run safely
submission = pd.DataFrame({"is_churn": percentage_churn}, index=test.index)
# For users on which we have more info, use the model prediction
submission.loc[test_filtered.index, "is_churn"] = yt

# Save as csv
submission.to_csv("data/submission.csv")