In [1]:
# External imports
import pickle as pk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import scipy.stats as st
from sklearn import linear_model, tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold



In [2]:
# Internal imports
import importlib
import extraction as ex
import features as feat
ex = importlib.reload(ex)
feat = importlib.reload(feat)

In [3]:
# Read data

# Small tables
train = ex.read_train()
test = ex.read_test()
members = ex.read_members()
useful_msno = set.union(
    set(train.index.unique()),
    set(test.index.unique())
)


READING TRAIN


Memory usage (MB) : 8.33381652832
Index       7.407837
is_churn    0.925980
dtype: float64


READING TEST


Memory usage (MB) : 6.92345428467
Index    6.923454
dtype: float64


READING MEMBERS


Memory usage (MB) : 135.57332325
Index                     51.646980
city                       6.455873
bd                        12.911745
gender                     6.455873
registered_via             6.455873
registration_init_time    51.646980
dtype: float64



In [None]:
# Big tables
transactions = ex.read_transactions(useful_msno=useful_msno, max_lines=np.inf, chunksize=10**6)
user_logs = ex.read_user_logs(useful_msno=useful_msno, just_date=False, max_lines=np.inf, chunksize=10**6)

In [None]:
# For train set, pretend we don't know what happens in March
transactions_train = transactions[transactions["transaction_date"] < pd.Timestamp(2017, 3, 1)]
user_logs_train = user_logs[user_logs["date"] < pd.Timestamp(2017, 3, 1)]

In [None]:
# EXPLOIT MEMBERS

def exploit_members(members):
    
    # Registration init
    registration_init = feat.count_days(members["registration_init_time"], base_date=pd.Timestamp(2000, 1, 1))

    # Sum up members data
    members_data = pd.DataFrame(index=members.index)
    members_data["registration_init_time"] = registration_init
    
    return members_data

In [None]:
# EXPLOIT TRANSACTIONS

def exploit_transactions(transactions, dataset="train"):

    # Grouping transactions
    grouped_trans = transactions.groupby("msno")
    # Groupby indices
    trans_msno = grouped_trans.max().index

    # Latest transaction line for each user
    last_trans_indices = (transactions['transaction_date'] == grouped_trans['transaction_date'].transform(max))
    last_transactions = transactions[last_trans_indices].drop_duplicates("msno")
    last_transactions.index = last_transactions["msno"]
    # Reindex with the same indices as the groupby
    last_transactions = last_transactions.reindex(trans_msno).drop("msno", axis=1)
    
    # Latest transaction
    last_trans_date = feat.count_days(grouped_trans["transaction_date"].max(), dataset)
    # Planned expiration
    last_expiration = feat.count_days(grouped_trans["membership_expire_date"].max(), dataset)
    # Number of transactions
    count_trans = grouped_trans["transaction_date"].count()

    # Mean plan days
    mean_plan_days = grouped_trans["payment_plan_days"].mean()
    # Total plan days
    total_plan_days = grouped_trans["payment_plan_days"].sum()
    # Last plan days
    last_plan_days = last_transactions["payment_plan_days"]

    # Freq auto-renew
    freq_auto_renew = grouped_trans["is_auto_renew"].mean()
    # Last auto-renew
    last_auto_renew = last_transactions["is_auto_renew"]

    # Freq cancel
    freq_cancel = grouped_trans["is_cancel"].mean()
    # Last cancel
    last_cancel = last_transactions["is_cancel"]
    # Exist cancel
    exist_cancel = (grouped_trans["is_cancel"].min() > 0).astype(np.int8)

    # Mean price
    mean_price = grouped_trans["actual_amount_paid"].sum()
    # Total price
    total_price = grouped_trans["actual_amount_paid"].sum()

    # Majority payment method
    # majority_payment_method = grouped_trans["payment_method_id"].agg(lambda x:x.value_counts().index[0])
    # Last payment method
    last_payment_method = last_transactions["payment_method_id"]

    transactions_data = pd.DataFrame(index=trans_msno)
    transactions_data["Last_transaction_date"] = last_trans_date
    transactions_data["Planned_membership_expiration"] = last_expiration
    transactions_data["Count_transactions"] = count_trans
    transactions_data["Mean_plan_days"] = mean_plan_days
    transactions_data["Total_plan_days"] = total_plan_days
    transactions_data["Last_plan_days"] = last_plan_days
    transactions_data["Freq_auto_renew"] = freq_auto_renew
    transactions_data["Last_auto_renew"] = last_auto_renew
    transactions_data["Freq_cancel"] = freq_cancel
    transactions_data["Last_cancel"] = last_cancel
    transactions_data["Exist_cancel"] = exist_cancel
    transactions_data["Total_price"] = total_price
    # transactions_data["Majority_payment_method"] = majority_payment_method
    transactions_data["Last_payment_method"] = last_payment_method
    transactions_data = feat.categorize(transactions_data, "Last_payment_method")
    
    return transactions_data

In [None]:
# EXPLOIT USER LOGS

def exploit_user_logs(user_logs, dataset="train"):
    
    if dataset == "train":
        user_logs_last_month = user_logs[
            (user_logs["date"] > pd.Timestamp(2017, 2, 1)) &
            (user_logs["date"] < pd.Timestamp(2017, 3, 1))
        ]
    elif dataset == "test":
        user_logs_last_month = user_logs[
            (user_logs["date"] > pd.Timestamp(2017, 3, 1)) &
            (user_logs["date"] < pd.Timestamp(2017, 4, 1))
        ]
    
    # Grouping user logs
    grouped_logs = user_logs.groupby("msno")
    grouped_logs_last_month = user_logs_last_month.groupby("msno")
    # Storing indices
    logs_msno = grouped_logs.max().index

    # First log
    first_log = feat.count_days(grouped_logs["date"].min(), dataset).fillna(-np.inf)
    # Last log
    last_log = feat.count_days(grouped_logs["date"].max(), dataset).fillna(-np.inf)

    # Number of logs
    total_logs = grouped_logs["date"].count().fillna(0)
    total_logs_last_month = grouped_logs_last_month["date"].count().fillna(0)
    
    if len(user_logs.columns) > 2:
        # Total unique songs
        total_unique_songs = grouped_logs["num_unq"].sum().fillna(0)
        total_unique_songs_last_month = grouped_logs_last_month["num_unq"].sum().fillna(0)
        # Total 100% songs
        total_100_songs = grouped_logs["num_100"].sum().fillna(0)
        total_100_songs_last_month = grouped_logs_last_month["num_100"].sum().fillna(0)
        # Total seconds
        total_seconds = grouped_logs["total_secs"].sum().fillna(0.)
        total_seconds_last_month = grouped_logs_last_month["total_secs"].sum().fillna(0.)

    logs_data = pd.DataFrame(index = logs_msno)
    logs_data["Last_log"] = last_log
    logs_data["Total_logs"] = total_logs
    logs_data["Total_logs_last_month"] = total_logs_last_month
    
    if len(user_logs.columns) > 2:
        logs_data["Total_unique_songs"] = total_unique_songs
        logs_data["Total_unique_songs_last_month"] = total_unique_songs_last_month
        logs_data["Total_100_songs"] = total_100_songs
        logs_data["Total_100_songs_last_month"] = total_100_songs_last_month
        logs_data["Total_seconds"] = total_seconds
        logs_data["Total_seconds_last_month"] = total_seconds_last_month
    
    return logs_data

In [None]:
# Extract and create features
members_data = exploit_members(members)
transactions_train_data = exploit_transactions(transactions_train, dataset="train")
transactions_data = exploit_transactions(transactions, dataset="test")
logs_train_data = exploit_user_logs(user_logs_train, dataset="train")
logs_data = exploit_user_logs(user_logs, dataset="test")

In [None]:
members_data.to_csv("/tmp/kaggle/junk/members_data.csv")
transactions_train_data.to_csv("/tmp/kaggle/junk/transactions_train_data.csv")
transactions_data.to_csv("/tmp/kaggle/junk/transactions_data.csv")

In [4]:
transactions_train_data = pd.read_csv(
    "/tmp/kaggle/junk/transactions_train_data.csv",
    index_col=0
)
transactions_data = pd.read_csv(
    "/tmp/kaggle/junk/transactions_data.csv",
    index_col=0
)

In [5]:
data_list_train = [transactions_train_data]
data_list = [transactions_data]
# For now, memory error in dealing with the user logs
# data_list_train.append(logs_train_data)
# data_list.append(logs_data)
# For now, nothing useful in the members
# data_list_train.append(members_data)
# data_list.append(members_data)

In [6]:
# Add the data to the train set and test dataframes
train_full = train.join(data_list_train, how="inner")
test_full = test.join(data_list, how="inner")

In [7]:
# Keep only the features we want
features = test_full.columns # [c for c in test_full.columns if not "payment_method" in c]
train_filtered, test_filtered = feat.select_features(train_full, test_full, features)

# Normalize the columns
train_filtered, test_filtered = feat.normalize_features(train_filtered, test_filtered)

In [8]:
def log_loss_score_func(y_true, y_pred):
    return log_loss(
        y_true, y_pred, labels=[0, 1],
        eps=np.power(10., -15), normalize=True)

log_loss_scorer = make_scorer(
    score_func=log_loss_score_func,
    greater_is_better=False,
    needs_proba=True
)

In [12]:
# Here comes the machine learning

# Conversion into arrays for scikit-learn
x = np.array(train_filtered.drop("is_churn", axis=1))
y = np.array(train_filtered["is_churn"])
xt = np.array(test_filtered)

# Train a logistic regression
xgbclf = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.2)

xgbclf.fit(x, y, eval_metric="logloss")
yt = xgbclf.predict_proba(xt)[:, 1]

In [13]:
clf = xgb.XGBClassifier()
kf = KFold(n_splits=3)
scores = []
for ind_train, ind_test in kf.split(x):
    clf.fit(x[ind_train], y[ind_train], eval_metric="logloss")
    y_pred = clf.predict_proba(x[ind_test])
    scores.append(log_loss_score_func(y[ind_test], y_pred))
print("Cross validation log loss :", np.mean(scores))

Cross validation log loss : 0.194245291633


In [None]:
x = np.array(train_filtered.drop("is_churn", axis=1))
y = np.array(train_filtered["is_churn"])
xt = np.array(test_filtered)

xgbclf = xgb.XGBClassifier()

k = 1
param_grid = {
    "n_estimators": np.round(0.5 * np.power(10, np.linspace(2, 3, k))).astype(int),
    "max_depth": np.arange(3, 10, (10-3) // k),
    "learning_rate": np.power(10, np.linspace(-1.5, -0.2, k))
}

total_tasks=1
for k in param_grid:
    total_tasks *= len(param_grid[k])
print("Total tasks for grid search :", total_tasks, "\n")

gs = GridSearchCV(
    estimator=xgbclf,
    param_grid=param_grid,
    cv=3,
    scoring=log_loss_scorer,
    n_jobs=1, verbose=3)

gs.fit(x, y, eval_metric="logloss")  
best_xgbclf = gs.best_estimator_

pk.dump(gs.cv_results_, open("GS_results", "wb"))
pk.dump(gs.best_estimator_, open("GS_estimator", "wb"))

Total tasks for grid search : 1 

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] learning_rate=0.0316227766017, max_depth=3, n_estimators=50 .....
[CV]  learning_rate=0.0316227766017, max_depth=3, n_estimators=50, total=   7.1s
[CV] learning_rate=0.0316227766017, max_depth=3, n_estimators=50 .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


In [None]:
# Use best classifier for prediction
best_xgbclf.fit(x, y, eval_metric="logloss")
yt = best_xgbclf.predict_proba(xt)[:, 1]

# Zero prediction as baseline
percentage_churn = train_filtered["is_churn"].sum() / len(train_filtered)
test["is_churn"] = np.random.rand(len(test)) * percentage_churn
# For users on which we have more info, use it
test.loc[test_filtered.index, ["is_churn"]] = yt.reshape(-1, 1)

# Save as csv
submission = test.loc[:, ["is_churn"]]
submission.to_csv("data/submission.csv")