In [12]:
import pandas as pd
from functools import reduce

# Play-count columns are parsed as nullable Int16. total_secs is parsed as float32
# first because (per the original note) the raw column holds floats and NaN, which
# cannot be read straight into int32; it is cleaned and cast further down.
count_cols = ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq"]
read_dtypes = {
    "msno": "category",
    **{name: "Int16" for name in count_cols},
    "total_secs": "float32",
}

user_logs = pd.read_csv(
    "../data/raw/user_logs.csv",
    usecols=["msno", "date", *count_cols, "total_secs"],
    dtype=read_dtypes,
    low_memory=True,
)

# Clean total_secs (NaN -> 0, round to whole seconds) and only then cast to int32
secs_rounded = user_logs["total_secs"].fillna(0).round()
user_logs["total_secs"] = secs_rounded.astype("int32")

# The date column is stored as YYYYMMDD; convert it to datetime
user_logs["date"] = pd.to_datetime(
    user_logs["date"], format="%Y%m%d", cache=True
)

# Taken from output of previous notebook
FINAL_T = pd.to_datetime('2016-08-23')

# Recent / mid / long term intervals as {name: (start, end)}. Each entry below is
# (name, days before FINAL_T where the interval starts, days before FINAL_T where it ends).
windows = {
    name: (FINAL_T - pd.Timedelta(days=far), FINAL_T - pd.Timedelta(days=near))
    for name, far, near in [("recent", 30, 0), ("mid", 90, 60), ("long", 180, 150)]
}


def aggregate_column_across_windows(logs, column, windows):
    """Sum one log column per user inside each time window.

    Parameters
    ----------
    logs : pandas.DataFrame
        Must contain the columns ``msno``, ``date`` and ``column``.
    column : str
        Name of the column to sum.
    windows : dict
        Maps a window name to a ``(start, end)`` pair. A row belongs to the
        window when ``start <= date < end``.

    Returns
    -------
    pandas.DataFrame
        One row per ``msno`` that has at least one log row in any window, with
        one ``"{window}_{column}"`` column per window. Windows in which a user
        has no rows are filled with 0.
    """
    feats = []

    for wname, (start, end) in windows.items():
        # Start is inclusive, end is exclusive
        in_window = (logs["date"] >= start) & (logs["date"] < end)

        agg = (
            logs.loc[in_window]
            .groupby("msno", observed=True)[column]
            .sum()
            .rename(f"{wname}_{column}")
            .reset_index()
        )

        feats.append(agg)

    # Outer merge keeps users that appear in only some of the windows
    out = reduce(
        lambda l, r: l.merge(r, on="msno", how="outer"),
        feats
    )

    # Fill only the feature columns. Filling the whole frame would also target
    # the msno key; when msno is categorical, pandas can reject 0 because it is
    # not one of the categories. This also avoids mutating the frame in place.
    feature_cols = [c for c in out.columns if c != "msno"]
    out[feature_cols] = out[feature_cols].fillna(0)

    return out

# Log columns to profile, one at a time
log_cols = [
    "num_25", "num_50", "num_75",
    "num_985", "num_100",
    "num_unq", "total_secs"
]

# One record per (column, window) with the per-class medians
univariate = []

# NOTE: `baseline` is not created in this cell; it must already exist in the kernel
# and provide the columns msno and is_churn.
for col in log_cols:
    window_sums = aggregate_column_across_windows(user_logs, col, windows)

    # Users without any log rows in the windows get 0 for every window feature
    labelled = (
        baseline[["msno", "is_churn"]]
        .merge(window_sums, on="msno", how="left")
        .fillna(0)
    )

    for w in windows:
        feature = f"{w}_{col}"
        medians = labelled.groupby("is_churn")[feature].median()
        churn_median = medians.loc[1]
        nonchurn_median = medians.loc[0]

        univariate.append(dict(
            column=col,
            window=w,
            median_churn=churn_median,
            median_nonchurn=nonchurn_median,
            # 1e-6 on both sides keeps the ratio finite when a median is 0
            ratio=(nonchurn_median + 1e-6) / (churn_median + 1e-6),
        ))


uni_df = pd.DataFrame(univariate)
uni_df.sort_values(["column", "window"])

MemoryError: Unable to allocate 1.00 MiB for an array with shape (65536,) and data type complex128

In [12]:
import pandas as pd
from functools import reduce

# Play-count columns are parsed as nullable Int16. total_secs is parsed as float32
# first because (per the original note) the raw column holds floats and NaN, which
# cannot be read straight into int32; it is cleaned and cast further down.
count_cols = ["num_25", "num_50", "num_75", "num_985", "num_100", "num_unq"]
read_dtypes = {
    "msno": "category",
    **{name: "Int16" for name in count_cols},
    "total_secs": "float32",
}

user_logs = pd.read_csv(
    "../data/raw/user_logs.csv",
    usecols=["msno", "date", *count_cols, "total_secs"],
    dtype=read_dtypes,
    low_memory=True,
)

# Clean total_secs (NaN -> 0, round to whole seconds) and only then cast to int32
secs_rounded = user_logs["total_secs"].fillna(0).round()
user_logs["total_secs"] = secs_rounded.astype("int32")

# The date column is stored as YYYYMMDD; convert it to datetime
user_logs["date"] = pd.to_datetime(
    user_logs["date"], format="%Y%m%d", cache=True
)

# Taken from output of previous notebook
FINAL_T = pd.to_datetime('2016-08-23')

# Recent / mid / long term intervals as {name: (start, end)}. Each entry below is
# (name, days before FINAL_T where the interval starts, days before FINAL_T where it ends).
windows = {
    name: (FINAL_T - pd.Timedelta(days=far), FINAL_T - pd.Timedelta(days=near))
    for name, far, near in [("recent", 30, 0), ("mid", 90, 60), ("long", 180, 150)]
}


def aggregate_column_across_windows(logs, column, windows):
    """Sum one log column per user inside each time window.

    Parameters
    ----------
    logs : pandas.DataFrame
        Must contain the columns ``msno``, ``date`` and ``column``.
    column : str
        Name of the column to sum.
    windows : dict
        Maps a window name to a ``(start, end)`` pair. A row belongs to the
        window when ``start <= date < end``.

    Returns
    -------
    pandas.DataFrame
        One row per ``msno`` that has at least one log row in any window, with
        one ``"{window}_{column}"`` column per window. Windows in which a user
        has no rows are filled with 0.
    """
    feats = []

    for wname, (start, end) in windows.items():
        # Start is inclusive, end is exclusive
        in_window = (logs["date"] >= start) & (logs["date"] < end)

        agg = (
            logs.loc[in_window]
            .groupby("msno", observed=True)[column]
            .sum()
            .rename(f"{wname}_{column}")
            .reset_index()
        )

        feats.append(agg)

    # Outer merge keeps users that appear in only some of the windows
    out = reduce(
        lambda l, r: l.merge(r, on="msno", how="outer"),
        feats
    )

    # Fill only the feature columns. Filling the whole frame would also target
    # the msno key; when msno is categorical, pandas can reject 0 because it is
    # not one of the categories. This also avoids mutating the frame in place.
    feature_cols = [c for c in out.columns if c != "msno"]
    out[feature_cols] = out[feature_cols].fillna(0)

    return out

# Log columns to profile, one at a time
log_cols = [
    "num_25", "num_50", "num_75",
    "num_985", "num_100",
    "num_unq", "total_secs"
]

# One record per (column, window) with the per-class medians
univariate = []

# NOTE: `baseline` is not created in this cell; it must already exist in the kernel
# and provide the columns msno and is_churn.
for col in log_cols:
    window_sums = aggregate_column_across_windows(user_logs, col, windows)

    # Users without any log rows in the windows get 0 for every window feature
    labelled = (
        baseline[["msno", "is_churn"]]
        .merge(window_sums, on="msno", how="left")
        .fillna(0)
    )

    for w in windows:
        feature = f"{w}_{col}"
        medians = labelled.groupby("is_churn")[feature].median()
        churn_median = medians.loc[1]
        nonchurn_median = medians.loc[0]

        univariate.append(dict(
            column=col,
            window=w,
            median_churn=churn_median,
            median_nonchurn=nonchurn_median,
            # 1e-6 on both sides keeps the ratio finite when a median is 0
            ratio=(nonchurn_median + 1e-6) / (churn_median + 1e-6),
        ))


uni_df = pd.DataFrame(univariate)
uni_df.sort_values(["column", "window"])

MemoryError: Unable to allocate 1.00 MiB for an array with shape (65536,) and data type complex128

In [None]:
import pandas as pd
import numpy as np


# Taken from output of previous notebook
FINAL_T = pd.to_datetime('2016-08-23')

# Project data directory, relative to the notebook's working directory. This is the
# same "../data" location the earlier cells read user_logs.csv from, and replaces
# machine-specific absolute paths. Change it here if the data lives elsewhere.
DATA_DIR = "../data"

# Feature-window transactions dataset produced by the previous week's work
baseline = pd.read_csv(
    f"{DATA_DIR}/processed/dataset_week3_pathA_final.csv"
)

# Raw user logs, restricted to the user key, the date and the seven log measures
user_logs = pd.read_csv(
    f"{DATA_DIR}/raw/user_logs.csv",
    usecols=[
        "msno", "date",
        "num_25", "num_50", "num_75",
        "num_985", "num_100",
        "num_unq", "total_secs"
    ]
)

# The date column is stored as YYYYMMDD; convert it to datetime
user_logs["date"] = pd.to_datetime(
    user_logs["date"], format="%Y%m%d"
)


def assign_window(d):
    """Return the name of the time window that date ``d`` falls into.

    Windows are measured in days before the module-level ``FINAL_T``:
    "recent" covers [30, 0), "mid" covers [90, 60) and "long" covers
    [180, 150) days back, with the older bound inclusive and the newer
    bound exclusive. Any other date yields "ignore".
    """
    # (label, days back to the inclusive start, days back to the exclusive end)
    bounds = (
        ("recent", 30, 0),
        ("mid", 90, 60),
        ("long", 180, 150),
    )

    for label, start_days_back, end_days_back in bounds:
        lower = FINAL_T - pd.Timedelta(days=start_days_back)
        upper = FINAL_T - pd.Timedelta(days=end_days_back)
        if lower <= d < upper:
            return label

    return "ignore"


# Add a "window" column that labels each row as recent / mid / long / ignore.
# assign_window is called once per distinct date and the label is mapped back onto
# the rows, instead of one Python call per log row.
unique_dates = user_logs["date"].drop_duplicates()
window_by_date = dict(zip(unique_dates, unique_dates.map(assign_window)))
user_logs["window"] = user_logs["date"].map(window_by_date).fillna("ignore")


# Remove rows that fall outside every window
user_logs = user_logs[user_logs["window"] != "ignore"]


# Sum the seven log measures for every (msno, window) pair.
# reset_index turns the (msno, window) group keys back into ordinary columns.
value_cols = [
    "num_25", "num_50", "num_75",
    "num_985", "num_100",
    "num_unq", "total_secs"
]
agg_logs = (
    user_logs
    .groupby(["msno", "window"])[value_cols]
    .sum()
    .reset_index()
)


# One row per msno; columns become a (measure, window) MultiIndex
behavior_features = agg_logs.pivot(
    index="msno",
    columns="window"
)

# pivot only creates columns for windows that actually occur in the data. Reindex to
# the full measure x window grid so the feature set is the same on every run; a
# window with no rows becomes NaN here and 0 after the fillna below.
behavior_features = behavior_features.reindex(
    columns=pd.MultiIndex.from_product([value_cols, ["long", "mid", "recent"]])
)

# Flatten column names to "<window>_<measure>"
behavior_features.columns = [
    f"{window}_{col}"
    for col, window in behavior_features.columns
]


# Bring msno back as a column; a user with no rows in a window gets 0 for it
behavior_features = behavior_features.reset_index().fillna(0)


# Attach the window features to the baseline table. The left join keeps every
# baseline row; users with no behaviour features get 0.
week4_dataset = baseline.merge(
    behavior_features,
    on="msno",
    how="left"
).fillna(0)

# The exploratory steps below use the name `temp`. It is the same merged frame,
# so alias it rather than running the identical merge a second time.
temp = week4_dataset


# Median of every behaviour feature per is_churn class, from a single groupby
feature_cols = [c for c in behavior_features.columns if c != "msno"]
medians = temp.groupby("is_churn")[feature_cols].median()

summary = [
    {
        "feature": col,
        "median_churn": medians.loc[1, col],
        "median_nonchurn": medians.loc[0, col]
    }
    for col in feature_cols
]

# A bare expression in the middle of a cell is not rendered, so keep the table in a
# variable and display it explicitly.
summary_df = pd.DataFrame(summary)
display(summary_df)


# Absolute pairwise correlation of all columns except the key and the target
# (kept in `corr`; not displayed here)
corr = (
    temp
    .drop(columns=["msno", "is_churn"])
    .corr()
    .abs()
)



# Baseline model: logistic regression on the week-4 feature table
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Target is is_churn; every column other than the msno key and the target is a feature
y = week4_dataset["is_churn"]
X = week4_dataset.drop(columns=["msno", "is_churn"])

# 80/20 split, stratified on the target, with a fixed seed
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

# ROC AUC on the validation split, scored with the probability in column 1 of
# predict_proba (presumably the is_churn == 1 class; confirm via model.classes_)
val_scores = model.predict_proba(X_val)[:, 1]
roc_week4 = roc_auc_score(y_val, val_scores)

roc_week4
