In [1]:
import pandas as pd
import numpy as np
from itertools import chain


In [2]:
customers = pd.read_csv("archive/customers.csv.zip")[["customer_id", "age"]]

In [3]:
df = pd.read_csv("archive/transactions_train.csv.zip", dtype={"article_id": str}, parse_dates=["t_dat"])

df = df.drop(["price", "sales_channel_id"], axis=1)

In [4]:
def reduce_mem_usage(df):
    """iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.select_dtypes(exclude=[np.datetime64]).columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
customers = reduce_mem_usage(customers)
df = reduce_mem_usage(df)

Memory usage of dataframe is 20.93 MB
Memory usage after optimization is: 13.08 MB
Decreased by 37.5%
Memory usage of dataframe is 727.58 MB
Memory usage after optimization is: 727.58 MB
Decreased by 0.0%


In [6]:
test_df = pd.read_csv('archive/sample_submission.csv').drop("prediction", axis=1)


In [7]:
print("Max `t_dat`:", df["t_dat"].max())
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index()
n_classes = active_articles.shape[0] + 1
active_articles.shape, n_classes

Max `t_dat`: 2020-09-22 00:00:00


((72581, 3), 72582)

In [8]:
# restrict the entries to articles that has appearance after 2019-01-01
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

(29634404, 3)

In [9]:
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7
print(df["week"].nunique())

105


In [10]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
customers.age = customers.age.fillna(0)
labels = ["-1:19","19:29", "29:39", "39:49", "49:59", "59:69", "69:119"]
customers['age_bins'] = pd.cut(customers['age'], listBin, labels=labels)


In [11]:
df = df.merge(customers[["customer_id","age_bins"]], on='customer_id', how='inner')

In [12]:
item_counts = []

for i, age in enumerate(labels):
    print(i,age)
    item_counts.append(df[(df.week < 2) & (df.age_bins == age)].article_id.value_counts())


0 -1:19
1 19:29
2 29:39
3 39:49
4 49:59
5 59:69
6 69:119


In [13]:
d = {}
most_frequent_items = []
for i, age in enumerate(labels):

    most_frequent_items = item_counts[i][:12].index.to_numpy().astype(str).astype(int)
    d[age] =  " ".join(map("{:010d}".format, most_frequent_items)) 


In [14]:
test_df = test_df.merge(customers[['customer_id', 'age_bins']], on = "customer_id", how='inner')

In [15]:
test_df["prediction"] = test_df["age_bins"].map(d)

In [16]:
test_df[["customer_id", "prediction"]].to_csv("submission_age.csv.gz", compression="gzip", index=False)

# result = 0.0064

In [17]:
df_tmp = df[df["week"] <= 7].sort_values(["customer_id", "t_dat"], ascending=False)


In [18]:
def keep_latest_k(articles, k=12):
    result = []
    # Use most commonly bought items to fill the empty spots
    for item in chain(articles, most_frequent_items):
        if item in result:
            continue
        result.append(item)
        if len(result) == k:
            break
    return result

In [19]:
df_latest_items = df_tmp.groupby("customer_id").agg({"article_id": keep_latest_k}).reset_index()
df_latest_items["prediction"] = df_latest_items.article_id.apply(lambda x: " ".join(map("{:010d}".format, x)))
df_latest_items.head()

ValueError: Unknown format code 'd' for object of type 'str'

In [None]:
test_df = test_df.drop('prediction', axis=1).merge(
    df_latest_items[["customer_id", "prediction"]], 
    how="left", on="customer_id")
test_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,


In [None]:
test_df["prediction"] = test_df["prediction"].fillna(prediction_str)
test_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0918522001 0751471001 0918292001 0448509014 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0918522001 0751471001 0918292001 0448509014 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0918522001 0751471001 0918292001 0448509014 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0751471001 0918292001 0448509014 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0918522001 0751471001 0918292001 0448509014 09...


In [None]:
test_df[["customer_id", "prediction"]].to_csv("submission_recently_purchased.csv.gz", compression="gzip", index=False)