# Data preparation before import

In [None]:
import os
import warnings

import pandas as pd

# Suppress every warning for the rest of the session (this hides pandas
# deprecation notices as well as anything else routed through `warnings`).
warnings.filterwarnings("ignore")

In [None]:
# Directory the cleaned CSV inputs are read from (see `load_df`).
CLEANED_DATA_PATH = "../data/cleaned"
# Default output directory for the JSON files written by `save_df_json`.
MONGO_DATA_PATH = "../data/mongo"
# Default output directory for the CSV files written by `save_df`.
POSTGRES_DATA_PATH = "../data/postgres"

## Load data

In [75]:
def load_df(name: str) -> pd.DataFrame:
    """Read one cleaned dataset into a DataFrame.

    Args:
        name: Base name of the CSV file (without the ``.csv`` extension)
            inside ``CLEANED_DATA_PATH``.

    Returns:
        The contents of ``<CLEANED_DATA_PATH>/<name>.csv`` as parsed by
        ``pandas.read_csv`` with default options.
    """
    csv_path = os.path.join(CLEANED_DATA_PATH, "{}.csv".format(name))
    return pd.read_csv(csv_path)

## Save data

In [76]:
def save_df_json(name: str, dataset: pd.DataFrame, path: str = MONGO_DATA_PATH):
    """Write ``dataset`` as newline-delimited JSON to ``<path>/<name>.json``.

    Each row becomes one JSON object on its own line
    (``orient="records"`` with ``lines=True``).

    Args:
        name: Base name of the output file (without the ``.json`` extension).
        dataset: DataFrame to serialise.
        path: Output directory; created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    # NOTE: `index=False` is intentionally not passed. The "records" orient
    # never writes the index, and older pandas releases raise ValueError when
    # `index=False` is combined with any orient other than "split"/"table".
    dataset.to_json(
        os.path.join(path, f"{name}.json"),
        orient="records",
        lines=True,
    )


def save_df(name: str, dataset: pd.DataFrame, path: str = POSTGRES_DATA_PATH):
    """Write ``dataset`` as CSV to ``<path>/<name>.csv`` without the index.

    Args:
        name: Base name of the output file (without the ``.csv`` extension).
        dataset: DataFrame to serialise.
        path: Output directory; created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    dataset.to_csv(os.path.join(path, f"{name}.csv"), index=False)

## Preparation

### MongoDB

In [77]:
def generate_mongo():
    """Export the cleaned datasets as JSON files via ``save_df_json``.

    All five datasets are loaded first. The campaigns ``id`` column is
    replaced by a string-typed ``campaign_id`` column; for every other
    dataset the columns listed below are cast to ``str`` before saving.
    """
    # Dataset name -> columns that must be stored as strings.
    # The insertion order drives both the load and the save order.
    string_columns = {
        "client_first_purchase_date": ["client_id", "user_id"],
        "events": ["product_id", "category_id", "user_id"],
        "friends": ["friend1", "friend2"],
        "messages": ["client_id", "campaign_id", "user_id"],
    }

    # Load everything up front so a missing input fails before any write.
    campaigns = load_df("campaigns")
    frames = {dataset: load_df(dataset) for dataset in string_columns}

    # Campaigns: expose `id` as a string `campaign_id` and drop the original.
    campaigns["campaign_id"] = campaigns["id"].astype(str)
    campaigns = campaigns.drop(columns=["id"])
    save_df_json("campaigns", campaigns)

    # Clients, events, friends, messages: cast the listed columns, then save.
    for dataset, columns in string_columns.items():
        frame = frames[dataset]
        frame[columns] = frame[columns].astype(str)
        save_df_json(dataset, frame)


# generate_mongo()

### PostgreSQL

In [80]:
def generate_postgres():
    """Export the cleaned datasets as CSV files via ``save_df``.

    Builds and saves, in order: ``products`` (one row per ``product_id``
    taken from the events), ``friends`` (unchanged), ``clients`` (first
    purchase dates merged with the client columns found in the messages),
    ``events`` and ``messages`` (with the columns moved to other tables
    removed) and ``campaigns`` (``id`` renamed to ``campaign_id``, count-like
    columns cast to pandas' nullable ``Int64``). Progress is printed before
    each table is built.
    """
    campaigns = load_df("campaigns")
    first_purchases = load_df("client_first_purchase_date")
    events = load_df("events")
    friends = load_df("friends")
    messages = load_df("messages")

    # Products
    print("Building 'products'...")
    product_columns = ["product_id", "category_id", "category_code", "brand", "price"]
    distinct_products = events[product_columns].drop_duplicates()
    # One row per product; each attribute takes its first value in the group.
    # The key order below fixes the output column order.
    product_attributes = dict.fromkeys(
        ["brand", "price", "category_id", "category_code"], "first"
    )
    products = distinct_products.groupby(["product_id"], as_index=False).agg(
        product_attributes
    )
    save_df("products", products)

    # Friends
    print("Building 'friends'...")
    save_df("friends", friends)

    # Clients
    print("Building 'clients'...")
    client_keys = ["client_id", "user_id", "user_device_id"]
    message_clients = messages[client_keys + ["email_provider"]].drop_duplicates()
    # Outer merge keeps clients that appear in only one of the two sources.
    merged_clients = first_purchases.merge(
        message_clients, on=client_keys, how="outer"
    )
    clients = merged_clients.groupby(client_keys, as_index=False).agg(
        {"first_purchase_date": "first", "email_provider": "first"}
    )
    save_df("clients", clients)

    # Events
    print("Building 'events'...")
    events_out = events.drop(columns=["category_id", "category_code", "brand"])
    save_df("events", events_out)

    # Campaign
    print("Building 'campaigns'...")
    nullable_int_columns = ["total_count", "position", "hour_limit", "subject_length"]
    campaigns_out = campaigns.rename(columns={"id": "campaign_id"}).astype(
        dict.fromkeys(nullable_int_columns, pd.Int64Dtype())
    )
    save_df("campaigns", campaigns_out)

    # Messages
    print("Building 'messages'...")
    messages_out = messages.drop(
        columns=["id", "email_provider", "user_id", "user_device_id"]
    )
    save_df("messages", messages_out)


# Run the PostgreSQL export; it prints one progress line per table.
generate_postgres()

Building 'products'...
Building 'friends'...
Building 'clients'...
Building 'events'...
Building 'campaigns'...
Building 'messages'...
