In [None]:
# Switch the working directory to the parent folder so the relative paths used
# below ("data/raw", "data/features") are resolved from there.
# NOTE(review): presumably the notebook lives one level below the project root - confirm.
# Re-running this cell without restarting the kernel moves up one more level each time.
cd ..

In [None]:
import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# Settings

In [None]:
# Folder containing the raw input CSV files (relative to the working directory).
DATA_RAW_PATH = "data/raw"
# Input file loaded into `volume`.
VOLUME_FILE = "gx_volume.csv"
# Input file loaded into `submissions`.
SUBMISSIONS_FILE = "submission_template.csv"
# Destination of the combined dataset written in the last cell.
OUTPUT_PATH = "data/features/dataset_months_encoded.csv"

# Functions

In [None]:
def month_name_to_number(month_name):
    """Convert an abbreviated month name (e.g. ``"Jan"``) to its number (1-12).

    Parsing is delegated to ``strptime`` with the ``%b`` directive, so the
    accepted spellings follow the active locale.

    Raises:
        ValueError: if ``month_name`` is not a recognised month abbreviation.
    """
    return datetime.datetime.strptime(month_name, "%b").month

def month_number_to_name(month_number):
    """Convert a month number (1-12) to its abbreviated name (e.g. ``"Jan"``).

    Raises:
        ValueError: if ``month_number`` is outside 1..12.
    """
    # Year and day are placeholders; only the month is formatted.
    first_of_month = datetime.date(1900, month_number, 1)
    return first_of_month.strftime("%b")

def encode_month(month_name):
    """Return the cyclical ``(sin, cos)`` encoding of an abbreviated month name.

    The month number is mapped to the angle ``2 * pi * month / 12``, so that
    December and January end up next to each other on the unit circle.

    Returns:
        tuple: ``(month_sin, month_cos)``.
    """
    angle = 2 * np.pi * month_name_to_number(month_name) / 12
    return np.sin(angle), np.cos(angle)

# Retrieve data

In [None]:
# Load the volume table; the first CSV column becomes the row index.
volume = pd.read_csv(f"{DATA_RAW_PATH}/{VOLUME_FILE}", index_col=0)
volume.head(2)

In [None]:
# Load the submission template; no index column is given, so rows get the default integer index.
submissions = pd.read_csv(f"{DATA_RAW_PATH}/{SUBMISSIONS_FILE}")
submissions.head(2)

# Create table

## Encode month in volume df

In [None]:
# Keep only the key columns and the month information.
# .copy() makes this an independent frame, so adding columns to it in the
# following cell does not raise SettingWithCopyWarning.
volume = volume[["country", "brand", "month_num", "month_name"]].copy()
volume.head(2)

In [None]:
# Add the cyclical month encoding as two new columns.
# `volume` was loaded with index_col=0, so its row labels come from the CSV.
# Assigning a DataFrame to columns aligns on those labels, not on position, so
# the helper frame must carry volume's own index; with a fresh 0..n-1 index the
# values could be shifted to other rows or replaced by NaN.
# NOTE(review): the target column is spelled "month_coss" (sic). The submissions
# cell uses the same spelling, which keeps both frames concatenable; rename it
# in both places (and in every consumer of the output file) at the same time.
volume[["month_sin", "month_coss"]] = pd.DataFrame(
    list(volume["month_name"].apply(encode_month)),
    columns=["month_sin", "month_cos"],
    index=volume.index,
)
volume.head(2)

## Encode month in submissions df

In [None]:
# From the rows with month_num == -1, keep the keys and the calendar month
# name, exposed under the column name "last_month".
volume_last_month = (
    volume.loc[volume["month_num"] == -1, ["country", "brand", "month_name"]]
    .reset_index(drop=True)
    .rename(columns={"month_name": "last_month"})
)
volume_last_month.head(2)

In [None]:
# Keep only the key columns; the month name is derived from them in the next cell.
submissions = submissions[["country", "brand", "month_num"]]
submissions.head(2)

In [None]:
# Derive the calendar month name of every submission row:
#   1. attach the month name found at month_num == -1 for the same country/brand
#      (left join, so submission rows are kept even without a match),
#   2. convert that name to a month number,
#   3. (month_num + last_month) % 12 + 1 gives the month of the year, so
#      month_num == 0 is the month right after "last_month" and Dec wraps to Jan,
#   4. convert back to the abbreviated name and drop the helper columns.
submissions = (
    submissions.merge(volume_last_month, on=["country", "brand"], how="left")
    .assign(last_month=lambda df: df["last_month"].apply(month_name_to_number))
    .assign(month_year=lambda df: (df["month_num"] + df["last_month"]) % 12 + 1)
    .assign(month_name=lambda df: df["month_year"].apply(month_number_to_name))
    .drop(columns=["last_month", "month_year"])
)
submissions.head(2)

In [None]:
# Add the cyclical month encoding as two new columns.
# Assigning a DataFrame to columns aligns on row labels, not on position, so the
# helper frame is built on submissions' own index instead of relying on
# `submissions` happening to have a default 0..n-1 index at this point.
# NOTE(review): the target column is spelled "month_coss" (sic). The volume cell
# uses the same spelling, which keeps both frames concatenable; rename it in
# both places (and in every consumer of the output file) at the same time.
submissions[["month_sin", "month_coss"]] = pd.DataFrame(
    list(submissions["month_name"].apply(encode_month)),
    columns=["month_sin", "month_cos"],
    index=submissions.index,
)
submissions.head(2)

# Create complete dataset

In [None]:
# Stack the volume rows and the submission rows into one frame.
# Each part keeps its own row labels here; the index is rebuilt after sorting.
dataset = pd.concat([volume, submissions])
dataset.head(2)

In [None]:
# Order rows by country, brand and month_num, then replace the index with a
# fresh 0..n-1 range (the old labels are discarded).
dataset = dataset.sort_values(["country", "brand", "month_num"]).reset_index(drop=True)
dataset.head(2)

# Save dataset

In [None]:
# Preview the first rows of the final dataset before writing it to disk.
dataset.head(10)

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is a plain 0..n-1 range and is not written.
dataset.to_csv(OUTPUT_PATH, index=False)