In [None]:
cd ../..

In [2]:
import datetime
import numpy as np
import pandas as pd

# Settings

In [3]:
DATA_RAW_PATH = "data/raw"
VOLUME_FILE = "gx_volume.csv"
SUBMISSIONS_FILE = "submission_template.csv"
OUTPUT_PATH = "data/features/months_encoded.csv"

# Functions

In [4]:
def month_name_to_number(month_name):
    datetime_object = datetime.datetime.strptime(month_name, "%b")
    return datetime_object.month

def month_number_to_name(month_number):
    return datetime.date(1900, month_number, 1).strftime("%b")

def encode_month(month_name):
    month_number = month_name_to_number(month_name)
    month_sin = np.sin(2 * np.pi * month_number/12)
    month_cos = np.cos(2 * np.pi * month_number/12)
    return month_sin, month_cos

# Retrieve data

In [5]:
volume = pd.read_csv(f"{DATA_RAW_PATH}/{VOLUME_FILE}", index_col=0)
volume.head(2)

Unnamed: 0,country,brand,volume,month_num,month_name
1,country_1,brand_3,18509088.6,-88,Jul
2,country_1,brand_3,19697508.0,-87,Aug


In [6]:
submissions = pd.read_csv(f"{DATA_RAW_PATH}/{SUBMISSIONS_FILE}")
submissions.head(2)

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,,,
1,country_1,brand_121,1,,,


# Create table

## Encode month in volume df

In [7]:
volume = volume[["country", "brand", "month_num", "month_name"]]
volume.head(2)

Unnamed: 0,country,brand,month_num,month_name
1,country_1,brand_3,-88,Jul
2,country_1,brand_3,-87,Aug


In [8]:
volume[["month_sin", "month_coss"]] = pd.DataFrame(list(volume["month_name"].apply(encode_month)), columns=["month_sin", "month_cos"])
volume.head(2)

Unnamed: 0,country,brand,month_num,month_name,month_sin,month_coss
1,country_1,brand_3,-88,Jul,-0.866025,-0.5
2,country_1,brand_3,-87,Aug,-1.0,-1.83697e-16


## Encode month in submissions df

In [9]:
volume_last_month = volume[volume["month_num"] == -1].copy()
volume_last_month = volume_last_month.reset_index(drop=True)
volume_last_month = volume_last_month[["country", "brand", "month_name"]]
volume_last_month = volume_last_month.rename(columns={"month_name": "last_month"})
volume_last_month.head(2)

Unnamed: 0,country,brand,last_month
0,country_1,brand_3,Oct
1,country_1,brand_4,Oct


In [10]:
submissions = submissions[["country", "brand", "month_num"]]
submissions.head(2)

Unnamed: 0,country,brand,month_num
0,country_1,brand_121,0
1,country_1,brand_121,1


In [11]:
submissions = submissions.merge(volume_last_month, on=["country", "brand"], how="left")
submissions["last_month"] = submissions["last_month"].apply(month_name_to_number)
submissions["month_year"] = (submissions["month_num"] + submissions["last_month"]) % 12 + 1
submissions["month_name"] = submissions["month_year"].apply(month_number_to_name)
submissions = submissions.drop(columns=["last_month", "month_year"])
submissions.head(2)

Unnamed: 0,country,brand,month_num,month_name
0,country_1,brand_121,0,Jun
1,country_1,brand_121,1,Jul


In [12]:
submissions[["month_sin", "month_coss"]] = pd.DataFrame(list(submissions["month_name"].apply(encode_month)), columns=["month_sin", "month_cos"])
submissions.head(2)

Unnamed: 0,country,brand,month_num,month_name,month_sin,month_coss
0,country_1,brand_121,0,Jun,1.224647e-16,-1.0
1,country_1,brand_121,1,Jul,-0.5,-0.866025


# Create complete dataset

In [13]:
dataset = pd.concat([volume, submissions])
dataset.head(2)

Unnamed: 0,country,brand,month_num,month_name,month_sin,month_coss
1,country_1,brand_3,-88,Jul,-0.866025,-0.5
2,country_1,brand_3,-87,Aug,-1.0,-1.83697e-16


In [14]:
dataset = dataset.sort_values(["country", "brand", "month_num"]).reset_index(drop=True)
dataset.head(2)

Unnamed: 0,country,brand,month_num,month_name,month_sin,month_coss
0,country_1,brand_10,-47,Jul,-0.866025,-0.5
1,country_1,brand_10,-46,Aug,-1.0,-1.83697e-16


# Save dataset

In [15]:
dataset.head(10)

Unnamed: 0,country,brand,month_num,month_name,month_sin,month_coss
0,country_1,brand_10,-47,Jul,-0.8660254,-0.5
1,country_1,brand_10,-46,Aug,-1.0,-1.83697e-16
2,country_1,brand_10,-45,Sep,-0.8660254,0.5
3,country_1,brand_10,-44,Oct,-0.5,0.8660254
4,country_1,brand_10,-43,Nov,-2.449294e-16,1.0
5,country_1,brand_10,-42,Dec,0.5,0.8660254
6,country_1,brand_10,-41,Jan,0.8660254,0.5
7,country_1,brand_10,-40,Feb,1.0,6.123234000000001e-17
8,country_1,brand_10,-39,Mar,0.8660254,-0.5
9,country_1,brand_10,-38,Apr,0.5,-0.8660254


In [16]:
dataset.to_csv(OUTPUT_PATH, index=False)