# Hi!35 - full code for SupplAi.

## Compact version

### Generating the data sets.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Register tqdm's `progress_apply` on pandas objects; the row-wise pass over
# products further down uses it to display a progress bar.
tqdm.pandas()

In [2]:
# Raw inputs. The train and test files are semicolon-separated; the GSCPI file
# is read with pandas' default separator.
train_data, X_set = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("./data/train-data.csv", "./data/X_test.csv")
)
GSCPI_data = pd.read_csv("./data/GSCPI_data.csv")

In [3]:
# Keep a single training row per value of the "index" column (first one wins).
train_data = train_data.drop_duplicates(subset="index")

# Stack training and test rows so every product's history is gathered in one place.
fill_data = pd.concat([train_data, X_set])

# Period label plus the four monthly columns of that period.
date_features = ['Date', *(f"Month {k}" for k in range(1, 5))]

# One row per product; each cell is the list of that column's values over all
# of the product's rows (lists are aligned across the five columns).
product_sales = (
    fill_data
    .groupby('id_product')[date_features]
    .agg(list)
)

In [4]:
# Column labels "Month 1" ... "Month 36", in chronological order.
months_features = np.array([f"Month {i}" for i in range(1, 37)])

# Period label -> 0-based position in `months_features` of the period's first
# month. Every period occupies four consecutive positions.
chrono_quarter = {
    "sep-dec 2020": 0,
    "jan-apr 2021": 4,
    "may-aug 2021": 8,
    "sep-dec 2021": 12,
    "jan-apr 2022": 16,
    "may-aug 2022": 20,
    "sep-dec 2022": 24,
    "jan-apr 2023": 28,
    "may-jul 2023": 32,
}

def compute_sales(row):
    """Lay one product's per-period sales out on the global 36-month timeline.

    Parameters
    ----------
    row : mapping
        ``row['Date']`` is a sequence of period labels (keys of
        ``chrono_quarter``); ``row['Month 1']`` ... ``row['Month 4']`` are
        sequences aligned with it, holding the four monthly values of each
        period.

    Returns
    -------
    dates : numpy.ndarray of str
        Always the full, chronologically ordered ``months_features`` labels.
    values : numpy.ndarray of str
        Textual value for each label; ``'nan'`` for months the product has no
        record for. If a period appears more than once, the last occurrence
        wins.

    Raises
    ------
    KeyError
        If a period label is not a key of ``chrono_quarter``.

    Notes
    -----
    The layout is identical for every product (same labels, same order, same
    length). Returning only the months a product actually has, sorted as
    strings, makes the output depend on the product and lets a consumer that
    labels all rows with one product's labels misplace values; sorting
    (label, value) pairs can also fail on mixed str/float values when a label
    repeats.
    """
    # Start from an all-missing timeline and fill in what the product has.
    timeline = np.full(len(months_features), "nan", dtype=object)
    for i, period in enumerate(row['Date']):
        start = chrono_quarter[period]
        for j in range(4):
            # str() mirrors the text representation downstream parsing expects
            # (a float NaN becomes the string 'nan').
            timeline[start + j] = str(row[f"Month {j+1}"][i])
    return months_features.copy(), timeline.astype(str)

# Row-wise pass over the per-product frame, with a tqdm progress bar.
# `compute_sales` returns a (labels, values) pair, so `applied` holds one such
# pair per product, indexed like `product_sales`.
applied = product_sales.progress_apply(compute_sales, axis = 1)

  0%|          | 0/244857 [00:00<?, ?it/s]

In [5]:
def int_parser(s):
    """Normalise one raw sales value before the float conversion.

    Returns ``np.nan`` for the string ``'nan'``; otherwise returns ``str(s)``
    with all whitespace removed (e.g. ``'1 234'`` -> ``'1234'``).
    """
    return np.nan if s == 'nan' else "".join(str(s).split())

# Split each product's (labels, values) pair.
features = [x[0] for x in applied.values]
values = [x[1] for x in applied.values]

# Build one {month label: value} record per product so that every value is
# placed under its OWN label, then project onto the 36 chronological columns.
# Months a product has no value for come out as NaN. Labelling all rows with
# the first product's labels would misplace values whenever products do not
# cover exactly the same months.
product_sales = pd.DataFrame(
    [dict(zip(labels, sales)) for labels, sales in zip(features, values)],
    index=applied.index,
).reindex(columns=months_features)

# Values are still text at this point: clean them and store as float32.
for f in months_features:
    product_sales[f] = product_sales[f].apply(int_parser).astype(np.float32)

In [6]:
# Columns describing a product (spelling follows the column names in the data).
product_features = [
    'id_product',
    'Region',
    'Country',
    'Site',
    'Operations',
    'Zone',
    'Cluster',
    'Reference proxy',
    'Product  Line proxy',
    'Division proxy',
    'Customer Persona proxy',
    'Strategic Product Family proxy',
    'Product Life cycel status',
]

# One row per product: the first row found for each id supplies its attributes.
product_info = (
    fill_data
    .drop_duplicates(subset="id_product")
    .loc[:, product_features]
    .set_index("id_product")
)

In [7]:
# Per-product table: descriptive attributes next to the 36 monthly sales columns.
product_data = product_info.join(product_sales).reset_index()

# "Month 4", "Month 8", ..., "Month 36": every fourth monthly column.
_fourth_months = [f'Month {i}' for i in range(4, 37, 4)]

# Training rows keyed by product: the row's own "Month 4" becomes the target
# and its other three monthly columns are discarded.
_train_targets = (
    train_data
    .rename(columns={'Month 4': 'target'})
    .drop(columns=['Month 1', 'Month 2', 'Month 3'])
    .set_index('id_product')
)

# Attach the product's monthly sales history, minus every fourth month.
completed_train_data = (
    _train_targets
    .join(product_sales)
    .drop(columns=_fourth_months)
    .reset_index()
)

# The target is still raw text: clean it the same way as the monthly columns.
completed_train_data['target'] = (
    completed_train_data['target']
    .apply(int_parser)
    .astype(np.float32)
)

In [8]:
# Name of the four-month period each calendar month number belongs to.
_period_of_month = {
    month: name
    for name, months in (
        ("jan-apr", range(1, 5)),
        ("may-aug", range(5, 9)),
        ("sep-dec", range(9, 13)),
    )
    for month in months
}

# May-August 2023 is filed under the label "may-jul 2023".
# NOTE(review): this includes 2023-08 in a period named "may-jul" - confirm
# that this is intended.
_period_aliases = {"may-aug 2023": "may-jul 2023"}

# "YYYY-MM" -> period label, for 2021-01 through 2023-08.
# NOTE(review): there are no entries for 2020-09..2020-12, so rows dated
# "sep-dec 2020" (if any) get no GSCPI value from a join on this label.
GSCPI_map = {}
for _year, _last_month in ((2021, 12), (2022, 12), (2023, 8)):
    for _month in range(1, _last_month + 1):
        _label = f"{_period_of_month[_month]} {_year}"
        GSCPI_map[f"{_year}-{_month:02d}"] = _period_aliases.get(_label, _label)

# Keep only the GSCPI months that map to a period and tag each with its period
# label. The result is built as a new frame with .loc/.assign: adding a column
# to a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
GSCPI_data = (
    GSCPI_data
    .loc[GSCPI_data["Year-Month"].isin(GSCPI_map.keys())]
    .assign(Date=lambda df: df["Year-Month"].map(GSCPI_map))
)

# Mean GSCPI per period, computed over rows without missing values.
gscpi_by_period = (
    GSCPI_data
    .dropna()
    .groupby(["Date"])["GSCPI"]
    .mean()
)

# Left-join the period mean onto the training rows via their "Date" label.
# Any GSCPI column left by a previous run of this cell is dropped first, so
# re-running the cell does not fail on overlapping column names.
completed_train_data = (
    completed_train_data
    .drop(columns="GSCPI", errors="ignore")
    .set_index("Date")
    .join(gscpi_by_period)
    .reset_index()
)

In [10]:
# Persist both tables as semicolon-separated CSV. `to_csv` also writes the row
# index (its default), so each file starts with an unnamed index column.
for _frame, _csv_path in (
    (product_data, "./data/product_data.csv"),
    (completed_train_data, "./data/completed_train_data.csv"),
):
    _frame.to_csv(_csv_path, sep=";")

Last run: all files were generated on a laptop in under 3 minutes 30 seconds.