In [None]:
import pandas as pd
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## READING DATA

In [None]:
# Load the three raw parquet files (stores, transactions, products) with the
# fastparquet engine.
df_store = pd.read_parquet(
    '../../data/raw/part-00000-tid-2779033056155408584-f6316110-4c9a-4061-ae48-69b77c7c8c36-4-1-c000.snappy.parquet',
    engine="fastparquet",
)
df_transaction = pd.read_parquet(
    '../../data/raw/part-00000-tid-5196563791502273604-c90d3a24-52f2-4955-b4ec-fb143aae74d8-4-1-c000.snappy.parquet',
    engine="fastparquet",
)
df_product = pd.read_parquet(
    '../../data/raw/part-00000-tid-7173294866425216458-eae53fbf-d19e-4130-ba74-78f96b9675f1-4-1-c000.snappy.parquet',
    engine="fastparquet",
)

# The zip-code reference table is a semicolon-separated CSV.
df_zipcode = pd.read_csv(
    '../../data/raw/georef-zipcode.csv',
    sep=';',
)

## UNIFYING DATASETS

In [None]:
# Step 1: attach store attributes. Transactions are keyed by internal_store_id,
# stores by pdv; the left join keeps every transaction row even when the store
# has no match. reset_index() turns internal_store_id back into a column.
df = df_transaction.set_index("internal_store_id")
df = df.join(df_store.set_index("pdv"), how='left').reset_index()

# Step 2: attach product attributes the same way, matching internal_product_id
# against produto.
df = df.set_index("internal_product_id")
df = df.join(df_product.set_index("produto"), how='left').reset_index()

## INITIAL IDEAS

1. Create a unique identifier for `internal_store_id`, `internal_product_id`, and `week` (`transaction_date.dt.isocalendar().week`);
2. Target: `quantity`;
3. Columns not to use:
- `net_value` or `taxes`;
- `description`.
4. Drop rows with negative `quantity` values;
5. Round float values of `quantity` to integers;
6. Important notes:
- Use `types` instead of `category`: both columns encode the same categories, but `types` describes alcoholic products in slightly more detail, which can influence sales.
- Analyze the correlation matrix focused on the gross/net profit variables.
- 667 `internal_store_id` values are not in `df_store`. If we use them, we must fill the resulting NaN POS attributes with `Unknown` in the preprocessor step of our pipeline.
- To use zipcode, remember to create segments and categorize the column.

## DATA WRANGLING

In [None]:
# Number of rows per transaction_date, sorted so the dates with the fewest
# rows come first.
(
    df.groupby('transaction_date')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts')
)

### Dropping negative values and rounding `quantity`

In [None]:
# Keep only rows whose quantity is zero or positive (rows with a missing
# quantity fail the comparison and are dropped as well). The explicit .copy()
# gives `df` its own data, so the column assignment below does not write into
# a slice of the joined frame, which pandas reports as SettingWithCopyWarning.
df = df.loc[df['quantity'] >= 0].copy()

# Round fractional quantities to whole numbers and store them as integers.
df['quantity'] = df['quantity'].round().astype(int)

### Dropping some columns

In [None]:
# Remove columns that are not used downstream. The result is reassigned
# instead of using inplace=True: the frame may be a filtered slice of an
# earlier one, and in-place mutation of such an object can trigger pandas'
# copy/view warnings and makes the cell harder to reason about.
df = df.drop(columns=['taxes', 'categoria', 'descricao', 'reference_date'])

### Transform temporal features

In [None]:
# Parse the dates once into a standalone datetime Series.
# Why not `df.loc[:, 'transaction_date'] = ...`: on pandas 2.x that form tries
# to write the values into the existing column in place, so a column that was
# loaded as object/string can keep its object dtype and every `.dt` access
# below then raises AttributeError. Building the columns with assign() always
# replaces the column (and its dtype) and also avoids writing into a slice.
transaction_dates = pd.to_datetime(df['transaction_date'])

df = df.assign(
    transaction_date=transaction_dates,
    # Monday=0 ... Sunday=6
    day_of_week=transaction_dates.dt.dayofweek,
    month=transaction_dates.dt.month,
    # ISO-8601 week number; days around New Year can belong to week 52/53 or
    # week 1 of the neighbouring ISO year.
    week_of_year=transaction_dates.dt.isocalendar().week,
)

### Finding city by zipcode

In [None]:
# Give the reference table the same key name as the transactions frame.
# Reassigning (rather than inplace=True) keeps the cell free of in-place
# mutation while still leaving `df_zipcode` with the renamed columns.
df_zipcode = df_zipcode.rename(columns={'Zip Code': 'zipcode', 'Official USPS city name': 'city'})

# One city per zipcode: if the reference file lists a zipcode more than once,
# a left merge would duplicate every matching transaction row and inflate the
# quantities aggregated later. Keeping the first occurrence guarantees the
# merge leaves the number of rows in `df` unchanged.
city_by_zipcode = df_zipcode[['zipcode', 'city']].drop_duplicates(subset='zipcode')

# Left merge: transactions whose zipcode is not in the lookup get city = NaN.
df = df.merge(city_by_zipcode, on='zipcode', how='left')

## FEATURE ENGINEERING

### Creating average quantity purchased per SKU, city and month

In [None]:
# Mean quantity for every (product, city, month) combination present in df.
# NOTE(review): the mean is taken over every row of df, including the rows it
# is attached to below -- confirm this is acceptable for the downstream model.
month_keys = ['internal_product_id', 'city', 'month']
df_dict_month = (
    df.groupby(month_keys)['quantity']
    .mean()
    .reset_index(name='AVG_SALES_CITY_MONTH')
)

# Attach the monthly average to each row as a feature; rows without a matching
# group keep NaN.
df = df.merge(df_dict_month, how='left', on=month_keys)

### Creating average quantity purchased per SKU, city and week_of_year

In [None]:
# Mean quantity for every (product, city, week_of_year) combination present in df.
# NOTE(review): the mean is taken over every row of df, including the rows it
# is attached to below -- confirm this is acceptable for the downstream model.
week_keys = ['internal_product_id', 'city', 'week_of_year']
df_dict_week_of_year = (
    df.groupby(week_keys)['quantity']
    .mean()
    .reset_index(name='AVG_SALES_CITY_WEEK_OF_YEAR')
)

# Attach the weekly average to each row as a feature; rows without a matching
# group keep NaN.
df = df.merge(df_dict_week_of_year, how='left', on=week_keys)

### Creating average quantity purchased per SKU, city and day_of_week

In [None]:
# Mean quantity for every (product, city, day_of_week) combination present in df.
# NOTE(review): the mean is taken over every row of df, including the rows it
# is attached to below -- confirm this is acceptable for the downstream model.
dow_keys = ['internal_product_id', 'city', 'day_of_week']
df_dict_day_of_week = (
    df.groupby(dow_keys)['quantity']
    .mean()
    .reset_index(name='AVG_SALES_CITY_DAY_OF_WEEK')
)

# Attach the day-of-week average to each row as a feature; rows without a
# matching group keep NaN.
df = df.merge(df_dict_day_of_week, how='left', on=dow_keys)

## SAVING THE DATA

In [None]:
# Persist the processed dataset with the fastparquet engine; the DataFrame
# index is not written to the file.
df.to_parquet(
    '../../data/processed/processed_data.parquet',
    engine='fastparquet',
    index=False,
)