# DATASET INTEGRATION

In [8]:
import pandas as pd

# Load the raw (v0) customer shopping export; the cells below derive the later versions from it.
raw_csv_path = '../datasets/customer_shopping_data_v0.csv'
df = pd.read_csv(raw_csv_path)

#### v1 = date format is converted from DD-MM-YY to YYYY-MM-DD

In [9]:
# v1: parse the day-first invoice dates into real datetimes, then persist the result.
# dayfirst=True makes an ambiguous value such as 05/08 read as 5 August, not 8 May.
invoice_dates = pd.to_datetime(df['invoice_date'], dayfirst=True)
df['invoice_date'] = invoice_dates
df.to_csv('../datasets/customer_shopping_data_v1.csv', index=False)

#### v2 = 'season' column is added

In [10]:
def get_season(month):
    """Return the season name for a calendar month number.

    Args:
        month: Month number, expected in 1-12.

    Returns:
        'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Fall' for 9-11; the string 'Null' for any other value
        (for example a missing month).
    """
    season_months = (
        ((12, 1, 2), 'Winter'),
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
        ((9, 10, 11), 'Fall'),
    )
    for months, label in season_months:
        if month in months:
            return label
    # Nothing matched: keep the sentinel string rather than raising.
    return 'Null'

# v2: reload the v1 snapshot and label every row with the season of its invoice month.
df = pd.read_csv('../datasets/customer_shopping_data_v1.csv')
invoice_months = pd.to_datetime(df['invoice_date']).dt.month
df['season'] = invoice_months.apply(get_season)

df.to_csv('../datasets/customer_shopping_data_v2.csv', index=False)
# Read the file back so `df` holds exactly what was written to disk for the cells that follow.
df = pd.read_csv('../datasets/customer_shopping_data_v2.csv')

#### v3 = 'is_weekday' column is added

In [13]:
def is_weekday(day):
    """Return 1 for a weekday and 0 for a weekend day.

    Args:
        day: Weekday number where Monday is 0 and Sunday is 6
            (the numbering produced by pandas' ``.dt.weekday``).

    Returns:
        1 when ``day`` is below 5 (0-4, Monday-Friday), otherwise 0
        (5-6, Saturday-Sunday).
    """
    return 1 if day < 5 else 0

# v3: flag each transaction as weekday (1) or weekend (0) from its invoice date.
invoice_weekdays = pd.to_datetime(df['invoice_date']).dt.weekday  # Monday=0 ... Sunday=6
df['is_weekday'] = invoice_weekdays.apply(is_weekday)

df.to_csv('../datasets/customer_shopping_data_v3.csv', index=False)

#### v4 = 'is_holiday' column is added

In [1]:
import pandas as pd
import holidays

# v4: tag every transaction with a holiday code looked up from a daily calendar.
df = pd.read_csv('../datasets/customer_shopping_data_v3.csv')
df['invoice_date'] = pd.to_datetime(df['invoice_date'])

# Calendar with one row per day from 2021-01-01 through 2023-12-31.
date_range = pd.date_range(start='2021-01-01', end='2023-12-31')
df_dates = pd.DataFrame({'date': date_range})
calendar_days = df_dates['date'].dt.date  # datetime.date objects used for the membership checks below

# Official holidays come from the `holidays` package; weekends are weekday numbers 5 and 6.
turkiye_holidays = holidays.Turkey(years=[2021, 2022, 2023])
official_mask = calendar_days.isin(set(turkiye_holidays.keys()))
weekend_mask = df_dates['date'].dt.weekday >= 5

# Hand-maintained extra full-day holidays (currently none).
manual_full_holidays = [
]
manual_full_holidays = pd.to_datetime(manual_full_holidays).date
full_mask = calendar_days.isin(manual_full_holidays)

# Hand-maintained half-day holidays (currently none).
manual_half_holidays = [
]
manual_half_holidays = pd.to_datetime(manual_half_holidays).date
half_mask = calendar_days.isin(manual_half_holidays)

# Codes: 0 = ordinary day, 2 = manual half-day holiday,
# 1 = official holiday, weekend or manual full-day holiday.
# The 1 is written last, so it wins when a day is both a half-day and a full-day match.
df_dates['is_holiday'] = 0
df_dates.loc[half_mask, 'is_holiday'] = 2
df_dates.loc[official_mask | weekend_mask | full_mask, 'is_holiday'] = 1

# df_dates now holds only 'date' and 'is_holiday'; join it on the invoice date
# and discard the duplicated join key.
df = (
    df.merge(df_dates, left_on='invoice_date', right_on='date', how='left')
      .drop(columns=['date'])
)

df.to_csv('../datasets/customer_shopping_data_v4.csv', index=False)

#### v5 = economic indicator columns are added by matching on year and month

In [7]:
import pandas as pd

# v5: attach the economic indicator columns to every transaction by (Year, Month).
df_shopping = pd.read_csv("../datasets/customer_shopping_data_v4.csv")
df_index = pd.read_csv("../datasets/economic_confident_indexes_extracted.csv", sep=";")

# Temporary join keys derived from the invoice date; they are dropped again after the merge.
df_shopping["invoice_date"] = pd.to_datetime(df_shopping["invoice_date"])
df_shopping["Year"] = df_shopping["invoice_date"].dt.year
df_shopping["Month"] = df_shopping["invoice_date"].dt.month

# validate="many_to_one" makes pandas raise MergeError if the indicator file has
# more than one row for the same (Year, Month). Without it, such duplicates would
# silently multiply the matching transaction rows in the output.
df_merged = pd.merge(
    df_shopping,
    df_index,
    how="left",
    on=["Year", "Month"],
    validate="many_to_one",
)
df_merged = df_merged.drop(columns=["Year", "Month"])

df_merged.to_csv("../datasets/customer_shopping_data_v5.csv", index=False)