In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the store metadata, training and test tables from the data directory
df_store, df_train, df_test = map(
    pd.read_csv,
    ('../data/store.csv', '../data/train.csv', '../data/test.csv'),
)

In [None]:
# Inspect the column dtypes of the training dataframe
df_train.dtypes

In [None]:
# Preview the first five rows of the training dataframe
df_train.head(5)

In [None]:
def apply_date_format(df):
    """Parse the ``Date`` column and derive calendar helper columns.

    The frame is modified in place and also returned, so both
    ``apply_date_format(df)`` and ``df = apply_date_format(df)`` work.

    Columns written:
        Date:       converted with ``pd.to_datetime``.
        year:       calendar year of the date.
        month:      calendar month of the date.
        week:       ISO-8601 week number.
        year_month: ``'YYYY-MM'`` label (calendar year and month).
        year_week:  ``'GGGG-VV'`` label (ISO year and ISO week).

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object with the columns above added or overwritten.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date']

    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['week'] = dates.dt.isocalendar().week
    df['year_month'] = dates.dt.strftime('%Y-%m')
    # %V is the ISO week number, so it has to be paired with the ISO year (%G).
    # With the calendar year (%Y) the days around New Year get a wrong label:
    # 2014-12-29 belongs to ISO week 2015-01 but would be labelled '2014-01'
    # and be merged with the first week of January 2014.
    df['year_week'] = dates.dt.strftime('%G-%V')

    return df

In [None]:
# Parse 'Date' and add the year / month / week / year_month / year_week columns to the training data
df_train = apply_date_format(df=df_train)

In [None]:
# Summary statistics of the training columns, rounded to two decimals
df_train.describe().round(2)

In [None]:
# Count the missing values of every training column and show them as a table
nan_by_col = [
    {'column': col, 'nan_count': df_train[col].isna().sum()}
    for col in df_train.columns
]

pd.DataFrame(nan_by_col)

In [None]:
# Number of dated rows available for each store, and the extremes across stores
store_sales = df_train.groupby('Store')[['Date']].count()
print('Cantidad Mínima de puntos:', store_sales['Date'].min())
print('Cantidad Máxima de puntos:', store_sales['Date'].max())

In [None]:
# Stores whose row count is below 942
under_942 = store_sales.loc[store_sales['Date'].lt(942)]
print(f"Cantidad de tiendas con Menos de 942 días: {len(under_942)}")

In [None]:
# Number of distinct stores present in the training data
print(f'Cantidad de tiendas: {len(df_train.Store.unique())}')

In [None]:
# Number of distinct dates in the training data, plus its earliest and latest date
print(f'Cantidad de días: {len(df_train.Date.unique())}')
print(f'Minimo de días: {df_train.Date.min()}')
print(f'Maximo de días: {df_train.Date.max()}')

In [None]:
def plot_sales_time_series(df_to_plot, grouping_col):
    """Plot the total ``Sales`` for each value of ``grouping_col`` as a line.

    Args:
        df_to_plot: DataFrame containing a ``Sales`` column and ``grouping_col``.
        grouping_col: Name of the column whose values form the x axis; sales
            are summed over all rows sharing the same value.
    """
    sales_by_group = (
        df_to_plot[['Sales', grouping_col]]
        .groupby(grouping_col)
        .sum()
        .reset_index()
    )
    x_values = sales_by_group[grouping_col]
    y_values = sales_by_group['Sales']

    plt.plot(x_values, y_values)
    plt.title(f'Sales plot grouped by {grouping_col}')
    plt.show()

In [None]:
# Total sales of the whole training set per date
plot_sales_time_series(df_to_plot=df_train, grouping_col='Date')

In [None]:
# Total sales of the whole training set per year_week label
plot_sales_time_series(df_to_plot=df_train, grouping_col='year_week')

In [None]:
# Total sales of the whole training set per year_month label
plot_sales_time_series(df_to_plot=df_train, grouping_col='year_month')

In [None]:
# Plot the sales of one selected store, first per month and then per week
store = 1
df_selected_store = df_train[df_train['Store'].eq(store)]

plot_sales_time_series(df_to_plot=df_selected_store, grouping_col='year_month')

plot_sales_time_series(df_to_plot=df_selected_store, grouping_col='year_week')


In [None]:
# Plot the sales of the selected store,
# restricted to the dates on which it was open.
# The mask is built from df_selected_store itself (not from df_train) so it
# does not depend on index alignment, and the filtered frame is the one that
# is plotted; otherwise the closed days would still appear in the figure.
df_selected_store_open = df_selected_store.loc[df_selected_store['Open'] == 1]
plot_sales_time_series(
    df_to_plot=df_selected_store_open,
    grouping_col='Date'
)

In [None]:
# Look at the distribution of the selected store's sales with a histogram
df_selected_store_open.Sales.hist()

In [None]:
# Look at the most representative figures of the selected store's sales with a box plot
plt.boxplot(df_selected_store_open.Sales)

In [None]:
# Summary statistics of the selected store's sales, shown as a one-column table
df_selected_store_open['Sales'].describe().to_frame()

In [None]:
def plot_rolling_average_sales(df_to_plot, time_window: int):
    """Plot the sales series together with its centred rolling average.

    Args:
        df_to_plot: DataFrame with ``Date`` and ``Sales`` columns. Rows are
            averaged in their existing order (presumably one row per day,
            ordered by date — TODO confirm against the caller).
        time_window: Number of consecutive rows averaged together; must be
            at least 1.

    Raises:
        ValueError: If ``time_window`` is smaller than 1.
    """
    if time_window < 1:
        raise ValueError(f'time_window must be >= 1, got {time_window}')

    # A centred pandas window averages only the observations that exist, so
    # the curve does not sink towards zero at both ends (which happens with
    # np.convolve(mode='same') because it zero-pads), and the result always
    # has one value per row even when the window is longer than the series.
    rolling_avg_series = (
        df_to_plot['Sales']
        .rolling(window=time_window, center=True, min_periods=1)
        .mean()
    )

    # Daily sales
    plt.plot(df_to_plot['Date'], df_to_plot['Sales'], label='Sales')
    # Rolling Average sales
    plt.plot(df_to_plot['Date'], rolling_avg_series, label='Rolling average')

    plt.title(f'Rolling Average Mean for {time_window} days time window')
    plt.legend()
    plt.show()

In [None]:
# Rolling average of the selected store's open-day sales, window of 7
plot_rolling_average_sales(df_to_plot=df_selected_store_open, time_window=7)

In [None]:
# Rolling average of the selected store's open-day sales, window of 14
plot_rolling_average_sales(df_to_plot=df_selected_store_open, time_window=14)


In [None]:
# Rolling average of the selected store's open-day sales, window of 30
plot_rolling_average_sales(df_to_plot=df_selected_store_open, time_window=30)


In [None]:
# Rolling average of the selected store's open-day sales, window of 60
plot_rolling_average_sales(df_to_plot=df_selected_store_open, time_window=60)


In [None]:
# Sales and customers of the selected store over time.
# Each line carries a label and a legend is drawn, because the two series
# share one axis and cannot otherwise be told apart.
plt.plot(df_selected_store_open['Date'], df_selected_store_open['Sales'], label='Sales')
plt.plot(df_selected_store_open['Date'], df_selected_store_open['Customers'], label='Customers')

plt.title('Sales & Customers time series')
plt.legend()
plt.show()


In [None]:
# Scatter of customers (x axis) against sales (y axis) for the selected store
plt.scatter(
    x=df_selected_store_open['Customers'],
    y=df_selected_store_open['Sales'],
)
plt.title('Sales & Customers scatter plot')
plt.show()

In [None]:
def plot_multiple_sales_time_series(
    df_to_plot,
    segmentation_col: str,
    grouping_col: str,
    max_legend_entries: int = 12,
):
    """Plot one total-sales line per value of ``segmentation_col``.

    For every distinct value of ``segmentation_col`` the rows with that value
    are selected, ``Sales`` is summed per ``grouping_col`` value, and the
    result is drawn as one line on a shared figure.

    Args:
        df_to_plot: DataFrame containing ``Sales``, ``segmentation_col`` and
            ``grouping_col`` columns.
        segmentation_col: Column whose distinct values each get their own line.
        grouping_col: Column whose values form the x axis.
        max_legend_entries: A legend is drawn only when the number of lines
            does not exceed this value, so plots with many segments stay
            readable.
    """
    x_y_cols = ['Sales', grouping_col]

    # Segments come from the frame being plotted, not from the notebook-level
    # df_train, so the function works for any (possibly filtered) frame.
    segments = df_to_plot[segmentation_col].unique()

    for segment in segments:
        df_aux = (
            df_to_plot.loc[df_to_plot[segmentation_col] == segment, x_y_cols]
            .groupby(grouping_col)
            .sum()
            .reset_index()
        )
        plt.plot(df_aux[grouping_col], df_aux.Sales, label=str(segment))

    # Pin the lower limit only after every line is drawn: setting a limit
    # turns off y autoscaling, so doing it inside the loop would freeze the
    # upper limit at the first segment's range and clip later, larger lines.
    plt.ylim(bottom=0)

    if 0 < len(segments) <= max_legend_entries:
        plt.legend(title=segmentation_col)

    plt.title(f'Sales Time Series for {segmentation_col} grouped by {grouping_col}')
    plt.show()

In [None]:
# One line per year, total sales per month
plot_multiple_sales_time_series(df_to_plot=df_train, segmentation_col='year', grouping_col='month')

In [None]:
# One line per year, total sales per week number
plot_multiple_sales_time_series(df_to_plot=df_train, segmentation_col='year', grouping_col='week')

In [None]:
# Open days only: one line per week number, total sales per day of the week
df_train_open_days = df_train['Open'] == 1
plot_multiple_sales_time_series(df_to_plot=df_train[df_train_open_days], segmentation_col='week', grouping_col='DayOfWeek')

In [None]:
# Open days only: one line per year_week label, total sales per day of the week
plot_multiple_sales_time_series(df_to_plot=df_train[df_train['Open'] == 1], segmentation_col='year_week', grouping_col='DayOfWeek')

In [None]:
# Number of training rows per year
df_train.groupby('year')['year'].count()

In [None]:
# Number of training rows per year_month label
df_train.groupby('year_month')['year_month'].count()

In [None]:
# Number of training rows per year_week label
df_train.groupby('year_week')['year_week'].count()

In [None]:
# Parse 'Date' and add the year / month / week / year_month / year_week columns to the test data
df_test = apply_date_format(df=df_test)

In [None]:
# Number of test rows per year
df_test.groupby('year')['year'].count()

In [None]:
# Number of test rows per month
df_test.groupby('month')['month'].count()

In [None]:
# Number of test rows per year_month label
df_test.groupby('year_month')['year_month'].count()

In [None]:
# Number of test rows per year_week label
df_test.groupby('year_week')['year_week'].count()