# Exploratory Data Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import sys, os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from file_handler import FileHandler
from eda import *


ImportError: cannot import name 'App_Logger' from 'app_logger' (C:\Users\dell\anaconda3\lib\site-packages\app_logger\__init__.py)

In [None]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

## Understanding the dataset

In [None]:
# create a FileHandler object
file_handler = FileHandler()

In [None]:
# reading the store csv file
store_df = file_handler.read_csv("../data/store.csv")
store_df.head(10)

In [None]:
# reading the sales training csv file
train_df = file_handler.read_csv("../data/train.csv")
train_df.head(10)

## General Statistics

In [None]:
descriptive_stats(store_df, size=True)

In [None]:
descriptive_stats(store_df, info=True)

In [None]:
descriptive_stats(store_df, describe=True)

### Train Dataset

In [None]:
descriptive_stats(train_df, size=True)

In [None]:
descriptive_stats(train_df, info=True)

In [None]:
descriptive_stats(train_df, describe=True)

## Handling missing values

### Store dataset

In [None]:
percent_missing_values(store_df)

In [None]:
missing_df = missing_values_table(store_df)

In [None]:
missing_df

 It shows that the first 3 columns and the next 2 columns have the same number of missing values. 

In [None]:
# number of rows with missing values for the whole dataset
count_missing_rows(store_df)

In [None]:
# number of rows with missing values for Promo2SinceWeek, Promo2SinceYear, and PromoInterval
count_missing_rows(store_df[['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']])

In [None]:
# number of rows with missing values for group 2 (CompetitionOpenSinceMonth, and CompetitionOpenSinceYear)
count_missing_rows(store_df[['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']])

Each columuns in these groups have missing values in the same rows as their groupmates. These means the missing values in these groups appear at the same time as a cluster. Therefore, the values of the columns in each group share a common information.

In [None]:
# dataframe containing the missing rows for columns in group 1
group1_df = store_df[store_df['Promo2SinceWeek'].isna()]
group1_df.head(10)

We can see that the Promo2 column is 0 for the first 10 rows. From intution we can infer that if there is no promotion the values for the columns in Promo2SinceWeek, Promo2SinceYear, and PromoInterval should be null.

Therefore, we can impute the missing values in the columns Promo2SinceWeek and Promo2SinceYear with 0 to convey the meaning of absence since year and week can't be 0. But as for PromoInterval further investigation is needed since its data type is object.

In [None]:
# check the exact data type of the object
pd.api.types.infer_dtype(store_df['PromoInterval'])

In [None]:
# all the unique values 
store_df['PromoInterval'].value_counts()

For the column PromoInterval, we will impute it with '0,0,0,0' as the other values follow list-type format containing four months. So inorder to impute these columns we should first investigate if the values we will insert exist in each column as this will oppose our intention.

In [None]:
store_df[store_df['Promo2SinceWeek'] == 0].shape

In [None]:
store_df[store_df['Promo2SinceYear'] == 0].shape

In [None]:
store_df[store_df['PromoInterval'] == '0,0,0,0'].shape

Since the values don't appear in these columns, let's fill the missing values by '0'.

In [None]:
fix_missing_value(store_df, ['Promo2SinceWeek', 'Promo2SinceYear'], 0)

In [None]:
fix_missing_value(store_df, ['PromoInterval'], '0,0,0,0')

In [None]:
# dataframe containing the missing rows for columns in the second group
group2_df = store_df[store_df['CompetitionOpenSinceMonth'].isna()]
group2_df.head(10)

In [None]:
# all unique values for each column in the dataframe
unique_values_df(group2_df)

Since there is no pattern between each columns the conclusion would be that CompetitionOpenSinceMonth and CompetitionOpenSinceYear are null because there was already a competition when the store was opened. So let's create another column called CompetitionBeforeStoreOpened which holds values of 0 and 1, 0 indicating the absence of competition at the time the store was opened and 1 indicating the presence of competition at the time the store was opened.

In [None]:
# deriving new column
store_df['CompetitionBeforeStoreOpened'] = store_df['CompetitionOpenSinceYear'].apply(lambda x: 1 if np.isnan(x) else 0)
store_df

Impututing the missing values in the column CompetitionOpenSinceYear and CompetitionOpenSinceMonth with its minimun value and 1 respectively. 

In [None]:
min_year = store_df['CompetitionOpenSinceYear'].min()
min_year

In [None]:
# impute the column CompetitionOpenSinceYear with 1900 and the column CompetitionOpenSinceMonth with 1
fix_missing_value(store_df, ['CompetitionOpenSinceYear'], min_year)
fix_missing_value(store_df, ['CompetitionOpenSinceMonth'], 1)

Handling missing values of column CompetitionDistance

In [None]:
temp_df = store_df[store_df['CompetitionDistance'].isna()]
temp_df

In [None]:
unique_values_df(temp_df)

All rows with missing values in the column CompetitionDistance has a value of 1 in the column CompetitionBeforeStoreOpened. This is because the competitions for these stores are very far away and have almost no impact on these stores that they weren't measured when collecting the data. For this reason, I will impute thess values with the maximum competition distance.

In [None]:
max_dist = store_df['CompetitionDistance'].max()
max_dist


In [None]:
fix_missing_value(store_df, ['CompetitionDistance'], max_dist)

In [None]:
# final check for missing values
percent_missing_values(store_df)

### Train dataset

In [None]:
percent_missing_values(train_df)

## Handling data types

In [None]:
store_df.dtypes

In [None]:
# get the columns with object data type
string_columns = store_df.select_dtypes(include='object').columns.tolist()
string_columns

In [None]:
convert_to_string(store_df, string_columns)

In [None]:
#Converting the columns to int as they cannot be floats.
convert_to_int(store_df, ['CompetitionOpenSinceMonth',  'CompetitionOpenSinceYear',
        'Promo2SinceWeek', 'Promo2SinceYear'])

In [None]:
store_df.dtypes

In [None]:
#Checking if there is mixed data type in the train dataset.
show_cols_mixed_dtypes(train_df)

In [None]:
train_df['StateHoliday'].value_counts()

In [None]:
train_df['StateHoliday'].value_counts().index

In [None]:
convert_to_string(train_df, ['StateHoliday'])

In [None]:
train_df.dtypes

In [None]:
train_df.head()

In [None]:
convert_to_datetime(train_df, ['Date'])

In [None]:
train_df.dtypes

## Handling duplicates

In [None]:
# search for duplicate rows and drop them
drop_duplicates(store_df)

In [None]:
# search for duplicate rows and drop them
drop_duplicates(train_df)

## Handling Date column

In [None]:
train_df['Year'] = train_df['Date'].apply(lambda x: x.year)
train_df['Month'] = train_df['Date'].apply(lambda x: x.month)
train_df['DayOfMonth'] = train_df['Date'].apply(lambda x: x.day)
train_df['WeekOfYear'] = train_df['Date'].apply(lambda x: x.weekofyear)
train_df['weekday'] = train_df['DayOfWeek'].apply(lambda x: 0 if (x in [6, 7]) else 1)

In [None]:
train_df.sample(10)

In [None]:
def getMonth(month_list, index):
    months = ['0', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    month_list = month_list.split(',')
    month = month_list[index]
    return months.index(month)

In [None]:
# split the PromoInterval column into 4 columns
store_df['PromoInterval0'] = store_df.PromoInterval.apply((lambda x: getMonth(x, 0)))
store_df['PromoInterval1'] = store_df.PromoInterval.apply((lambda x: getMonth(x, 1)))
store_df['PromoInterval2'] = store_df.PromoInterval.apply((lambda x: getMonth(x, 2)))
store_df['PromoInterval3'] = store_df.PromoInterval.apply((lambda x: getMonth(x, 3)))

In [None]:
store_df.sample(10)

In [None]:
# save the clean dataframe to a csv file
file_handler.to_csv(train_df, '../data/eda/train.csv')
file_handler.to_csv(store_df, '../data/eda/store.csv')