# SUPERVISED LEARNING

In [None]:
# Dependent Variable (Target): Weekly_Sales
# Independent Variables (Features): Store, Type, Size, Dept, Date, IsHoliday, Temperature, Fuel_Price, MarkDown1 to MarkDown5, CPI, Unemployment

# READING CSV FILE:

In [None]:
import pandas as pd

In [None]:
store = pd.read_csv('/content/stores_data_set.csv')

In [None]:
sales = pd.read_csv('/content/sales_data_set.csv')

In [None]:
features = pd.read_csv('/content/Features_data_set.csv')

In [None]:
store.columns

In [None]:
sales.columns

In [None]:
features.columns

In [None]:
store.shape

In [None]:
sales.shape

In [None]:
features.shape

In [None]:
# Merge the DataFrames on common columns
# Step 1: attach the store-level columns to every sales row (inner join on 'Store').
# NOTE(review): 'Date' is joined below exactly as read from the CSV files (it is
# only converted to datetime in a later cell), so this presumably relies on both
# files using the same date text format - confirm.
df = pd.merge(store, sales, on=['Store'], how='inner')
# Step 2: attach the features rows that match on ('Store', 'Date').
# Inner joins keep only rows with a match on both sides. Columns that exist in
# both inputs but are not join keys receive pandas' default '_x' / '_y'
# suffixes; the cleaning cells below depend on that for 'IsHoliday_x' and
# 'IsHoliday_y'.
df = pd.merge(df, features, on=['Store', 'Date'], how='inner')

In [None]:
df.head()

# Data Cleaning

In [None]:
# Drop the redundant column 'IsHoliday_y'
df.drop('IsHoliday_y', axis=1, inplace=True)

In [None]:
df.rename(columns={'IsHoliday_x': 'IsHoliday'}, inplace=True)

In [None]:
df

In [None]:
# checking data type
df.dtypes

In [None]:
# Convert 'Date' column to datetime format

df['Date']  = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

In [None]:
# 'IsHoliday' column will have values of 1 where it was True and 0 where it was False.
df['IsHoliday'] = df['IsHoliday'].astype(int)

In [None]:
# Categorical Feature Counts:
categorical_counts = df['Type'].value_counts()
print(categorical_counts)

In [None]:
# Mapping 'Type' categories to numeric values
type_mapping = {'A': 1, 'B': 2, 'C': 3}
df['Type'] = df['Type'].map(type_mapping)

In [None]:
df.dtypes

# Handling Missing Values:

In [None]:
# Per-column count of missing entries
missing_values = df.isna().sum()

# The same counts expressed as a percentage of the total number of rows
percentage_missing = missing_values.div(len(df)).mul(100)

# Side-by-side summary table, one row per column of df
missing_data = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage': percentage_missing,
    }
)

print(missing_data)


In [None]:
# MarkDown columns ---> a markdown is a promotional discount or price reduction (weekly deduction for all departments).
# Markup ---> the difference between the cost of a product or service and its selling price; the price is increased based on demand (weekly).

In [None]:
# Fill missing values with zeros, because a missing entry means the store gave no markdown or markup for that particular week
df[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']] = df[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']].fillna(0)

In [None]:
df.isnull().sum()

# De_duplication

In [None]:
a = df.duplicated().sum()

In [None]:
print("Total Duplicated datapoints:",a)

# EXPLORATORY DATA ANALYSIS(EDA):

In [None]:
df.info()

In [None]:
# Summary Statistics:
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Boxplot for Weekly Sales by Store Type
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.boxplot(x='Type', y='Weekly_Sales', data=df)
plt.xlabel('Store Type')
plt.ylabel('Weekly Sales')
plt.title('Weekly Sales Distribution by Store Type')
plt.show()


In [None]:
import seaborn as sns

# Countplot for Store Types
plt.figure(figsize=(8, 6))
sns.countplot(x='Type', data=df)
plt.xlabel('Store Type')
plt.ylabel('Count')
plt.title('Distribution of Store Types')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Group by 'Date' and calculate the sum of 'Weekly_Sales'
weekly_sales_over_time = df.groupby('Date')['Weekly_Sales'].sum()

# Plotting
plt.figure(figsize=(12, 6))
plt.plot(weekly_sales_over_time, marker='o', linestyle='-', color='b')
plt.title('Weekly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.grid(True)
plt.show()


In [None]:
# Correlation Heatmap:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Correlate numeric (and boolean) columns only. At this point df still holds
# the datetime 'Date' column, and pandas 2.0 changed DataFrame.corr() to
# default to numeric_only=False, so non-numeric columns are no longer dropped
# automatically. Selecting the columns explicitly keeps the cell working on
# both old and new pandas versions.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()

# Set up the matplotlib figure
plt.figure(figsize=(12, 10))

# Create a heatmap with annotations (two decimals per cell)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

# Set the title of the plot
plt.title('Correlation Heatmap')

# Display the plot
plt.show()

# Print the correlation values
print("\nCorrelation Matrix:")
print(correlation_matrix)


In [None]:
# There is a positive correlation between 'Size' and 'Weekly_Sales', indicating that larger stores tend to have higher weekly sales.
# 'IsHoliday' shows a slight positive correlation with 'MarkDown3', 'MarkDown4', and 'MarkDown5', suggesting that markdowns might be more prevalent during holidays.
# 'Unemployment' has a negative correlation with 'CPI', indicating a potential relationship between unemployment rates and consumer price index.

# Feature Engineering:

In [None]:
# Calculate 'Markdown_Total' by summing up all Markdowns
df['Markdown_Total'] = df[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']].sum(axis=1)

In [None]:
# Feature engineering for date-related features
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

In [None]:
# Drop the original 'Date' column
df = df.drop('Date', axis=1)

In [None]:
df.drop(columns=['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'], inplace=True)

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of seaborn
sns.set_style("whitegrid")

# Histograms of numerical features
numerical_features = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Markdown_Total']
df[numerical_features].hist(bins=20, figsize=(15, 10), layout=(2, 3))
plt.suptitle('Histograms of Numerical Features', fontsize=16)
plt.show()

# Box plots of numerical features
plt.figure(figsize=(15, 6))
sns.boxplot(data=df[numerical_features])
plt.title('Box Plots of Numerical Features', fontsize=16)
plt.xticks(rotation=45)
plt.show()

# Scatter plots of numerical features against Weekly_Sales
plt.figure(figsize=(15, 6))
for i, feature in enumerate(['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Markdown_Total'], start=1):
    plt.subplot(2, 3, i)
    sns.scatterplot(x=feature, y='Weekly_Sales', data=df, alpha=0.5)
    plt.title(f'Scatter Plot: {feature} vs Weekly_Sales', fontsize=12)
plt.tight_layout()
plt.show()

# Correlation matrix
correlation_matrix = df[numerical_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features', fontsize=16)
plt.show()


# Identifying Outliers

In [None]:
def plot(df, column):
    """Show a box plot and a KDE-overlaid histogram of ``column`` side by side.

    Args:
        df: DataFrame holding the data to draw.
        column: Name of the column in ``df`` to visualise.

    A new 20x5 figure is created and displayed with ``plt.show()``. The
    panels occupy the first two slots of a 1x3 grid; the third slot is
    left empty.
    """
    plt.figure(figsize=(20, 5))

    # Slot 1: box plot of the column
    box_ax = plt.subplot(1, 3, 1)
    sns.boxplot(x=column, data=df, ax=box_ax)
    box_ax.set_title(f'Box Plot for {column}')

    # Slot 2: 50-bin histogram with a kernel density estimate on top
    hist_ax = plt.subplot(1, 3, 2)
    sns.histplot(x=column, data=df, bins=50, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution Plot for {column}')

    plt.show()

In [None]:
for i in ['Weekly_Sales', 'Temperature', 'Markdown_Total', 'Unemployment']:
    plot(df, i)

In [None]:
from scipy.stats import skew

# Assuming your data is in a DataFrame called 'df'
columns_to_check = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'Markdown_Total', 'CPI', 'Unemployment']

for column in columns_to_check:
    skewness_value = skew(df[column])
    print(f'Skewness for {column}: {skewness_value}')


In [None]:
# Log-transform the skewed columns, shifting every value by a small constant first
import numpy as np

small_const = 1
df1 = df.copy()

# Source column -> name of the log-scaled column appended to df1.
# Note the 'MarkDown_Total_log' spelling (capital D): later cells use that name.
log_columns = {
    'Weekly_Sales': 'Weekly_Sales_log',
    'Temperature': 'Temperature_log',
    'Markdown_Total': 'MarkDown_Total_log',
    'Unemployment': 'Unemployment_log',
}
for source_name, log_name in log_columns.items():
    # np.log gives NaN / -inf wherever value + small_const <= 0;
    # a later cell replaces those entries with 0.
    df1[log_name] = np.log(df1[source_name] + small_const)

# Display the transformed DataFrame
print(df1)


In [None]:
# After the log transformation the skewness of the data is reduced (see the histogram plots).

for i in ['Weekly_Sales_log','Temperature_log','MarkDown_Total_log','Unemployment_log']:
     plot(df1, i)

In [None]:
from scipy.stats import skew

# Log-scaled columns produced by the transformation cell
transformed_variables = ['Weekly_Sales_log', 'Temperature_log', 'MarkDown_Total_log', 'Unemployment_log']

# Column name -> skewness of the cleaned column
skewness_results = {}
for variable in transformed_variables:
    # Turn +/-inf into NaN, then zero-fill every NaN; the cleaned column is
    # written back into df1, so later cells see the cleaned values too.
    without_inf = df1[variable].replace([np.inf, -np.inf], np.nan)
    df1[variable] = without_inf.fillna(0)

    skewness = skew(df1[variable])
    skewness_results[variable] = skewness
    print(f'Skewness for {variable}: {skewness}')

# View skewness results
print("\nSkewness Results:")
print(skewness_results)

In [None]:
# Outliers Handling - Interquartile Range (IQR) method
df2 = df1.copy()
df2

In [None]:
# Use the IQR rule with clip() to cap the outliers; the clipped values overwrite the given column of the DataFrame in place

def outlier(df, column, whisker=1.5):
    """Clip ``df[column]`` in place to its interquartile-range fences.

    Values below ``Q1 - whisker * IQR`` are raised to that lower fence and
    values above ``Q3 + whisker * IQR`` are lowered to that upper fence,
    where ``IQR = Q3 - Q1``. No rows are removed.

    Args:
        df: DataFrame to modify. The column is overwritten in place.
        column: Name of the numeric column to clip.
        whisker: Multiplier applied to the IQR to place the fences.
            Defaults to 1.5, the value this function always used.

    Returns:
        None. The result is stored back into ``df[column]``.
    """
    # One quantile call yields both quartiles (values come back in the order requested).
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_threshold = q1 - whisker * iqr
    upper_threshold = q3 + whisker * iqr
    df[column] = df[column].clip(lower_threshold, upper_threshold)

In [None]:
# (Ex: lower threshold = 5 and upper threshold = 20)
# above upper threshold values (>20) are converted to upper threshold value (20) in features
# below lower threshold values (<5)  are converted to lower threshold value (5)  in features

outlier(df2, 'Weekly_Sales_log')
outlier(df2, 'Temperature_log')
outlier(df2, 'Unemployment_log')
outlier(df2, 'MarkDown_Total_log')
df2

In [None]:
for i in ['Weekly_Sales_log','Temperature_log','MarkDown_Total_log','Unemployment_log']:
     plot(df2, i)

In [None]:
# Import the necessary library
from scipy.stats import skew

# List of columns to check for skewness
columns_to_check = ['Unemployment','Weekly_Sales_log', 'Temperature_log', 'MarkDown_Total_log', 'Unemployment_log']

# Print skewness for each column.
# scipy.stats.skew (imported above) is the estimator used by the earlier
# skewness cells, so the values stay directly comparable before and after
# clipping. Series.skew() applies a sample bias correction and would report
# slightly different numbers for the same data.
# Unlike Series.skew(), scipy's skew returns NaN if the column contains NaN.
for column in columns_to_check:
    print(f"Skewness for {column}: {skew(df2[column])}")


In [None]:
df3 = df2.drop(columns=['Weekly_Sales','Temperature','Markdown_Total','Unemployment'])
df3

In [None]:
df3.dtypes

In [None]:
# Add a new column 'Expected_Sales' to calculate the sum of 'MarkDown_Total_log' and 'Weekly_Sales_log'
# Work on a copy so df3 itself is left unchanged.
df4 = df3.copy()
# Both operands are log-transformed columns, so this sum is on a log scale
# (log a + log b = log(a * b)); it is not a sum of sales amounts.
# NOTE(review): the notebook header names Weekly_Sales as the target. If
# 'Weekly_Sales_log' is what the model predicts, this column embeds the target
# and would leak it when used as a feature - confirm the intended use.
df4['Expected_Sales'] = df4['MarkDown_Total_log'] + df4['Weekly_Sales_log']

# Display the DataFrame with the new column
print(df4)


In [None]:
# Display summary statistics of the remaining columns
summary_statistics = df4.describe()
print(summary_statistics)

In [None]:
# Save the DataFrame with selected columns to a CSV file
df4.to_csv('sales_prediction.csv', index=False)