In [1]:
!pip install plotly kaleido
!pip install kaggle
!mkdir -p ~/.kaggle



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
from google.colab import files
import os


# Data Exploration

In [6]:
# Download the competition archive with the Kaggle CLI into the current
# working directory (the saved output shows /content).
# NOTE(review): requires Kaggle API credentials to be configured beforehand.
# NOTE(review): the saved output below also shows an unzip listing that this
# command does not produce — it looks stale from an earlier version of the cell.
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 1.09GB/s]
Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [8]:
!mkdir data
!unzip -o walmart-recruiting-store-sales-forecasting.zip -d data/

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: data/features.csv.zip   
  inflating: data/sampleSubmission.csv.zip  
  inflating: data/stores.csv         
  inflating: data/test.csv.zip       
  inflating: data/train.csv.zip      


In [9]:
# List what was extracted; per the saved output, every table except
# stores.csv is still wrapped in its own .zip at this point.
!ls data

features.csv.zip	  stores.csv	train.csv.zip
sampleSubmission.csv.zip  test.csv.zip


In [None]:
# Unpack each nested per-table archive next to it in data/.
# -o overwrites existing files without prompting, so the cell can be re-run.
!unzip -o data/features.csv.zip -d data/
!unzip -o data/sampleSubmission.csv.zip -d data/
!unzip -o data/test.csv.zip -d data/
!unzip -o data/train.csv.zip -d data/

# Data Exploration

In [16]:
# Read the four raw competition tables with pandas' default parsing
# (no dtype or date handling is applied here).
train_df, test_df, stores_df, features_df = map(
    pd.read_csv,
    (
        'data/train.csv',
        'data/test.csv',
        'data/stores.csv',
        'data/features.csv',
    ),
)

In [17]:
# Row/column counts for each of the four raw tables.
print(f"Training data: {train_df.shape}")
print(f"Test data: {test_df.shape}")
print(f"Stores data: {stores_df.shape}")
print(f"Features data: {features_df.shape}")

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
train_df.info()

# Rich previews of the first rows of each table.
display(train_df.head())
display(stores_df.head())
display(features_df.head())

Training data: (421570, 5)
Test data: (115064, 4)
Stores data: (45, 3)
Features data: (8190, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB
None


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


### Quality Check

In [18]:
def assess_data_quality(df, name):
    """Print a plain-text data-quality report for a single DataFrame.

    The report lists the frame's shape, the null count and null percentage of
    every column that contains at least one null, the number of fully
    duplicated rows, and each column's dtype.

    Args:
        df: DataFrame to inspect.
        name: Label for the report banner; it is upper-cased before printing.

    Returns:
        None. All results are written to stdout.
    """
    banner = f"\n=== {name.upper()} DATA QUALITY ==="
    print(banner)
    print(f"Shape: {df.shape}")

    print("Missing values:")
    null_counts = df.isnull().sum()
    null_report = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': (null_counts / len(df)) * 100,
    })
    # Only columns with at least one null are shown; when there are none,
    # pandas prints its "Empty DataFrame" representation.
    columns_with_nulls = null_report['Missing Count'] > 0
    print(null_report.loc[columns_with_nulls])

    duplicate_total = df.duplicated().sum()
    print(f"\nDuplicate rows: {duplicate_total}")

    print("Data types:")
    print(df.dtypes)

In [19]:
# Run the quality report over every raw table, pairing each frame with the
# label shown in its banner.
for df, name in zip(
    (train_df, test_df, stores_df, features_df),
    ('train', 'test', 'stores', 'features'),
):
    assess_data_quality(df, name)


=== TRAIN DATA QUALITY ===
Shape: (421570, 5)
Missing values:
Empty DataFrame
Columns: [Missing Count, Missing %]
Index: []

Duplicate rows: 0
Data types:
Store             int64
Dept              int64
Date             object
Weekly_Sales    float64
IsHoliday          bool
dtype: object

=== TEST DATA QUALITY ===
Shape: (115064, 4)
Missing values:
Empty DataFrame
Columns: [Missing Count, Missing %]
Index: []

Duplicate rows: 0
Data types:
Store         int64
Dept          int64
Date         object
IsHoliday      bool
dtype: object

=== STORES DATA QUALITY ===
Shape: (45, 3)
Missing values:
Empty DataFrame
Columns: [Missing Count, Missing %]
Index: []

Duplicate rows: 0
Data types:
Store     int64
Type     object
Size      int64
dtype: object

=== FEATURES DATA QUALITY ===
Shape: (8190, 12)
Missing values:
              Missing Count  Missing %
MarkDown1              4158  50.769231
MarkDown2              5269  64.334554
MarkDown3              4577  55.885226
MarkDown4              47