In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# A later cell copies the Kaggle API credentials from a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install required packages
# NOTE(review): no versions are pinned, so every run installs whatever pip
# resolves at that moment — results may differ between runs.
!pip install wandb pandas numpy matplotlib seaborn scikit-learn mlflow statsmodels

# Install the Kaggle command-line client used below to download the competition data
!pip install kaggle

In [None]:
# Install the Kaggle API token: create ~/.kaggle if needed, copy kaggle.json
# from the mounted Google Drive folder into it, then restrict the file to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

In [None]:
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Read the four CSV files (relative paths, i.e. the current working directory)
train, test, stores, features = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'stores.csv', 'features.csv')
)

# Report (rows, columns) for every dataset
for label, frame in (("Train", train), ("Test", test),
                     ("Stores", stores), ("Features", features)):
    print(f"{label} shape:", frame.shape)

# Last expression of the cell: preview the first rows of the training data
train.head()

In [None]:
# Check data types and missing values
def explore_dataset(df, name):
    """Print a plain-text overview of a DataFrame.

    The report contains, in order: a header with ``name``, the frame's
    shape, the dtype of each column, the number of null values per column,
    and the number of distinct values per column.

    Args:
        df: DataFrame to describe.
        name: Label used in the report header.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\n=== {name} Dataset ===")
    print(f"Shape: {df.shape}")

    column_dtypes = df.dtypes
    print(f"\nData types:\n{column_dtypes}")

    missing_counts = df.isnull().sum()
    print(f"\nMissing values:\n{missing_counts}")

    print("\nUnique values per column:")
    for column_name in df.columns:
        distinct_values = df[column_name].nunique()
        print(f"{column_name}: {distinct_values}")

# Print the same overview for each of the four loaded frames
for dataset_label, dataset in (("Train", train), ("Test", test),
                               ("Stores", stores), ("Features", features)):
    explore_dataset(dataset, dataset_label)


In [None]:
# Convert Date column to datetime so dates can be compared and subtracted
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Time range analysis
for label, frame in (("Train", train), ("Test", test), ("Features", features)):
    print(f"{label} date range:", frame['Date'].min(), "to", frame['Date'].max())

# Check for date gaps: consecutive distinct training dates more than a week apart.
# The unique values are wrapped in pd.to_datetime so every element is a pandas
# Timestamp. Where Series.unique() hands back a plain NumPy datetime64 array,
# subtracting two elements gives a numpy.timedelta64, which has no `.days`.
MAX_GAP_DAYS = 7  # data is weekly, so anything longer than 7 days is a gap
train_dates = sorted(pd.to_datetime(train['Date'].dropna().unique()))
date_gaps = [
    (previous_date, current_date, (current_date - previous_date).days)
    for previous_date, current_date in zip(train_dates, train_dates[1:])
    if (current_date - previous_date).days > MAX_GAP_DAYS
]

print("Date gaps found:", len(date_gaps))

In [None]:
# Overall sales trends: four panels on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
trend_ax, dist_ax, store_ax, dept_ax = axes.ravel()

# Panel 1: total sales per date across all stores and departments
weekly_sales = train.groupby('Date')['Weekly_Sales'].sum().reset_index()
trend_ax.plot(weekly_sales['Date'], weekly_sales['Weekly_Sales'])
trend_ax.set_title('Overall Weekly Sales Trend')
plt.setp(trend_ax.get_xticklabels(), rotation=45)

# Panel 2: histogram of the individual Weekly_Sales records
dist_ax.hist(train['Weekly_Sales'], bins=50, alpha=0.7)
dist_ax.set_title('Weekly Sales Distribution')
dist_ax.set_xlabel('Weekly Sales')

# Panel 3: total sales per store, largest first (x axis is the rank, not the store id)
store_totals = train.groupby('Store')['Weekly_Sales'].sum()
store_sales = store_totals.sort_values(ascending=False)
store_ax.bar(range(len(store_sales)), store_sales.values)
store_ax.set_title('Total Sales by Store')
store_ax.set_xlabel('Store (ranked)')

# Panel 4: the 20 departments with the highest total sales
dept_totals = train.groupby('Dept')['Weekly_Sales'].sum()
dept_sales = dept_totals.sort_values(ascending=False).head(20)
dept_ax.bar(range(len(dept_sales)), dept_sales.values)
dept_ax.set_title('Top 20 Departments by Sales')
dept_ax.set_xlabel('Department (ranked)')

fig.tight_layout()
plt.show()

In [None]:
# Add calendar features derived from Date (Date must already be datetime)
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Week'] = train['Date'].dt.isocalendar().week
train['DayOfYear'] = train['Date'].dt.dayofyear

# Merge with features to get the external variables.
# merge() renames every non-key column that exists in both frames to
# <name>_x / <name>_y. If both files carry an IsHoliday flag, that would leave
# no plain 'IsHoliday' column and the holiday panel below would fail with a
# KeyError. Shared non-key columns are therefore taken from train only.
merge_keys = ['Store', 'Date']
shared_cols = [col for col in features.columns
               if col in train.columns and col not in merge_keys]
train_features = train.merge(features.drop(columns=shared_cols),
                             on=merge_keys, how='left')

# Seasonality / holiday impact analysis on a 2x3 grid
plt.figure(figsize=(15, 10))

# Monthly seasonality
plt.subplot(2, 3, 1)
monthly_sales = train.groupby('Month')['Weekly_Sales'].mean()
plt.plot(monthly_sales.index, monthly_sales.values, marker='o')
plt.title('Average Sales by Month')
plt.xlabel('Month')

# Holiday vs Non-holiday sales (if the flag is available)
if 'IsHoliday' in train_features.columns:
    plt.subplot(2, 3, 2)
    holiday_sales = train_features.groupby('IsHoliday')['Weekly_Sales'].mean()
    # Derive the labels from the groups actually present so each bar is
    # labelled correctly even when only one of the two groups exists.
    holiday_labels = ['Holiday' if is_holiday else 'Non-Holiday'
                      for is_holiday in holiday_sales.index]
    plt.bar(holiday_labels, holiday_sales.values)
    plt.title('Average Sales: Holiday vs Non-Holiday')

# Year-over-year comparison: one line per year, months on the x axis
plt.subplot(2, 3, 3)
yearly_sales = train.groupby(['Year', 'Month'])['Weekly_Sales'].sum().unstack(level=0)
for year in yearly_sales.columns:
    plt.plot(yearly_sales.index, yearly_sales[year], marker='o', label=f'Year {year}')
plt.title('Monthly Sales by Year')
plt.legend()

# Weekly patterns (ISO week number)
plt.subplot(2, 3, 4)
weekly_pattern = train.groupby('Week')['Weekly_Sales'].mean()
plt.plot(weekly_pattern.index, weekly_pattern.values)
plt.title('Average Sales by Week of Year')
plt.xlabel('Week')

# Temperature impact (if available): mean sales within 10 equal-width temperature bins
if 'Temperature' in train_features.columns:
    plt.subplot(2, 3, 5)
    temp_sales = train_features.groupby(pd.cut(train_features['Temperature'], bins=10))['Weekly_Sales'].mean()
    plt.plot(range(len(temp_sales)), temp_sales.values)
    plt.title('Sales vs Temperature')

plt.tight_layout()
plt.show()

In [None]:
# Attach store metadata (type, size) to every training record
train_stores = train.merge(stores, on='Store', how='left')

# Store analysis: three panels side by side
fig, (type_ax, size_ax, variability_ax) = plt.subplots(1, 3, figsize=(15, 6))

# Panel 1: mean weekly sales per store type
type_sales = train_stores.groupby('Type')['Weekly_Sales'].mean()
type_ax.bar(type_sales.index, type_sales.values)
type_ax.set_title('Average Sales by Store Type')

# Panel 2: mean weekly sales within 5 equal-width store-size bins
size_bins = pd.cut(train_stores['Size'], bins=5)
size_sales = train_stores.groupby(size_bins)['Weekly_Sales'].mean()
size_ax.plot(range(len(size_sales)), size_sales.values, marker='o')
size_ax.set_title('Sales vs Store Size')

# Panel 3: the 20 departments whose weekly sales have the largest standard deviation
dept_std = train.groupby('Dept')['Weekly_Sales'].std()
dept_variability = dept_std.sort_values(ascending=False).head(20)
variability_ax.bar(range(len(dept_variability)), dept_variability.values)
variability_ax.set_title('Top 20 Most Variable Departments')

fig.tight_layout()
plt.show()

In [None]:
# Negative sales analysis: how many records report sales below zero
negative_sales = train[train['Weekly_Sales'] < 0]
negative_count = len(negative_sales)
negative_share = negative_count/len(train)*100
print(f"Negative sales records: {negative_count} ({negative_share:.2f}%)")

if negative_count > 0:
    # Ten departments with the most negative records
    print("Negative sales by department:")
    negatives_per_dept = negative_sales.groupby('Dept')['Weekly_Sales'].count()
    print(negatives_per_dept.sort_values(ascending=False).head(10))

# Zero sales analysis: records reporting exactly zero
zero_sales = train[train['Weekly_Sales'] == 0]
zero_count = len(zero_sales)
zero_share = zero_count/len(train)*100
print(f"Zero sales records: {zero_count} ({zero_share:.2f}%)")

# Missing data patterns in features: null count for every MarkDown* column
if 'MarkDown1' in features.columns:
    markdown_cols = [name for name in features.columns if 'MarkDown' in name]
    markdown_missing = features[markdown_cols].isnull().sum()
    print("Missing markdown data:")
    print(markdown_missing)

In [None]:
# Correlation with external factors; runs only when the merged frame has
# more columns than train, i.e. the merge actually added something.
if train_features.shape[1] > train.shape[1]:
    numeric_cols = train_features.select_dtypes(include=[np.number]).columns
    correlation_matrix = train_features[numeric_cols].corr()

    # Annotated heatmap of all pairwise correlations between numeric columns
    fig, heatmap_ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Matrix')
    plt.show()

    # Focus on Weekly_Sales: its correlation with every numeric column, strongest positive first
    sales_corr = correlation_matrix['Weekly_Sales'].sort_values(ascending=False)
    print("Correlations with Weekly_Sales:")
    print(sales_corr)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Select a specific store-department combination for detailed analysis
sample_data = train[(train['Store'] == 1) & (train['Dept'] == 1)].copy()
sample_data = sample_data.set_index('Date').sort_index()

SEASONAL_PERIOD = 52  # weekly observations per yearly cycle
# seasonal_decompose needs at least two complete cycles; with fewer
# observations it raises ValueError, so guard on 2 * period rather than period.
MIN_OBSERVATIONS = 2 * SEASONAL_PERIOD

if len(sample_data) >= MIN_OBSERVATIONS:
    decomposition = seasonal_decompose(sample_data['Weekly_Sales'],
                                       model='additive', period=SEASONAL_PERIOD)

    # One panel per component, stacked vertically on a shared layout
    fig, axes = plt.subplots(4, 1, figsize=(15, 12))
    decomposition.observed.plot(ax=axes[0], title='Original')
    decomposition.trend.plot(ax=axes[1], title='Trend')
    decomposition.seasonal.plot(ax=axes[2], title='Seasonal')
    decomposition.resid.plot(ax=axes[3], title='Residual')
    plt.tight_layout()
    plt.show()
else:
    print(f"Skipping seasonal decomposition: need at least {MIN_OBSERVATIONS} "
          f"weekly observations, found {len(sample_data)}")

In [None]:
def generate_insights_summary(train, stores, features):
    """Collect headline statistics about the training data into a dict.

    Args:
        train: DataFrame with 'Store', 'Dept', 'Date' and 'Weekly_Sales' columns.
        stores: DataFrame with 'Type' and 'Size' columns; may be empty.
        features: Accepted for interface compatibility; not read by this function.

    Returns:
        dict with the keys 'total_stores', 'total_departments', 'date_range'
        (a (min, max) tuple), 'total_weeks', 'avg_weekly_sales', 'total_sales'
        and 'negative_sales_pct'. When ``stores`` is not empty the dict also
        holds 'store_types' (count per type) and 'avg_store_size'.
    """
    # Basic statistics
    insights = {
        'total_stores': train['Store'].nunique(),
        'total_departments': train['Dept'].nunique(),
        'date_range': (train['Date'].min(), train['Date'].max()),
        'total_weeks': train['Date'].nunique(),
    }

    # Sales statistics
    weekly_sales = train['Weekly_Sales']
    insights.update({
        'avg_weekly_sales': weekly_sales.mean(),
        'total_sales': weekly_sales.sum(),
        # mean of a boolean mask is the fraction of True values
        'negative_sales_pct': (weekly_sales < 0).mean() * 100,
    })

    # Store insights are only available when store metadata was supplied
    if stores.empty:
        return insights

    insights['store_types'] = stores['Type'].value_counts().to_dict()
    insights['avg_store_size'] = stores['Size'].mean()
    return insights

# Build the summary and print one "key: value" line per entry
insights = generate_insights_summary(train, stores, features)
for metric_name, metric_value in insights.items():
    print(f"{metric_name}: {metric_value}")