<a href="https://colab.research.google.com/github/dots13/ForecastingStickerSalesKaggle/blob/main/LGBM_separate_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import requests

import matplotlib.pyplot as plt
import seaborn as sns
import holidays

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from scipy.fftpack import fft

import sklearn
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the train and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Parse the date strings so the .dt accessor can be used below.
train['date'] = pd.to_datetime(train['date'])

# Keep only rows whose calendar year is strictly greater than 2015.
is_after_2015 = train['date'].dt.year > 2015
train = train[is_after_2015]

In [None]:
def get_gdp_per_capita(alpha3, year, timeout=30):
    """
    Fetch GDP per capita for a specific country and year from the World Bank API.

    Queries the World Bank indicator ``NY.GDP.PCAP.CD`` and returns the
    ``value`` field of the first record in the response.

    Parameters:
    - alpha3: Country code inserted into the request URL (e.g. 'NOR').
    - year: Year inserted into the ``date`` query parameter.
    - timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
    - The reported value, or None when the response carries no records or the
      request/parsing fails (the error is printed, not raised).
    """
    url = f'https://api.worldbank.org/v2/country/{alpha3}/indicator/NY.GDP.PCAP.CD?date={year}&format=json'
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        # data[1] is the list of records; it is falsy when nothing was returned.
        return data[1][0]['value'] if data[1] else None
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not valid JSON; TypeError: payload is not shaped
        # like [metadata, records] (e.g. an error object).
        print(f"Error fetching data for {alpha3} in {year}: {e}")
        return None

In [None]:
def create_gdp_dataframe(alpha3s, years, country_names):
    """
    Build a long-format table of each country's share of the yearly GDP-per-capita total.

    Parameters:
    - alpha3s: Country codes passed to ``get_gdp_per_capita``, one per country.
    - years: Years to fetch for every country.
    - country_names: Row labels for the countries, matched to ``alpha3s`` by position.

    Returns:
    - DataFrame with columns ``country``, ``year`` and ``ratio``, where ``ratio``
      is the country's value divided by the sum over all countries for that year.
    """
    # One row of yearly values per country, fetched country by country.
    rows = []
    for code in alpha3s:
        rows.append([get_gdp_per_capita(code, yr) for yr in years])

    # Wide layout: countries down the index, years across the columns.
    wide = pd.DataFrame(rows, index=country_names, columns=years)

    # Turn absolute values into per-year shares.
    yearly_totals = wide.sum(axis=0)
    shares = wide / yearly_totals

    # Unpivot to one row per (country, year).
    long_df = (
        shares.reset_index()
        .rename(columns={'index': 'country'})
        .melt(id_vars=['country'], var_name='year', value_name='ratio')
    )
    return long_df

In [None]:
def adjust_ratios(gdp_df, adjustments):
    """
    Subtract a per-country offset from the GDP ratios, flooring the result at zero.

    Parameters:
    - gdp_df: DataFrame with at least the columns ``country`` and ``ratio``.
    - adjustments: Mapping of country name -> amount to subtract from that
      country's ratios. Countries absent from ``gdp_df`` are ignored.

    Returns:
    - A modified copy of ``gdp_df``; the input frame is left untouched.
    """
    result = gdp_df.copy()

    # The country column is never modified below, so compute the lookup once.
    known_countries = result['country'].unique()

    for name, offset in adjustments.items():
        if name not in known_countries:
            continue
        rows = result['country'] == name
        lowered = result.loc[rows, 'ratio'] - offset
        # Ratios must not go negative after the subtraction.
        result.loc[rows, 'ratio'] = lowered.clip(lower=0)

    return result

In [None]:
# Years for which GDP per capita is requested (2010 through 2019 inclusive).
years = range(2010, 2020)

# Country codes and display names are matched by position, so both sequences
# must list the countries in the same order.
alpha3s = ['CAN', 'FIN', 'ITA', 'KEN', 'NOR', 'SGP']
country_names = np.array(['Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore'])

# Offset subtracted from Kenya's ratio (floored at zero by adjust_ratios).
adjustments = {'Kenya': 0.0007}

gdp_ratios_df = create_gdp_dataframe(alpha3s, years, country_names)
gdp_per_capita_filtered_ratios_df = adjust_ratios(gdp_ratios_df, adjustments)
print(gdp_per_capita_filtered_ratios_df.head(6))

In [None]:
def _gdp_ratio(ratios_df, country, year):
    """Return the ``ratio`` value of ``country`` in ``year`` from the GDP ratio table.

    Raises IndexError when the table has no row for that (country, year) pair.
    """
    return ratios_df.loc[
        (ratios_df["year"] == year) & (ratios_df["country"] == country), "ratio"
    ].values[0]


def _impute_from_norway(sales_df, country, store, product, year, ratio, only_missing):
    """Fill ``num_sold`` of one series in place with Norway's matching series times ``ratio``.

    Parameters:
    - sales_df: Frame with columns country, store, product, year, date, num_sold;
      modified in place.
    - country, store, product, year: Identify the target series and year.
    - ratio: Multiplier applied to Norway's ``num_sold`` values.
    - only_missing: When True only the dates whose target ``num_sold`` is NaN are
      filled; when False every row of the target series for that year is overwritten.

    Values are copied positionally (``.values``), so the target and the Norway
    rows must appear in the same date order and have the same length.
    """
    same_series = (
        (sales_df["store"] == store)
        & (sales_df["product"] == product)
        & (sales_df["year"] == year)
    )
    target_rows = same_series & (sales_df["country"] == country)
    source_rows = same_series & (sales_df["country"] == "Norway")

    if only_missing:
        missing_dates = sales_df.loc[target_rows & sales_df["num_sold"].isna(), "date"]
        on_missing_date = sales_df["date"].isin(missing_dates)
        target_rows = target_rows & on_missing_date
        source_rows = source_rows & on_missing_date

    sales_df.loc[target_rows, "num_sold"] = (
        sales_df.loc[source_rows, "num_sold"] * ratio
    ).values


# (country, store, product, only_missing) for every series imputed from Norway.
# The two series flagged False are overwritten for every date of the year,
# presumably because they contain no usable observations - TODO confirm.
NORWAY_IMPUTATION_PLAN = [
    ("Canada", "Discount Stickers", "Holographic Goose", False),
    ("Canada", "Premium Sticker Mart", "Holographic Goose", True),
    ("Canada", "Stickers for Less", "Holographic Goose", True),
    ("Kenya", "Discount Stickers", "Holographic Goose", False),
    ("Kenya", "Premium Sticker Mart", "Holographic Goose", True),
    ("Kenya", "Stickers for Less", "Holographic Goose", True),
    ("Kenya", "Discount Stickers", "Kerneler", True),
]

train_df_imputed = train.copy()
print(f"Missing values remaining: {train_df_imputed['num_sold'].isna().sum()}")

# Extract the year from the date
train_df_imputed['date'] = pd.to_datetime(train_df_imputed['date'])
train_df_imputed["year"] = train_df_imputed["date"].dt.year

# Impute year by year: each target series is filled with Norway's series scaled
# by (target country's GDP ratio / Norway's GDP ratio) for that year.
for year in train_df_imputed["year"].unique():
    norway_ratio = _gdp_ratio(gdp_per_capita_filtered_ratios_df, "Norway", year)
    for country, store, product, only_missing in NORWAY_IMPUTATION_PLAN:
        scale = _gdp_ratio(gdp_per_capita_filtered_ratios_df, country, year) / norway_ratio
        _impute_from_norway(
            train_df_imputed, country, store, product, year, scale, only_missing
        )

# Check for remaining missing values
print(f"Missing values remaining after imputation: {train_df_imputed['num_sold'].isna().sum()}")

# Manual imputation for specific IDs (no effect for ids absent from the frame)
train_df_imputed.loc[train_df_imputed["id"] == 23719, "num_sold"] = 4
train_df_imputed.loc[train_df_imputed["id"] == 207003, "num_sold"] = 195

# Final check for missing values
print(f"Final missing values remaining: {train_df_imputed['num_sold'].isna().sum()}")

In [None]:
# Series label: country, store and product joined with underscores.
country_store = train['country'] + '_' + train['store']
train['cat'] = country_store + '_' + train['product']

In [None]:
# Start from the imputed frame and label every row with its series id.
train = train_df_imputed.copy()
train['cat'] = train['country'] + '_' + train['store'] + '_' + train['product']

# Calendar features derived from the date.
train["year"] = train["date"].dt.year
train["month"] = train["date"].dt.month
train["day_of_week"] = train["date"].dt.dayofweek
train["day_of_year"] = train["date"].dt.dayofyear
# isocalendar() returns the nullable UInt32 dtype; cast to a plain integer so
# this column and its sin/cos encodings are ordinary NumPy-backed columns.
train["week_of_year"] = train["date"].dt.isocalendar().week.astype("int64")
train["is_weekend"] = train["day_of_week"].isin([5, 6]).astype(int)
train["quarter"] = train["date"].dt.quarter

# Integer codes for stores and products (not part of the model feature list).
store_mapping = {k: v for v, k in enumerate(train["store"].unique())}
product_mapping = {k: v for v, k in enumerate(train["product"].unique())}

# Holiday feature: country name -> code understood by the holidays package.
country_holiday_map = {
    "Canada": "CA",
    "Finland": "FI",
    "Italy": "IT",
    "Kenya": "KE",
    "Norway": "NO",
    "Singapore": "SG"
}

# One holiday calendar per country, covering every year present in the data.
holidays_dict = {
    country: holidays.country_holidays(iso_code, years=range(train["year"].min(), train["year"].max() + 1))
    for country, iso_code in country_holiday_map.items()
}

# 1 when the row's date is a holiday in the row's country, else 0.
train["is_holiday"] = train.apply(lambda row: int(row["date"] in holidays_dict[row["country"]]), axis=1)

# Cyclical sin/cos encodings as (feature prefix, source column, period).
# NOTE(review): the year-based cycles depend only on the year, and for integer
# years the 2-year sine is numerically ~0.
cyclical_specs = [
    ("month", "month", 12.0),
    ("day_of_week", "day_of_week", 7.0),
    ("day_of_year", "day_of_year", 365.0),
    ("week_of_year", "week_of_year", 52.0),
    ("half_year", "month", 6),
    ("two_year", "year", 2),
    ("three_year", "year", 3),
    ("four_year", "year", 4),
]
for prefix, source_col, period in cyclical_specs:
    angle = 2 * np.pi * train[source_col] / period
    train[f"{prefix}_sin"] = np.sin(angle)
    train[f"{prefix}_cos"] = np.cos(angle)

## # Add country ratios

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# List the columns currently present on the training frame.
train.columns

In [None]:
# Plain calendar attributes taken directly from the date.
_calendar_features = [
    "year", "month", "day_of_week", "day_of_year",
    "week_of_year", "is_weekend", "quarter",
]

# Each cyclical encoding contributes a "<prefix>_sin" / "<prefix>_cos" pair.
_cyclical_prefixes = [
    "month", "day_of_week", "day_of_year", "week_of_year",
    "half_year", "two_year", "three_year", "four_year",
]

# Model input columns, in the order they are handed to the regressor.
features = (
    _calendar_features
    + ["is_holiday"]
    + [f"{prefix}_{wave}" for prefix in _cyclical_prefixes for wave in ("sin", "cos")]
)

In [None]:
# Display the adjusted GDP ratio table (columns: country, year, ratio).
gdp_per_capita_filtered_ratios_df

In [None]:
cat_unique = train['cat'].unique()
models = {}
df = train.copy()

# LightGBM settings shared by every per-series model (candidates for tuning).
lgb_params = dict(
    boosting_type="gbdt",
    objective="poisson",
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=0,
)

# Fit one independent regressor per 'cat' label (country_store_product).
for cu in cat_unique:
    print(f"Training model for {cu}...")

    # Rows belonging to this series only.
    series_rows = df.loc[df['cat'] == cu]
    X_train = series_rows[features]
    y_train = series_rows["num_sold"]

    lgb_model = LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train, y_train)

    # Keep the fitted model for the prediction step.
    models[cu] = lgb_model

    # In-sample fit quality: scored on the same rows the model was trained on.
    y_pred = lgb_model.predict(X_train)
    mape = mean_absolute_percentage_error(y_train, y_pred)
    print(f"Mean Absolute Percentage Error (MAPE) for {cu}: {mape}")

In [None]:
# Show the fitted models, keyed by the 'cat' series label.
models

In [None]:
# Parse dates and label every test row with its series id.
test['date'] = pd.to_datetime(test['date'])
test['cat'] = test['country'] + '_' + test['store'] + '_' + test['product']

# Calendar features derived from the date.
test["year"] = test["date"].dt.year
test["month"] = test["date"].dt.month
test["day_of_week"] = test["date"].dt.dayofweek
test["day_of_year"] = test["date"].dt.dayofyear
# isocalendar() returns the nullable UInt32 dtype; cast to a plain integer so
# this column and its sin/cos encodings are ordinary NumPy-backed columns.
test["week_of_year"] = test["date"].dt.isocalendar().week.astype("int64")
test["is_weekend"] = test["day_of_week"].isin([5, 6]).astype(int)
test["quarter"] = test["date"].dt.quarter

# Integer codes for stores and products (not part of the model feature list).
store_mapping = {k: v for v, k in enumerate(test["store"].unique())}
product_mapping = {k: v for v, k in enumerate(test["product"].unique())}

# Holiday feature: country name -> code understood by the holidays package.
country_holiday_map = {
    "Canada": "CA",
    "Finland": "FI",
    "Italy": "IT",
    "Kenya": "KE",
    "Norway": "NO",
    "Singapore": "SG"
}

# One holiday calendar per country, covering every year present in the test data.
holidays_dict = {
    country: holidays.country_holidays(iso_code, years=range(test["year"].min(), test["year"].max() + 1))
    for country, iso_code in country_holiday_map.items()
}

# 1 when the row's date is a holiday in the row's country, else 0.
test["is_holiday"] = test.apply(lambda row: int(row["date"] in holidays_dict[row["country"]]), axis=1)

# Cyclical sin/cos encodings as (feature prefix, source column, period).
# NOTE(review): the year-based cycles depend only on the year, and for integer
# years the 2-year sine is numerically ~0.
cyclical_specs = [
    ("month", "month", 12.0),
    ("day_of_week", "day_of_week", 7.0),
    ("day_of_year", "day_of_year", 365.0),
    ("week_of_year", "week_of_year", 52.0),
    ("half_year", "month", 6),
    ("two_year", "year", 2),
    ("three_year", "year", 3),
    ("four_year", "year", 4),
]
for prefix, source_col, period in cyclical_specs:
    angle = 2 * np.pi * test[source_col] / period
    test[f"{prefix}_sin"] = np.sin(angle)
    test[f"{prefix}_cos"] = np.cos(angle)

In [None]:
# List the columns currently present on the test frame.
test.columns

In [None]:
test_data = test.copy()
# Not used in this cell; retained in case other cells reference it.
predictions = {}

# Fail fast if the test set contains a series no model was trained for:
# the loop below would otherwise drop those rows from the output silently.
unmodelled = test_data.loc[~test_data['cat'].isin(list(models)), 'cat'].unique()
if len(unmodelled) > 0:
    raise ValueError(f"No trained model for test series: {list(unmodelled)}")

final_predictions = []
for cu in cat_unique:
    print(f"Making predictions for {cu}...")
    country_test_data = test_data[test_data['cat'] == cu]
    if country_test_data.empty:
        # Nothing to predict for a series that does not occur in the test set.
        continue
    X_test = country_test_data[features]
    model = models[cu]
    y_pred = model.predict(X_test)
    # Build a fresh frame rather than adding a column to the filtered slice,
    # which would trigger pandas' SettingWithCopyWarning.
    country_predictions = pd.DataFrame(
        {'id': country_test_data['id'].values, 'predicted_num_sold': y_pred}
    )
    final_predictions.append(country_predictions)

    print(f"Predictions for {cu} added to the final dataframe.")

# Stack the per-series predictions into one frame with a fresh 0..n-1 index.
all_predictions_df = pd.concat(final_predictions, ignore_index=True)
print(all_predictions_df.head())
print(all_predictions_df.head())

In [None]:
# Display the concatenated per-series predictions.
all_predictions_df

In [None]:
# Rename the prediction column to 'num_sold'; the 'id' column keeps its name.
all_predictions_df = all_predictions_df.rename(columns={'predicted_num_sold': 'num_sold'})

In [None]:
# Move 'id' into the index only if it is still a column, so re-running this
# cell does not fail with a KeyError once the index has already been set.
if "id" in all_predictions_df.columns:
    all_predictions_df = all_predictions_df.set_index("id")

# Round predictions to the nearest whole number and write the submission file.
all_predictions_df["num_sold"] = all_predictions_df["num_sold"].round()
all_predictions_df.to_csv("submission_13.csv")