In [0]:
!pip3 install seaborn

In [0]:
import xgboost as xgb
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, OneHotEncoder
from pandas.tseries.offsets import DateOffset
from itertools import product
from dateutil.relativedelta import relativedelta
from datetime import datetime
import plotly.graph_objects as go
#from flaml import AutoML
from sklearn.metrics import mean_squared_error, r2_score
import boto3
import os
import io
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Data via Unity Catalog: preview a few rows of the demand forecast table.

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("UnityCatalogAccess")
    .getOrCreate()
)

# Point the session at the catalog first, then at the schema (database) inside it.
for use_statement in ("USE CATALOG `edp-apac-uat`", "USE l1_asurion_apac"):
    spark.sql(use_statement)

# Pull a 10-row sample of the table.
df = spark.sql("SELECT * FROM demand_forecast limit 10")

# Show the sampled rows, followed by the column names and data types.
df.show()
df.printSchema()


In [0]:
# Data via CSV: read the raw shipment extract into a pandas DataFrame.
csv_path = "SG_Shipments_2019_2022.csv"
data = pd.read_csv(csv_path)

# Cell output: the frame rendered through the pandas Styler.
data.style

In [0]:
# Build a working copy ordered chronologically by YearMonth; sort_values
# returns a new frame, so the raw `data` frame is left untouched.
shipment_data = data.sort_values(by='YearMonth')

shipment_data.style

In [0]:
# Parse the YearMonth values (formatted as %Y%m) into timestamps, then replace
# the column with separate Year and Month features.
year_month = pd.to_datetime(shipment_data['YearMonth'].astype(str), format='%Y%m')

shipment_data = (
    shipment_data
    .assign(Year=year_month.dt.year, Month=year_month.dt.month)
    .drop(columns="YearMonth")
)

shipment_data.style


In [0]:
# Feature column lists by dtype; the target 'Shipped_Claim' is removed from the
# numeric list so it is never fed to the model as an input.
# NOTE(review): only int64/float64 columns are selected. On pandas >= 2.0 the
# `.dt.year` / `.dt.month` accessors are documented to return int32, which would
# leave Year and Month out of `numerical_cols` - confirm they reach the model.
numeric_frame = shipment_data.select_dtypes(include=["int64", "float64"])
numerical_cols = numeric_frame.columns.drop('Shipped_Claim')

object_frame = shipment_data.select_dtypes(include=["object"])
categorical_cols = object_frame.columns

numerical_cols, categorical_cols

In [0]:
# Exploratory plots: target distribution, numeric correlations, and value counts
# for the first three categorical columns.
#
# Everything is plotted from `shipment_data`, the frame that `numerical_cols` and
# `categorical_cols` were derived from. Indexing the raw `data` frame with those
# lists raises KeyError whenever the derived Year/Month columns are part of
# `numerical_cols`, because `data` only has the original YearMonth column.

# Distribution of the target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(shipment_data['Shipped_Claim'], kde=True, ax=ax)
ax.set_title('Distribution of Shipped Claims')
ax.set_xlabel('Shipped Claims')
ax.set_ylabel('Frequency')
plt.show()

# Pairwise correlations between the numeric features and the target.
fig, ax = plt.subplots(figsize=(14, 10))
corr_columns = numerical_cols.append(pd.Index(['Shipped_Claim']))
correlation_matrix = shipment_data[corr_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Value counts for the first three categorical columns. The loop variable is not
# called `col` so it cannot clobber the pyspark `col` function imported elsewhere
# in this notebook.
for cat_col in categorical_cols[:3]:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=shipment_data, x=cat_col, ax=ax)
    ax.set_title(f'Distribution of {cat_col}')
    ax.tick_params(axis='x', rotation=45)
    plt.show()

In [0]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Alternative categorical treatment (one-hot), currently disabled:
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# Categorical features: fill gaps with the literal 'missing', then map each
# category to an integer code; categories unseen at fit time are encoded as -1.
# The step is still registered under the name 'onehot' although it holds an
# OrdinalEncoder.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [0]:
# Route each column group through its own transformer:
# (name, transformer, columns it applies to).
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [0]:
# Gradient-boosted regressor behind the shared preprocessing step.
model_xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0)

pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model_xgb)])

# Order rows chronologically before splitting. A stable sort keeps the existing
# row order within each (Year, Month) group.
shipments_by_period = shipment_data.sort_values(['Year', 'Month'], kind='stable')

X = shipments_by_period.drop('Shipped_Claim', axis=1)
y = shipments_by_period['Shipped_Claim']

# Chronological hold-out: the last 20% of rows (the most recent periods) form
# the test set. The earlier shuffled split mixed years between train and test
# (e.g. 2022 rows in train and 2021 rows in test), which lets a forecasting
# model train on periods that come after the ones it is evaluated on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(X_train.head())
print(X_test.head())

# Model fitting and evaluation are currently disabled:

# pipeline_xgb.fit(X_train, y_train)

# predictions_xgb = pipeline_xgb.predict(X_test)
# rmse_xgb = np.sqrt(mean_squared_error(y_test, predictions_xgb))  # avoids the `squared=` argument, which newer scikit-learn removed
# r2_xgb = r2_score(y_test, predictions_xgb)

# print(f"Root Mean Squared Error: {rmse_xgb}")
# print(f"R2 Score: {r2_xgb}")

In [0]:
# Storing ML results into unity catalog

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Spark's `abs` is reached through `F.abs` rather than imported by name, so the
# Python builtin `abs` is not shadowed for the rest of the notebook.

# Initialize Spark Session
spark = SparkSession.builder.appName("ForecastAccuracy").getOrCreate()
# Set the current catalog and schema (database) if necessary
spark.sql("USE CATALOG `edp-apac-uat`") # Unity Catalog name
spark.sql("USE l1_asurion_apac") # Schema (database) name


# Sample rows: (Date, Model, Forecast_Value, Actual_Value).
# Kept under its own name so the pandas DataFrame `data` loaded from the CSV is
# not rebound to a list.
sample_results = [
    ("2024-01-01", "ModelA", 100, 105),
    ("2024-01-01", "ModelB", 200, 195),
    ("2024-02-01", "ModelA", 110, 108),
    ("2024-02-01", "ModelB", 210, 205),
    # Add more rows as needed
]

# Create DataFrame
schema = ["Date", "Model", "Forecast_Value", "Actual_Value"]
results_df = spark.createDataFrame(sample_results, schema=schema)

# Calculate accuracy metrics: absolute error and absolute percent error.
results_df = results_df.withColumn(
    "Absolute_Error",
    F.abs(col("Forecast_Value") - col("Actual_Value")),
)
# Percent error is undefined when the actual value is 0; leave it NULL in that
# case instead of dividing by zero.
results_df = results_df.withColumn(
    "Percent_Error",
    F.when(
        col("Actual_Value") != 0,
        (col("Absolute_Error") / col("Actual_Value")) * 100,
    ),
)

# Display the DataFrame
results_df.show()

#results_df = ...  # Your DataFrame containing ML results

# Write the DataFrame to a Unity Catalog table; mode("overwrite") replaces any
# existing contents of the table.
results_df.write.format("delta").mode("overwrite").saveAsTable("l1_asurion_apac.demand_forecasting_results")

In [0]:
%sql
-- Running SQL in the notebook: row count of the demand forecast dataset table.
SELECT count(*) FROM `edp-apac-uat`.`l1_asurion_apac`.`demand_forecast_dataset`;