In [0]:
%pip install "flaml[automl]" openml

In [0]:
# Restart the Python process so the flaml library installed in the previous cell is picked up.
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
from IPython.display import display
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from pandas.tseries.offsets import DateOffset
from itertools import product
from dateutil.relativedelta import relativedelta
from datetime import datetime
import plotly.graph_objects as go
from flaml import AutoML
from sklearn.metrics import mean_squared_error, r2_score
import boto3
import os
import io
import seaborn as sns
import matplotlib.pyplot as plt


In [0]:
def read_data_from_unity_catalog(
    catalog="edp-apac-uat",
    schema="l1_asurion_apac",
    table="reduced_filtered_preprocessed_data_19feb2024",
):
    """Load a Unity Catalog table into a pandas DataFrame.

    Uses the notebook-global ``spark`` session. The catalog and schema are
    selected with ``USE`` statements, so they remain the session defaults after
    this function returns.

    Args:
        catalog: Unity Catalog name. It is back-quoted in the statement, so it
            may contain hyphens.
        schema: Schema (database) name inside the catalog.
        table: Name of the table to read in full.

    Returns:
        The result of ``toPandas()`` on the selected table.
    """
    # Select the catalog and schema the table name is resolved against.
    spark.sql(f"USE CATALOG `{catalog}`")
    spark.sql(f"USE {schema}")

    table_sdf = spark.sql(f"SELECT * FROM {table}")
    # Render the Spark frame in the notebook before converting it.
    table_sdf.display()
    return table_sdf.toPandas()

In [0]:
# Load the preprocessed table as a pandas DataFrame.
preprocessed_data = read_data_from_unity_catalog()

In [0]:
# Count missing values per column.
preprocessed_data.isna().sum()

In [0]:
# Inspect the dtype of every column.
preprocessed_data.dtypes

In [0]:
# List the distinct values of the Shipped_Claim column.
preprocessed_data['Shipped_Claim'].unique()

In [0]:
# Create lagged features for shipped claims.
# The lags are taken per Model/Country series in chronological order. A bare
# .shift() on the whole frame would use whatever row order the query returned
# and would carry the last value of one series into the first rows of the next.
# NOTE(review): assumes Model + Country identifies one series with one row per
# YearMonth (the same key used for Unique_ID) - confirm the grain of the table.
number_of_lags = 3
_claims_by_series = (
    preprocessed_data[['Model', 'Country', 'Shipped_Claim']]
    .assign(_period=pd.to_datetime(preprocessed_data['YearMonth']))
    .sort_values('_period', kind='mergesort')
    .groupby(['Model', 'Country'], sort=False, dropna=False)['Shipped_Claim']
)
for lag in range(1, number_of_lags + 1):
    # Assignment aligns on the index, so the row order of preprocessed_data is unchanged.
    preprocessed_data[f'Lag_Shipped_Claim_{lag}'] = _claims_by_series.shift(lag)

In [0]:
# Create lagged features for the closing base.
# The lags are taken per Model/Country series in chronological order. A bare
# .shift() on the whole frame would use whatever row order the query returned
# and would carry the last value of one series into the first rows of the next.
# NOTE(review): assumes Model + Country identifies one series with one row per
# YearMonth (the same key used for Unique_ID) - confirm the grain of the table.
number_of_lags = 3
_base_by_series = (
    preprocessed_data[['Model', 'Country', 'Closing_Base']]
    .assign(_period=pd.to_datetime(preprocessed_data['YearMonth']))
    .sort_values('_period', kind='mergesort')
    .groupby(['Model', 'Country'], sort=False, dropna=False)['Closing_Base']
)
for lag in range(1, number_of_lags + 1):
    # Assignment aligns on the index, so the row order of preprocessed_data is unchanged.
    preprocessed_data[f'Lag_Closing_Base_{lag}'] = _base_by_series.shift(lag)

In [0]:
# Add a "<Model>-<Country>" key as a regular column (the frame's index is not changed).
model_text = preprocessed_data['Model'].astype(str)
country_text = preprocessed_data['Country'].astype(str)
preprocessed_data['Unique_ID'] = model_text + '-' + country_text

In [0]:
# preprocessed_data is a pandas DataFrame here (it was converted with toPandas()
# when loaded). pandas frames have no .display() method, so render it with the
# display() function instead.
display(preprocessed_data)

In [0]:
# Replace missing (NaN / null) lag values with zero.
lag_cols = [
    'Lag_Closing_Base_1', 'Lag_Closing_Base_2', 'Lag_Closing_Base_3',
    'Lag_Shipped_Claim_1', 'Lag_Shipped_Claim_2', 'Lag_Shipped_Claim_3',
]
preprocessed_data[lag_cols] = preprocessed_data[lag_cols].fillna(0)


In [0]:
# Re-count missing values per column after zero-filling the lag features.
preprocessed_data.isna().sum()

In [0]:
# Column roles used to assemble the modelling frame.

# Period column.
datetime_cols = ['YearMonth']

# Categorical descriptors.
cat_cols = [
    'Unique_ID', 'Model', 'Country', 'Client', 'Product', 'Program', 'Make',
    'Model_Series', 'Model_Family', 'Predecessor', 'Model_Type', 'Successor',
    'Model_No_Color',
]

# Numeric features, including the three lags of each lagged column.
num_cols = [
    'Model_Age_Days', 'Closing_Base', 'Model_Capacity', 'Year', 'Month',
    'Product_Launch_Year', 'Product_Launch_Month',
    'Model_Series_Launch_Year', 'Model_Series_Launch_Month',
    *(f'Lag_Closing_Base_{step}' for step in range(1, 4)),
    *(f'Lag_Shipped_Claim_{step}' for step in range(1, 4)),
]

# Regression target.
target_col = ['Shipped_Claim']

In [0]:
# Preview the frame with the lag features and Unique_ID added.
preprocessed_data

In [0]:
# Keep only the modelling columns. The explicit copy makes final_df an independent
# frame, so the in-place edits applied to it later do not raise
# SettingWithCopyWarning against preprocessed_data.
final_df = preprocessed_data[datetime_cols + cat_cols + num_cols + target_col].copy()

In [0]:
# Preview the selected modelling columns.
final_df

In [0]:
# (rows, columns) of the modelling frame.
final_df.shape

In [0]:
# Convert to uppercase
def uppercase_dataframe(df):
    """Return a copy of ``df`` with every object-dtype column upper-cased.

    The input frame is left untouched, so the function is safe to call on a
    slice of another frame and can be re-run without side effects.

    Args:
        df: pandas DataFrame; only columns selected by
            ``select_dtypes(include='object')`` are transformed.

    Returns:
        A new DataFrame with the same columns and index. Non-string entries in
        an object column come back as NaN, as ``Series.str.upper`` produces.
    """
    result = df.copy()
    for col in result.select_dtypes(include='object').columns:
        result[col] = result[col].str.upper()
    return result
 
final_df = uppercase_dataframe(final_df)

# 'Churn' and 'Gross_Adds' are not among the columns selected into final_df, so a
# strict drop raises KeyError. errors='ignore' removes them only when present.
final_df = final_df.drop(columns=['Churn', 'Gross_Adds'], errors='ignore')
# NOTE(review): the original cell also called
# final_df.drop(columns=['Model', 'Unique_ID']) without keeping the result, so both
# columns were never removed. They are kept here as well because they are part
# of cat_cols, which is label-encoded below - confirm whether they should be
# excluded from the model features.

# Label Encoding for categorical variables (LabelEncoder is imported at the top
# of the notebook). The fitted encoders are kept per column so that the integer
# codes can be mapped back to the original labels later.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    final_df[col] = encoder.fit_transform(final_df[col])
    label_encoders[col] = encoder

# Remove exact duplicate rows, then order the rows by period.
final_df = final_df.drop_duplicates().sort_values(by='YearMonth')

In [0]:
# Preview the encoded, de-duplicated frame sorted by YearMonth.
final_df

In [0]:
#!pip install h2o

In [0]:
# # automl experimentation
# import pandas as pd
# import h2o
# from h2o.automl import H2OAutoML
# from h2o.frame import H2OFrame

# # Initialize H2O
# h2o.init()

# # Sort the DataFrame by the time column
# #final_df = final_df.sort_values('YearMonth')

# # Calculate the split point (80% training, 20% testing)
# split_point = int(len(final_df) * 0.875)

# # Split the data
# train_df = final_df.iloc[:split_point]
# test_df = final_df.iloc[split_point:]

# # Reset index in both training and test DataFrames
# train_df = train_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)

# # Convert pandas DataFrames to H2OFrames
# train = h2o.H2OFrame(train_df)
# test = h2o.H2OFrame(test_df)

# # Define the target and features
# target = 'Shipped_Claim'
# features = [col for col in train.columns if col != target]

# # Set up H2O AutoML
# automl = H2OAutoML(max_runtime_secs=1200, max_models=20, seed=1, stopping_metric='RMSE', sort_metric='RMSE')

# # Train models
# automl.train(x=features, y=target, training_frame=train)

# # View the AutoML Leaderboard
# lb = automl.leaderboard
# print(lb)

# # Get the best model
# best_model = automl.leader

# # Model performance on the training set
# train_performance = best_model.model_performance(train)
# print("\nTraining Performance")
# print("RMSE:", train_performance.rmse())
# print("MAE:", train_performance.mae())
# print("R2:", train_performance.r2())

# # Model performance on the test set
# test_performance = best_model.model_performance(test)
# print("\nTest Performance")
# print("RMSE:", test_performance.rmse())
# print("MAE:", test_performance.mae())
# print("R2:", test_performance.r2())

# # Example: Shut down H2O - Uncomment the below line when you are done with H2O
# # h2o.shutdown()

In [0]:
import pandas as pd 
from datetime import datetime 
def split_dataset(df, test_months):
    """Split a frame chronologically into train and test partitions.

    The test partition holds the rows whose ``YearMonth`` is strictly later
    than ``max(YearMonth) - test_months`` months; the train partition holds the
    rows on or before that date. Rows whose ``YearMonth`` parses to NaT satisfy
    neither comparison and are in neither partition.

    The caller's frame is not modified: the date column is parsed on a copy.

    Args:
        df: pandas DataFrame with a ``YearMonth`` column that
            ``pd.to_datetime`` can parse.
        test_months: Number of calendar months subtracted from the latest date
            to obtain the split date.

    Returns:
        ``(train_df, test_df)``, both sorted by date and both without the
        ``YearMonth`` column.
    """
    # Parse into a new Series so df['YearMonth'] keeps its original values.
    periods = pd.to_datetime(df['YearMonth'])
    ordered = df.assign(YearMonth=periods).sort_values(by='YearMonth')

    # Split date = latest period minus the requested number of months.
    split_date = periods.max() - pd.DateOffset(months=test_months)

    train_df = ordered[ordered['YearMonth'] <= split_date]
    test_df = ordered[ordered['YearMonth'] > split_date]

    # The date column is only needed for the split.
    train_df = train_df.drop(columns=['YearMonth'])
    test_df = test_df.drop(columns=['YearMonth'])

    return train_df, test_df


# Hold out the last `test_months` months of data for testing.
test_months = 6
train, test = split_dataset(final_df, test_months)

# FLAML settings: a one-hour search over XGBoost regressors, scored by R^2.
# NOTE(review): the rows are in chronological order at this point; check against
# the FLAML docs whether the validation split should respect that
# (e.g. a time-based split_type) - confirm.
settings = dict(
    time_budget=3600,
    metric="r2",
    estimator_list=['xgboost'],
    task='regression',
    seed=42,
    early_stop=True,
)

automl = AutoML()

# Fit on every training column except the target.
automl.fit(
    X_train=train.drop(columns='Shipped_Claim'),
    y_train=train['Shipped_Claim'],
    **settings,
)

# Forecast the held-out months.
preds = automl.predict(test.drop(columns='Shipped_Claim'))

# Summary of the search; with metric "r2" the reported score is 1 - best_loss.
print('Best hyperparameter config:', automl.best_config)
print('Best r2 on validation data: {:.4g}'.format(1 - automl.best_loss))
print('Training duration of best run: {:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [0]:
from sklearn import metrics


def _regression_scores(y_true, y_pred):
    """Return ``(rmse, r2, mae)`` for one set of predictions."""
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while this form works on every version.
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    return rmse, r2, mae


# Calculate RMSE, R2, and MAE for training set
train_true = train['Shipped_Claim']
train_preds = automl.predict(train.drop('Shipped_Claim', axis=1))
train_rmse, train_r2, train_mae = _regression_scores(train_true, train_preds)

# Calculate RMSE, R2, and MAE for test set
# ('preds' holds the test-set predictions produced when the model was fitted)
test_true = test['Shipped_Claim']
test_preds = preds
test_rmse, test_r2, test_mae = _regression_scores(test_true, test_preds)

# Print the metrics
print(f"Training RMSE: {train_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Training MAE: {train_mae}\n")
print(f"Test RMSE: {test_rmse}")
print(f"Test R^2: {test_r2}")
print(f"Test MAE: {test_mae}")

In [0]:
# Attach the test-set predictions to the test frame as a 'forecast' column.
test['forecast'] = preds

In [0]:
# Preview the test partition with the forecast column attached.
test

In [0]:
# Work on a copy so later edits to `predictions` leave `test` unchanged.
predictions = test.copy()

In [0]:
# Map the integer codes of every categorical column back to the original labels,
# using the encoder that was fitted for that column.
for column_name in cat_cols:
    fitted_encoder = label_encoders[column_name]
    predictions[column_name] = fitted_encoder.inverse_transform(test[column_name])

In [0]:
# Preview the predictions with the categorical labels restored.
predictions

In [0]:
# Round the model output to whole numbers and store it as int.
# NOTE(review): values are not clipped, so a negative model output stays
# negative - confirm whether forecasts should be floored at 0.
predictions['forecast'] = predictions['forecast'].round().astype(int)

def calculate_accuracy(row):
    """Return the forecast accuracy of one row as a percentage in [0, 100].

    Accuracy is ``min(forecast, actual) / max(forecast, actual) * 100``.

    Args:
        row: Mapping or pandas row with 'forecast' and 'Shipped_Claim' entries.

    Returns:
        0 when the larger of the two values is not positive (this includes the
        case where both are 0) or when either value is negative; otherwise the
        ratio above.
    """
    forecast = row['forecast']
    actual = row['Shipped_Claim']
    larger = max(forecast, actual)
    smaller = min(forecast, actual)
    # Guarding on the sign keeps the result inside 0..100: without it a negative
    # forecast gives a negative percentage, and two negative values give more
    # than 100. The guard also covers the division-by-zero case, so no
    # exception handler is needed.
    if larger <= 0 or smaller < 0:
        return 0
    return (smaller / larger) * 100
# Score every row.
predictions['Accuracy'] = predictions.apply(calculate_accuracy, axis=1)


def _zero_case_label(actual):
    """Label a row by whether its actual Shipped_Claim value is zero."""
    return '0_Case' if actual == 0 else 'Non_0'


# Add 'Zero_Case' column based on the 'Shipped_Claim' value
predictions['Zero_Case'] = predictions['Shipped_Claim'].apply(_zero_case_label)
# Calculate 'Unit_Variance' as Shipped_Claim - forecast
actual_units = predictions['Shipped_Claim']
forecast_units = predictions['forecast']
predictions['Unit_Variance'] = actual_units - forecast_units
# Function to categorize Accuracy into ranges
def categorize_accuracy(acc):
    """Return the label of the accuracy band that ``acc`` falls into.

    Bands are half-open: a value equal to a boundary belongs to the higher
    band. Anything that is not below 100 (exactly 100, larger values, and NaN,
    which compares false against every bound) is labelled '100%'.
    """
    # (exclusive upper bound, label), checked in ascending order.
    bands = (
        (25, '<25%'),
        (50, '25%-50%'),
        (75, '50%-75%'),
        (90, '75%-90%'),
        (100, '90%-100%'),
    )
    for upper_bound, label in bands:
        if acc < upper_bound:
            return label
    return '100%'
# Create the 'Accuracy_Range' column by labelling each Accuracy value with its band
predictions['Accuracy_Range'] = predictions['Accuracy'].map(categorize_accuracy)

In [0]:
# Keep only the rows whose actual Shipped_Claim is non-zero.
is_non_zero_case = predictions['Zero_Case'].eq('Non_0')
Predictions_NonZero_Case = predictions.loc[is_non_zero_case]

In [0]:
# Preview the non-zero-claim rows.
Predictions_NonZero_Case

In [0]:
# Count how many non-zero-claim rows fall into each accuracy band.
Accuracy_Categories = Predictions_NonZero_Case['Accuracy_Range'].value_counts()

In [0]:
# Show the row count per accuracy band.
Accuracy_Categories

In [0]:
# Convert the pandas predictions to a Spark DataFrame, reusing the pandas column names.
results_df = spark.createDataFrame(predictions, schema=list(predictions.columns))
# Write with mode "overwrite", replacing any previous contents of the table.
# NOTE(review): the table name carries a schema but no catalog, so presumably it
# resolves against the session's current catalog - confirm.
results_df.write.mode("overwrite").saveAsTable("l1_asurion_apac.demand_forecasting_results")

In [0]:
# (rows, columns) of the results that were saved.
predictions.shape

In [0]:
# Also export the results to a CSV file (relative path, index column omitted).
predictions.to_csv('demand_forecasting_19feb.csv', index=False)