In [0]:
# %pip install "flaml[automl]" openml

#dbutils.library.restartPython() #to reflect the flaml lib

In [0]:
import pandas as pd
import numpy as np
from IPython.display import display
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from pandas.tseries.offsets import DateOffset
from itertools import product
from dateutil.relativedelta import relativedelta
from datetime import datetime
import plotly.graph_objects as go
from flaml import AutoML
from sklearn.metrics import mean_squared_error, r2_score
import boto3
import os
import io

In [0]:
def read_data_from_unity_catalog():
    """Fetch the preprocessed table through Unity Catalog.

    Selects the `edp-apac-uat` catalog and the `l1_asurion_apac` schema on the
    ambient `spark` session, reads every row of `preprocessed_data_15feb2024`,
    renders the result with `.display()`, and returns it.

    Returns:
        The object produced by `spark.sql` for the SELECT statement.
    """
    # Point the session at the catalog first, then at the schema (database).
    for use_statement in ("USE CATALOG `edp-apac-uat`", "USE l1_asurion_apac"):
        spark.sql(use_statement)

    # Pull the full table and show it before handing it back to the caller.
    table_rows = spark.sql("SELECT * FROM preprocessed_data_15feb2024")
    table_rows.display()
    return table_rows

In [0]:
# `spark.sql` hands back a Spark DataFrame, but every later cell uses the pandas
# API (`isna().sum()`, `fillna`, `.dt`, `sort_values`), so convert it once here.
preprocessed_data = read_data_from_unity_catalog().toPandas()

In [0]:
#preprocessed_data.isnull().sum()
#print(preprocessed_data.isna())  
#type(preprocessed_data)

preprocessed_data.isna().sum()

In [0]:
preprocessed_data.isna().sum()

In [0]:
# Replace missing values in these three columns with the literal string 'None'.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form triggers a pandas warning and does
# not update the parent frame once copy-on-write is enabled.
none_filled_cols = ['Predecessor', 'Successor', 'Model_Series']
preprocessed_data[none_filled_cols] = preprocessed_data[none_filled_cols].fillna('None')

In [0]:
preprocessed_data.isnull().sum()

In [0]:
preprocessed_data.dtypes

In [0]:
preprocessed_data['Product_Launch_Date'].unique()

In [0]:
preprocessed_data['Model_Series_Launch_Date'].unique()

In [0]:
import pandas as pd 
def parse_mixed_dates(date_str):
    """Parse a date written either as YYYY-MM-DD or as DD-MM-YYYY.

    The two formats are tried in that order and the first successful
    conversion is returned.

    Args:
        date_str: The value to convert; it is passed straight to `pd.to_datetime`.

    Returns:
        The result of `pd.to_datetime` for the first format that does not raise
        `ValueError`, or `pd.NaT` when both formats fail.
    """
    for candidate_format in ('%Y-%m-%d', '%d-%m-%Y'):
        try:
            parsed = pd.to_datetime(date_str, format=candidate_format)
        except ValueError:
            # This layout does not fit; fall through to the next one.
            continue
        return parsed
    # Neither layout matched.
    return pd.NaT

preprocessed_data['Product_Launch_Date'] = preprocessed_data['Product_Launch_Date'].apply(parse_mixed_dates)
# Verify the result 
print(preprocessed_data['Product_Launch_Date'].head())

In [0]:
preprocessed_data['Model_Series_Launch_Date'] = preprocessed_data['Model_Series_Launch_Date'].apply(parse_mixed_dates)

In [0]:
print(preprocessed_data['Model_Series_Launch_Date'].head())

In [0]:
preprocessed_data['Product_Launch_Date'].unique()

In [0]:
preprocessed_data['Model_Series_Launch_Date'].unique()

In [0]:
preprocessed_data.isnull().sum()

In [0]:
# Split each launch date into separate year and month feature columns.
for date_col, year_col, month_col in (
    ('Product_Launch_Date', 'Product_Launch_Year', 'Product_Launch_Month'),
    ('Model_Series_Launch_Date', 'Model_Series_Launch_Year', 'Model_Series_Launch_Month'),
):
    launch_dates = preprocessed_data[date_col]
    preprocessed_data[year_col] = launch_dates.dt.year
    preprocessed_data[month_col] = launch_dates.dt.month

In [0]:
preprocessed_data.isnull().sum()

In [0]:
preprocessed_data

In [0]:
# Derive a Model_Type label from keywords found in Model_Family.
# 'PRO MAX' is listed before 'PRO' and 'MAX' so the longer keyword is tried first
# when several alternatives could match at the same position.
type_patterns = ['PLUS', 'ULTRA', 'FOLD', 'EDGE', 'DUO', 'FLIP', 'FE','STAR','LITE','PRO MAX', 'PRO', 'MINI', 'MAX']

# Capture the first keyword occurrence per row (NaN when no keyword is present).
model_type = preprocessed_data['Model_Family'].str.extract('(' + '|'.join(type_patterns) + ')', expand=False)

# Rows without any keyword are labelled 'BASIC'. The filled Series is assigned in
# one step rather than calling `fillna(..., inplace=True)` on a column selection,
# which warns in current pandas and has no effect under copy-on-write.
preprocessed_data['Model_Type'] = model_type.fillna('BASIC')


In [0]:
preprocessed_data

In [0]:
"""import pandas as pd
from statsmodels.tsa.stattools import pacf


def determine_optimal_lags(df, target_column, series_column, pacf_threshold=0.2, max_lags_limit=40):
    if df.empty:
        raise ValueError("The DataFrame is empty.")
    if target_column not in df.columns or series_column not in df.columns:
        raise ValueError("Target or series column not found in DataFrame.")

    optimal_lags = {}
    for series in df[series_column].unique():
        series_data = df[df[series_column] == series][target_column].dropna()
        if len(series_data) < 2:
            continue  # Skip series with too few data points

        # Adjusting max_lags to be within the allowed limit
        max_allowed_lags = len(series_data) // 2 - 1
        max_lags = min(max_lags_limit, max_allowed_lags)
        
        if max_lags < 1:
            continue

        pacf_vals = pacf(series_data, nlags=max_lags, method='ols')
        significant_lags = [i for i, val in enumerate(pacf_vals) if abs(val) > pacf_threshold]
        
        if significant_lags:
            optimal_lags[series] = max(significant_lags)
        else:
            optimal_lags[series] = 0  # No significant lags found

    return optimal_lags



def create_common_optimal_lagged_features(df, target_column, series_column, optimal_lags):
    if df.empty:
        raise ValueError("The DataFrame is empty.")
    if target_column not in df.columns or series_column not in df.columns:
        raise ValueError("Target or series column not found in DataFrame.")

    # Determine a common optimal number of lags (e.g., median or maximum)
    common_optimal_lag = np.median(list(optimal_lags.values()))

    lagged_df = pd.DataFrame()
    for series in df[series_column].unique():
        series_data = df[df[series_column] == series].copy()

        for lag in range(1, int(common_optimal_lag) + 1):
            series_data[f'lag_{lag}'] = series_data[target_column].shift(lag)
            
        # Handle missing values in newly created features
        #series_data.dropna(inplace=True)

        # First interpolate, then forward fill
        series_data.interpolate(method='linear', inplace=True)
        series_data.fillna(method='ffill', inplace=True)
        
        

        lagged_df = pd.concat([lagged_df, series_data], ignore_index=True)

    return lagged_df
    

series_column = 'Model'
target_column = 'Shipped_Claim'
optimal_lags = determine_optimal_lags(preprocessed_data, target_column, series_column)
lagged_df = create_common_optimal_lagged_features(preprocessed_data, target_column, series_column, optimal_lags)
"""

In [0]:
preprocessed_data.sort_values(by='YearMonth', inplace=True)

In [0]:
import pandas as pd 
import numpy as np 
from statsmodels.tsa.stattools import pacf 
from statsmodels.graphics.tsaplots import plot_pacf 
import matplotlib.pyplot as plt 

# Partial autocorrelation of the target, used to suggest how many lag features to build.
# NOTE(review): the PACF is computed over the whole `Shipped_Claim` column in its
# current row order — confirm that treating it as a single series is intended.
lags = 40  # Number of lags to evaluate
alpha = 0.05  # Significance level for the confidence intervals

pacf_values, confint = pacf(preprocessed_data['Shipped_Claim'], nlags=lags, alpha=alpha)

# Plot PACF with confidence intervals
plot_pacf(preprocessed_data['Shipped_Claim'], lags=lags, alpha=alpha)
plt.show()

# Pick the last lag of the initial run of significant lags.
# `pacf` returns intervals centred on each estimate, so an estimate can never lie
# outside its own interval; a lag is significant when its interval excludes zero.
optimal_lag = 0
for i in range(1, len(pacf_values)):
    lower, upper = confint[i]
    if lower > 0 or upper < 0:
        optimal_lag = i
    else:
        break  # Stop at the first non-significant lag
print(f"Optimal lag: {optimal_lag}")

In [0]:
"""# Create lag columns based on the optimal lag 
for lag in range(1, optimal_lag + 1): 
    preprocessed_data[f'Lag_{lag}'] = preprocessed_data['Shipped_Claim'].shift(lag) 
# Display the first few rows to verify the new columns 
print(df.head())
"""

In [0]:
preprocessed_data['Shipped_Claim'].unique()

In [0]:
# Add Lag_1 .. Lag_N columns holding the target shifted down by 1..N rows.
# NOTE(review): the shift runs over the whole frame in its current row order, not
# per series — confirm that neighbouring rows belong to the same series.
number_of_lags = 3
shipped_claims = preprocessed_data['Shipped_Claim']
for lag in range(1, number_of_lags + 1):
    preprocessed_data[f'Lag_{lag}'] = shipped_claims.shift(periods=lag)

In [0]:
preprocessed_data

In [0]:
preprocessed_data.isna().sum()

In [0]:
# Missing lag values (e.g. the leading rows produced by the shift) become 0.
lag_cols = ['Lag_1', 'Lag_2', 'Lag_3']
preprocessed_data[lag_cols] = preprocessed_data[lag_cols].fillna(0)


In [0]:
preprocessed_data.isna().sum()

In [0]:
preprocessed_data.columns

In [0]:
# Column groups; concatenated later to select the columns of final_df.

# Date column (final_df is sorted and split on it further down).
datetime_cols = [ 'YearMonth'] 

# Categorical columns; each one is label-encoded in the next step.
cat_cols = ['Country', 'Client', 'Product', 'Program', 'Make', 'Model_Series', 'Model_Family', 'Predecessor', 'Model_Type',
            'Successor','Model', 'Model_No_Color']

# Numeric feature columns, including the derived launch year/month and lag columns.
num_cols = [ 'Model_Age_Days', 'Closing_Base', 'Model_Capacity', 'Year', 'Month', 'Product_Launch_Year', 
           'Product_Launch_Month','Model_Series_Launch_Year', 'Model_Series_Launch_Month', 'Lag_1', 'Lag_2', 'Lag_3']

# Prediction target, kept as a one-element list so it can be concatenated with the lists above.
target_col = ['Shipped_Claim']

In [0]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column, keeping one fitted encoder per column
# so the codes can be mapped back to the original labels later.
label_encoders = {col: LabelEncoder() for col in cat_cols}
for col, encoder in label_encoders.items():
    preprocessed_data[col] = encoder.fit_transform(preprocessed_data[col])

In [0]:
preprocessed_data

In [0]:
preprocessed_data.head()

In [0]:
preprocessed_data.sort_values(by='YearMonth', inplace=True)

In [0]:
# Keep only the modelling columns. `.copy()` makes final_df an independent frame,
# so later modifications of final_df never act on a selection of
# preprocessed_data (which pandas flags with SettingWithCopyWarning).
final_df = preprocessed_data[datetime_cols + cat_cols + num_cols + target_col].copy()

In [0]:
final_df

In [0]:
# Remove fully duplicated rows, rebinding the name to the de-duplicated frame.
final_df = final_df.drop_duplicates()

In [0]:
final_df.shape

In [0]:
#!pip install h2o

In [0]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from h2o.frame import H2OFrame

# Initialize H2O
h2o.init()

# Positional hold-out split: the leading rows train the models, the trailing rows test them.
# final_df is not re-sorted here (the sort below is disabled); it keeps its current row order.
#final_df = final_df.sort_values('YearMonth')

# Calculate the split point (87.5% training, 12.5% testing)
split_point = int(len(final_df) * 0.875)

# Split the data by row position
train_df = final_df.iloc[:split_point]
test_df = final_df.iloc[split_point:]

# Reset index in both training and test DataFrames (the old index is discarded)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Convert pandas DataFrames to H2OFrames
train = h2o.H2OFrame(train_df)
test = h2o.H2OFrame(test_df)

# Define the target and features
target = 'Shipped_Claim'
features = [col for col in train.columns if col != target]

# Set up H2O AutoML
automl = H2OAutoML(max_runtime_secs=1200, max_models=20, seed=1, stopping_metric='RMSE', sort_metric='RMSE')

# Train models
automl.train(x=features, y=target, training_frame=train)

# View the AutoML Leaderboard
lb = automl.leaderboard
print(lb)

# Get the best model
best_model = automl.leader

# Model performance on the training set
train_performance = best_model.model_performance(train)
print("\nTraining Performance")
print("RMSE:", train_performance.rmse())
print("MAE:", train_performance.mae())
print("R2:", train_performance.r2())

# Model performance on the test set
test_performance = best_model.model_performance(test)
print("\nTest Performance")
print("RMSE:", test_performance.rmse())
print("MAE:", test_performance.mae())
print("R2:", test_performance.r2())

# Example: Shut down H2O - Uncomment the below line when you are done with H2O
# h2o.shutdown()

In [0]:
import pandas as pd 
from datetime import datetime 
def split_dataset(df, test_months):
    """Split a frame chronologically into train and test parts.

    The cut-off date is the latest `YearMonth` minus `test_months` months. Rows
    on or before the cut-off form the training set; rows after it form the test
    set. The caller's DataFrame is left unmodified.

    Args:
        df: DataFrame with a `YearMonth` column that `pd.to_datetime` can parse.
        test_months: Number of months subtracted from the latest date to obtain
            the cut-off.

    Returns:
        `(train_df, test_df)`, both sorted by date and without the `YearMonth`
        column.
    """
    # Work on a new frame: `assign` returns a copy with the converted column
    # instead of overwriting the caller's YearMonth column in place.
    dated = df.assign(YearMonth=pd.to_datetime(df['YearMonth']))
    dated = dated.sort_values(by='YearMonth')  # Ensure chronological order

    # Cut-off = latest date minus the requested number of months.
    split_date = dated['YearMonth'].max() - pd.DateOffset(months=test_months)

    # Explicit comparisons on both sides, so rows with a missing date land in neither part.
    train_df = dated[dated['YearMonth'] <= split_date]
    test_df = dated[dated['YearMonth'] > split_date]

    # The date column has served its purpose once the split is made.
    train_df = train_df.drop(columns=['YearMonth'])
    test_df = test_df.drop(columns=['YearMonth'])

    return train_df, test_df


# Hold out the most recent months as the test set.
test_months = 6
train, test = split_dataset(final_df, test_months)

# FLAML search configuration: r2 metric, XGBoost as the only estimator, fixed seed.
settings = dict(
    time_budget=1200,
    metric="r2",
    estimator_list=['xgboost'],
    task='regression',
    seed=42,
)

# Fit on the training part, then predict the held-out part.
automl = AutoML()
automl.fit(
    X_train=train.drop(columns='Shipped_Claim'),
    y_train=train['Shipped_Claim'],
    **settings,
)
preds = automl.predict(test.drop(columns='Shipped_Claim'))

# Summary of the best run found by the search.
print('Best hyperparameter config:', automl.best_config)
print('Best r2 on validation data: {:.4g}'.format(1 - automl.best_loss))
print('Training duration of best run: {:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [0]:
from sklearn import metrics 
# Score the fitted model on both splits.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# `mean_squared_error` is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while sqrt(MSE) works on every version.

# Training set
train_true = train['Shipped_Claim']
train_preds = automl.predict(train.drop('Shipped_Claim', axis=1))
train_rmse = np.sqrt(metrics.mean_squared_error(train_true, train_preds))
train_r2 = metrics.r2_score(train_true, train_preds)
train_mae = metrics.mean_absolute_error(train_true, train_preds)

# Test set; `preds` holds the test predictions made right after fitting.
test_true = test['Shipped_Claim']
test_preds = preds
test_rmse = np.sqrt(metrics.mean_squared_error(test_true, test_preds))
test_r2 = metrics.r2_score(test_true, test_preds)
test_mae = metrics.mean_absolute_error(test_true, test_preds)

# Print the metrics
print(f"Training RMSE: {train_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Training MAE: {train_mae}\n")
print(f"Test RMSE: {test_rmse}")
print(f"Test R^2: {test_r2}")
print(f"Test MAE: {test_mae}")

In [0]:
import pandas as pd 
from datetime import datetime 
def split_dataset(df, test_months):
    """Split a frame chronologically into train and test parts.

    The cut-off date is the latest `YearMonth` minus `test_months` months. Rows
    on or before the cut-off form the training set; rows after it form the test
    set. The caller's DataFrame is left unmodified.

    Args:
        df: DataFrame with a `YearMonth` column that `pd.to_datetime` can parse.
        test_months: Number of months subtracted from the latest date to obtain
            the cut-off.

    Returns:
        `(train_df, test_df)`, both sorted by date and without the `YearMonth`
        column.
    """
    # Work on a new frame: `assign` returns a copy with the converted column
    # instead of overwriting the caller's YearMonth column in place.
    dated = df.assign(YearMonth=pd.to_datetime(df['YearMonth']))
    dated = dated.sort_values(by='YearMonth')  # Ensure chronological order

    # Cut-off = latest date minus the requested number of months.
    split_date = dated['YearMonth'].max() - pd.DateOffset(months=test_months)

    # Explicit comparisons on both sides, so rows with a missing date land in neither part.
    train_df = dated[dated['YearMonth'] <= split_date]
    test_df = dated[dated['YearMonth'] > split_date]

    # The date column has served its purpose once the split is made.
    train_df = train_df.drop(columns=['YearMonth'])
    test_df = test_df.drop(columns=['YearMonth'])

    return train_df, test_df


# Hold out the most recent months as the test set.
test_months = 6
train, test = split_dataset(final_df, test_months)

# FLAML search configuration: r2 metric, three gradient-boosting estimators, fixed seed.
settings = dict(
    time_budget=1200,
    metric="r2",
    estimator_list=['xgboost', 'lgbm', 'xgb_limitdepth'],
    task='regression',
    seed=42,
)

# Fit on the training part, then predict the held-out part.
automl = AutoML()
automl.fit(
    X_train=train.drop(columns='Shipped_Claim'),
    y_train=train['Shipped_Claim'],
    **settings,
)
preds = automl.predict(test.drop(columns='Shipped_Claim'))

# Summary of the best run found by the search.
print('Best hyperparameter config:', automl.best_config)
print('Best r2 on validation data: {:.4g}'.format(1 - automl.best_loss))
print('Training duration of best run: {:.4g} s'.format(automl.best_config_train_time))
print(automl.model.estimator)

In [0]:
from sklearn import metrics 
# Score the fitted model on both splits.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# `mean_squared_error` is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones, while sqrt(MSE) works on every version.

# Training set
train_true = train['Shipped_Claim']
train_preds = automl.predict(train.drop('Shipped_Claim', axis=1))
train_rmse = np.sqrt(metrics.mean_squared_error(train_true, train_preds))
train_r2 = metrics.r2_score(train_true, train_preds)
train_mae = metrics.mean_absolute_error(train_true, train_preds)

# Test set; `preds` holds the test predictions made right after fitting.
test_true = test['Shipped_Claim']
test_preds = preds
test_rmse = np.sqrt(metrics.mean_squared_error(test_true, test_preds))
test_r2 = metrics.r2_score(test_true, test_preds)
test_mae = metrics.mean_absolute_error(test_true, test_preds)

# Print the metrics
print(f"Training RMSE: {train_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Training MAE: {train_mae}\n")
print(f"Test RMSE: {test_rmse}")
print(f"Test R^2: {test_r2}")
print(f"Test MAE: {test_mae}")

In [0]:
test['forecast'] = preds

In [0]:
test

In [0]:
predictions = test.copy()

In [0]:
# Map the integer codes of every categorical column back to the original labels,
# using the encoder that was fitted for that column.
for encoded_col in cat_cols:
    column_encoder = label_encoders[encoded_col]
    predictions[encoded_col] = column_encoder.inverse_transform(test[encoded_col])

In [0]:
predictions