# Import and drop redundant columns

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import create_engine
from scipy.stats import zscore
import matplotlib.pyplot as plt
import joblib
import logging
logging.basicConfig(level=logging.INFO)


In [3]:
class SalesPerformancePrediction:
    def __init__(self):
        self.label_encoders = {}
        self.models = {}
        self.scaler = StandardScaler()

    def fetch_data_from_sql(self, query, server, database):
        """
        Fetch data from SQL Server using SQLAlchemy
        """
        logging.info("Fetching data from SQL Server...")
        engine = create_engine(f"mssql+pyodbc://{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server")
        data = pd.read_sql_query(query, engine)
        return data

    def prepare_data(self, fact_sales, dim_product, dim_date, dim_territory):
        """
        Merge dimension tables with fact_sales and calculate metrics
        """
        logging.info("Merging dimension tables with fact_sales...")
        sales_data = fact_sales.merge(dim_product, on='ProductKey')\
                            .merge(dim_date, on='DateKey')\
                            .merge(dim_territory, on='TerritoryKey')


        # Handle missing and invalid values
        logging.info("Handling missing and invalid values...")
        sales_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        sales_data.fillna(0, inplace=True)


        # Remove outliers
        logging.info("Removing outliers...")
        sales_data['Revenue'] = sales_data['OrderQty'] * sales_data['UnitPrice']
        sales_data = sales_data[(np.abs(zscore(sales_data['Revenue'])) < 3)]
        sales_data = sales_data.drop(['Revenue'], axis=1)

        return sales_data

In [4]:
# Database Connection Details
server = "DESKTOP-TGOL65T"
database = "CompanyXdwh_real"

# Queries
fact_sales_query = "SELECT * FROM fact_sales"
dim_product_query = "SELECT * FROM dim_product"
dim_date_query = "SELECT * FROM dim_date"
dim_territory_query = "SELECT * FROM dim_territory"

# Initialize Class
prediction = SalesPerformancePrediction()

# Fetch Data
fact_sales = prediction.fetch_data_from_sql(fact_sales_query, server, database)
dim_product = prediction.fetch_data_from_sql(dim_product_query, server, database)
dim_date = prediction.fetch_data_from_sql(dim_date_query, server, database)
dim_territory = prediction.fetch_data_from_sql(dim_territory_query, server, database)

# Prepare Data
sales_data = prediction.prepare_data(fact_sales, dim_product, dim_date, dim_territory)


INFO:root:Fetching data from SQL Server...
INFO:root:Fetching data from SQL Server...
INFO:root:Fetching data from SQL Server...
INFO:root:Fetching data from SQL Server...
INFO:root:Merging dimension tables with fact_sales...
INFO:root:Handling missing and invalid values...
INFO:root:Removing outliers...


In [5]:
print(sales_data[sales_data.duplicated()])

Empty DataFrame
Columns: [SalesKey, SalesOrderID, ProductKey, DateKey, TerritoryKey, StoreKey, CustomerKey, OrderQty, UnitPrice, LineTotal, TotalDue, ModifiedDate_x, ProductID, ProductName, Category, Subcategory, Model, ListPrice, StandardCost, ModifiedDate_y, Day, Week, Month, Quarter, Year, TerritoryID, Name, Country, Group, SalesYTD, SalesLastYear, ModifiedDate]
Index: []

[0 rows x 32 columns]


In [6]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118634 entries, 0 to 121241
Data columns (total 32 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   SalesKey        118634 non-null  int64         
 1   SalesOrderID    118634 non-null  object        
 2   ProductKey      118634 non-null  int64         
 3   DateKey         118634 non-null  int64         
 4   TerritoryKey    118634 non-null  int64         
 5   StoreKey        118634 non-null  float64       
 6   CustomerKey     118634 non-null  float64       
 7   OrderQty        118634 non-null  int64         
 8   UnitPrice       118634 non-null  float64       
 9   LineTotal       118634 non-null  float64       
 10  TotalDue        118634 non-null  float64       
 11  ModifiedDate_x  118634 non-null  datetime64[ns]
 12  ProductID       118634 non-null  int64         
 13  ProductName     118634 non-null  object        
 14  Category        118634 non-null  object  

In [21]:
sales_data = sales_data.drop(['SalesKey', 'ProductKey', 'DateKey', 'TerritoryKey',
                             'StoreKey', 'CustomerKey', 'ProductID',
                             'ModifiedDate_x', 'ModifiedDate_y', 'ModifiedDate'],axis=1)
print(sales_data[sales_data.duplicated()])

Empty DataFrame
Columns: [SalesOrderID, OrderQty, UnitPrice, LineTotal, TotalDue, ProductName, Category, Subcategory, Model, ListPrice, StandardCost, Day, Week, Month, Quarter, Year, TerritoryID, Name, Country, Group, SalesYTD, SalesLastYear]
Index: []

[0 rows x 22 columns]


In [22]:
sales_data.columns

Index(['SalesOrderID', 'OrderQty', 'UnitPrice', 'LineTotal', 'TotalDue',
       'ProductName', 'Category', 'Subcategory', 'Model', 'ListPrice',
       'StandardCost', 'Day', 'Week', 'Month', 'Quarter', 'Year',
       'TerritoryID', 'Name', 'Country', 'Group', 'SalesYTD', 'SalesLastYear'],
      dtype='object')

In [23]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118634 entries, 0 to 121241
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   SalesOrderID   118634 non-null  object 
 1   OrderQty       118634 non-null  int64  
 2   UnitPrice      118634 non-null  float64
 3   LineTotal      118634 non-null  float64
 4   TotalDue       118634 non-null  float64
 5   ProductName    118634 non-null  object 
 6   Category       118634 non-null  object 
 7   Subcategory    118634 non-null  object 
 8   Model          118634 non-null  object 
 9   ListPrice      118634 non-null  float64
 10  StandardCost   118634 non-null  float64
 11  Day            118634 non-null  int64  
 12  Week           118634 non-null  int64  
 13  Month          118634 non-null  int64  
 14  Quarter        118634 non-null  int64  
 15  Year           118634 non-null  int64  
 16  TerritoryID    118634 non-null  int64  
 17  Name           118634 non-null  ob

In [24]:
sales_data.head()

Unnamed: 0,SalesOrderID,OrderQty,UnitPrice,LineTotal,TotalDue,ProductName,Category,Subcategory,Model,ListPrice,...,Week,Month,Quarter,Year,TerritoryID,Name,Country,Group,SalesYTD,SalesLastYear
0,43697-353,1,3578.27,3578.27,3953.9884,"Road-150 Red, 62",Bikes,Road Bikes,Road-150,3578.27,...,23,5,2,2011,6,Canada,Canada,North America,6771829.0,5693989.0
1,43698-354,1,3399.99,3399.99,3756.989,"Mountain-100 Silver, 44",Bikes,Mountain Bikes,Mountain-100,3399.99,...,23,5,2,2011,7,France,France,Europe,4772398.0,2396540.0
2,43699-355,1,3399.99,3399.99,3756.989,"Mountain-100 Silver, 44",Bikes,Mountain Bikes,Mountain-100,3399.99,...,23,5,2,2011,1,Northwest,United States,North America,7887187.0,3298694.0
3,43700-356,1,699.0982,699.0982,772.5036,"Road-650 Black, 62",Bikes,Road Bikes,Road-650,782.99,...,23,5,2,2011,4,Southwest,United States,North America,10510850.0,5366576.0
4,43701-357,1,3399.99,3399.99,3756.989,"Mountain-100 Silver, 44",Bikes,Mountain Bikes,Mountain-100,3399.99,...,23,5,2,2011,9,Australia,Australia,Pacific,5977815.0,2278549.0


#  Feature Engineering

In [None]:
class FeatureEngineering:
    def __init__(self, df):
        self.df = df
        
        
    def add_date_features(self):
        """
        Add date-related features like Day of Week, Is Weekend, and Month Name.
        """
        self.df['Date'] = pd.to_datetime(self.df[['Year', 'Month', 'Day']])
        self.df['Day_of_Week'] = self.df['Date'].dt.day_name()
        self.df['Is_Weekend'] = self.df['Day_of_Week'].isin(['Saturday', 'Sunday']).astype(int)
        return self
    
    
    def add_cyclical_features(self):
        """
        Add cyclical features for month and day using sine and cosine transformations.
        """
        self.df['Month_Sin'] = np.sin(2 * np.pi * self.df['Month'] / 12)
        self.df['Month_Cos'] = np.cos(2 * np.pi * self.df['Month'] / 12)
        self.df['Day_Sin'] = np.sin(2 * np.pi * self.df['Day'] / 31)
        self.df['Day_Cos'] = np.cos(2 * np.pi * self.df['Day'] / 31)
        return self
    
    
    def get_data(self):
        """
        Return the engineered DataFrame.
        """
        return self.df
    
    


In [31]:
from datetime import datetime

def derive_week(day, month, year):
    """
    Derives the ISO week number from day, month, and year.

    Parameters:
        day (int): The day of the month (1-31).
        month (int): The month of the year (1-12).
        year (int): The year (e.g., 2023).

    Returns:
        int: The ISO week number (1-52 or 1-53 depending on the year).
    """
    try:
        date = datetime(year, month, day)
        week = date.isocalendar().week
        return week
    except ValueError as e:
        print(f"Invalid date: {e}")
        return None

In [32]:
# drop columns Week and Quarter because its can be deduced from day, month and year
sales_data_data = sales_data.drop(['Quarter', 'Week'],axis=1)
print(sales_data[sales_data.duplicated()])

Empty DataFrame
Columns: [SalesOrderID, OrderQty, UnitPrice, LineTotal, TotalDue, ProductName, Category, Subcategory, Model, ListPrice, StandardCost, Day, Week, Month, Quarter, Year, TerritoryID, Name, Country, Group, SalesYTD, SalesLastYear]
Index: []

[0 rows x 22 columns]
