In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from google.colab import files
import holidays

In [2]:
# Attach Google Drive to the Colab filesystem so files stored there can be read
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


In [3]:
# Load the sample sales data from the mounted Drive folder into a DataFrame
sample_csv_path = '/content/gdrive/MyDrive/Oxford - project/End to end/Project/rounded_modified_supply_chain_capstone.csv'
data = pd.read_csv(sample_csv_path)

In [4]:
# Parse the 'Date of Sale' column into pandas datetimes
# NOTE(review): no explicit format/dayfirst is passed, so parsing relies on pandas'
# inference — confirm this matches the date layout used in the CSV
parsed_sale_dates = pd.to_datetime(data['Date of Sale'])
data['Date of Sale'] = parsed_sale_dates

In [5]:
# Derive integer calendar features from the parsed sale date
sale_date_parts = data['Date of Sale'].dt
for feature_name, dt_attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
):
    data[feature_name] = getattr(sale_date_parts, dt_attribute).astype(int)
# The ISO week number is read from isocalendar() rather than a plain .dt attribute
data['WeekOfYear'] = sale_date_parts.isocalendar().week.astype(int)

In [6]:
# Map a month number to the name of its season
def get_season(month):
    """Return the season name for a calendar month number.

    December-February map to 'Winter', March-May to 'Spring' and
    June-August to 'Summer'. Every other value (including
    September-November) maps to 'Autumn'.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Returns:
        One of 'Winter', 'Spring', 'Summer' or 'Autumn'.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Anything not listed above falls through to 'Autumn'
    return season_by_month.get(month, 'Autumn')

# Label every row with the season of its sale month
data['Season'] = data['Month'].map(get_season)

In [7]:
# UK holiday calendar provided by the holidays package
uk_holidays = holidays.UnitedKingdom()

# Flag each sale date as 1 when it is found in the UK holiday calendar, else 0
data['is_holiday'] = data['Date of Sale'].map(lambda sale_date: int(sale_date in uk_holidays))

# Flag weekends: pandas' dayofweek counts from Monday = 0, so 5 and 6 are Saturday and Sunday
data['is_weekend'] = (data['DayOfWeek'] >= 5).astype(int)

In [8]:
# Capture the distinct labels of the categorical columns while the raw columns
# still exist; the one-hot encoding step replaces them with indicator columns
unique_countries, unique_product_categories = (
    data[label_column].unique() for label_column in ('Country', 'Product Category')
)

In [9]:
# Expand each categorical column into '<column>_<value>' indicator columns
categorical_columns = ['Country', 'Product Category', 'Season']
data = pd.get_dummies(data, columns=categorical_columns)

In [10]:
# The model predicts 'Units Sold'; every other column is treated as a feature
target = 'Units Sold'
features = [column for column in data.columns if column != target]

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
feature_frame = data[features]
target_series = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Min-max scale the numerical columns; the scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both the training and the test rows
numerical_features = ['Year', 'Month', 'Day', 'DayOfWeek', 'GDP Growth Rate', 'Inflation Rate', 'Price Sold']
scaler = MinMaxScaler()
scaler.fit(X_train[numerical_features])
X_train_scaled = scaler.transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

In [13]:
# Replace the raw numerical columns with their scaled values.
# X_train / X_test come out of train_test_split as selections of `data`; take explicit
# copies first so the column assignment below does not raise pandas'
# SettingWithCopyWarning and is unambiguously applied to these frames only.
X_train = X_train.copy()
X_test = X_test.copy()

# Column i of each scaled array corresponds to numerical_features[i], so all
# columns can be written back in a single assignment.
X_train[numerical_features] = X_train_scaled
X_test[numerical_features] = X_test_scaled

In [14]:
# Fit a 100-tree random forest on the training rows.
# 'Date of Sale' is dropped from the model inputs before fitting.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
train_model_inputs = X_train.drop(columns=['Date of Sale'])
rf.fit(train_model_inputs, y_train)

In [15]:
# Generate synthetic data
# Seed NumPy's global generator so the randomly drawn synthetic samples are the same
# on every fresh run (42 matches the random_state used for the split and the forest).
np.random.seed(42)
num_samples = 10000  # Number of synthetic samples to generate
synthetic_data = []  # Filled with one dict per synthetic sample

In [16]:
# Draw `num_samples` random feature rows and label them with the trained model.
# NOTE: rows are appended to the existing `synthetic_data` list, so running this cell
# again without re-running the cell that resets the list accumulates more rows.

# Loop-invariant lookups, computed once instead of once per sample
date_pool = data['Date of Sale'].to_numpy()
gdp_low, gdp_high = data['GDP Growth Rate'].min(), data['GDP Growth Rate'].max()
inflation_low, inflation_high = data['Inflation Rate'].min(), data['Inflation Rate'].max()
price_low, price_high = data['Price Sold'].min(), data['Price Sold'].max()
store_ids = data['Store ID'].unique()
product_ids = data['Product ID'].unique()

new_samples = []
for _ in range(num_samples):
    # Pick a random date among the dates observed in the data
    random_date = pd.to_datetime(np.random.choice(date_pool))

    # Create a synthetic sample with proper date features; the continuous values are
    # drawn uniformly between the observed minimum and maximum and rounded to 2 decimals
    synthetic_sample = {
        'Date of Sale': random_date,
        'Year': random_date.year,
        'Month': random_date.month,
        'Day': random_date.day,
        'DayOfWeek': random_date.dayofweek,
        'WeekOfYear': random_date.isocalendar().week,
        'is_holiday': 1 if random_date in uk_holidays else 0,
        'is_weekend': 1 if random_date.dayofweek >= 5 else 0,
        'GDP Growth Rate': round(np.random.uniform(gdp_low, gdp_high), 2),
        'Inflation Rate': round(np.random.uniform(inflation_low, inflation_high), 2),
        'Price Sold': round(np.random.uniform(price_low, price_high), 2),
        'Store ID': np.random.choice(store_ids),  # Ensure Store ID is included
        'Product ID': np.random.choice(product_ids)  # Ensure Product ID is included
    }

    # One-hot encode a randomly chosen country: all indicators 0, then the chosen one 1
    for country in unique_countries:
        synthetic_sample[f'Country_{country}'] = 0
    chosen_country = np.random.choice(unique_countries)
    synthetic_sample[f'Country_{chosen_country}'] = 1

    # Same for a randomly chosen product category
    for product_category in unique_product_categories:
        synthetic_sample[f'Product Category_{product_category}'] = 0
    chosen_product_category = np.random.choice(unique_product_categories)
    synthetic_sample[f'Product Category_{chosen_product_category}'] = 1

    # The season indicator is derived from the sampled date, not drawn at random
    for season in ['Spring', 'Summer', 'Autumn', 'Winter']:
        synthetic_sample[f'Season_{season}'] = 1 if get_season(random_date.month) == season else 0

    new_samples.append(synthetic_sample)

# Scale and predict all samples in one batch rather than one model call per row
if new_samples:
    synthetic_batch = pd.DataFrame(new_samples)

    # Scale a copy of the numerical features; the dicts keep the unscaled values
    synthetic_batch[numerical_features] = scaler.transform(synthetic_batch[numerical_features])

    # Select the training columns in training order and drop the date, as done for fitting
    predicted_units = rf.predict(synthetic_batch[features].drop(columns=['Date of Sale']))

    # Attach each prediction to its sample as the 'Units Sold' value
    for synthetic_sample, units_sold in zip(new_samples, predicted_units):
        synthetic_sample['Units Sold'] = units_sold

    synthetic_data.extend(new_samples)


In [17]:
# Assemble the collected sample dicts into a single DataFrame (one row per sample)
synthetic_df = pd.DataFrame(data=synthetic_data)

In [18]:
# Convert one-hot encoded fields back to categorical values
def decode_one_hot(df, prefix):
    """Collapse the one-hot columns named ``<prefix>_<value>`` into one ``<prefix>`` column.

    For every row, the indicator column holding the largest value is selected and
    the ``<value>`` part of its name becomes the entry in the new ``prefix``
    column. The indicator columns are then removed. ``df`` is modified in place.

    Only columns that start with ``prefix + '_'`` are treated as indicators, so an
    unrelated column that merely shares the prefix text (for example
    ``Seasonality`` when decoding ``Season``) is left untouched.

    Args:
        df: DataFrame containing the indicator columns.
        prefix: Name of the categorical column to rebuild, without the trailing '_'.

    Returns:
        None.

    Raises:
        ValueError: If ``df`` has no column starting with ``prefix + '_'``.
    """
    marker = prefix + '_'
    one_hot_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith(marker)
    ]
    if not one_hot_columns:
        raise ValueError(f"No one-hot columns found for prefix '{prefix}'")

    # Strip only the leading marker so values that themselves contain it stay intact
    df[prefix] = df[one_hot_columns].idxmax(axis=1).map(lambda name: name[len(marker):])
    df.drop(columns=one_hot_columns, inplace=True)

In [19]:
# Rebuild each categorical column of the synthetic data from its indicator columns
for encoded_prefix in ('Country', 'Product Category', 'Season'):
    decode_one_hot(synthetic_df, encoded_prefix)

In [20]:
# Select the output columns and put them in their final order:
# identifiers and categories, the target, date parts, economic indicators, flags, season
columns = [
    'Store ID', 'Product ID', 'Country', 'Product Category', 'Units Sold',
    'Date of Sale', 'Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear',
    'GDP Growth Rate', 'Inflation Rate', 'Price Sold',
    'is_holiday', 'is_weekend', 'Season',
]
synthetic_df = synthetic_df.loc[:, columns]

In [21]:
# Write the synthetic dataset to a CSV file in the working directory, without the index column
synthetic_csv_path = 'synthetic_supply_chain_data.csv'
synthetic_df.to_csv(synthetic_csv_path, index=False)

In [22]:
# Ask Colab to send the generated CSV to the browser as a download
download_name = 'synthetic_supply_chain_data.csv'
files.download(download_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>