1. Checking the SFT datasets

In [None]:
import pandas as pd

# Load the raw SFS metadata and take a first look at its structure.
df = pd.read_csv('SFS_metadata.csv')

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.columns)

2. Clean key columns (convert timestamps, drop/clean irrelevant columns, standardise tags, extract location)

In [None]:
import pandas as pd

def load_and_clean_data(file_path: str) -> pd.DataFrame:
    """Read the fashion metadata CSV and normalise its key columns.

    Rows lacking ``time`` or ``tags`` are discarded. The ``'Updated on '``
    prefix is removed from ``time`` before it is parsed to datetimes, and
    rows that fail to parse are discarded as well. ``tags`` and ``styles``
    are lower-cased with leading/trailing commas removed (missing ``styles``
    become empty strings), and ``location`` is whitespace-trimmed and
    title-cased.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame.
    """
    frame = pd.read_csv(file_path)
    frame = frame.dropna(subset=['time', 'tags'])

    # Unparseable timestamps are coerced to NaT and removed right after.
    bare_time = frame['time'].str.replace('Updated on ', '')
    frame['time'] = pd.to_datetime(bare_time, errors='coerce')
    frame = frame.dropna(subset=['time'])

    lowered_tags = frame['tags'].str.lower()
    frame['tags'] = lowered_tags.str.strip(',')

    lowered_styles = frame['styles'].fillna('').str.lower()
    frame['styles'] = lowered_styles.str.strip(',')

    trimmed_location = frame['location'].str.strip()
    frame['location'] = trimmed_location.str.title()

    return frame

def explode_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding one row per comma-separated tag.

    The original ``tags`` column is kept unchanged on every row; the
    individual tag, with surrounding whitespace removed, is placed in a new
    ``tag`` column. The input frame is not modified.
    """
    tag_lists = df['tags'].str.split(',')
    per_tag = df.assign(tag=tag_lists)
    per_tag = per_tag.explode('tag')
    per_tag['tag'] = per_tag['tag'].str.strip()
    return per_tag

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add a month-level ``month`` column for grouping.

    ``month`` holds the first day of the month of each row's ``time`` value.

    Args:
        df: Frame with a datetime ``time`` column.

    Returns:
        A new DataFrame with the extra ``month`` column. The input frame is
        left untouched so the caller's data is not modified as a side effect.
    """
    month_start = df['time'].dt.to_period('M').dt.to_timestamp()
    return df.assign(month=month_start)

if __name__ == "__main__":
    # Clean the raw metadata, expand it to one row per tag, add the month
    # column and persist the result for the later analysis steps.
    file_path = "SFS_metadata.csv"
    output_path = "SFT_clean_data.csv"
    df = load_and_clean_data(file_path)
    df = explode_tags(df)
    df = add_time_features(df)
    df.to_csv(output_path, index=False)
    # Report the file that was actually written.
    print(f"Data cleaned and saved to '{output_path}'")


3. Aggregate the trends based on monthly data so it will be easier to determine the popularity of items. Use up to 2 trends to train the ARIMA model (the same trends should be used on LSTM but that comes with additional data)

In [None]:
import pandas as pd

# Count, per calendar year, how many tags mention each trend phrase.
file_path = "/mnt/data/SFT_clean_data.csv"
df = pd.read_csv(file_path)

df['year'] = pd.to_datetime(df['time'], errors='coerce').dt.year

trends = ["zara dress", "boots", "oversized blazer", "denim jacket"]

df['tags'] = df['tags'].fillna("").str.lower()

# The cleaned file is written after explode_tags, i.e. one row per
# (post, tag) with the full `tags` string repeated on every row. Matching on
# `tags` would therefore count a post once for every tag it carries, so the
# per-row `tag` column is used whenever it is present.
match_col = 'tag' if 'tag' in df.columns else 'tags'
tag_text = df[match_col].fillna("").str.lower()

years = sorted(df['year'].dropna().unique())

# Evaluate each trend once over the whole column (literal match, text is
# already lower-cased) and let groupby do the per-year summation.
hits = pd.DataFrame(
    {trend: tag_text.str.contains(trend, regex=False) for trend in trends}
)
trend_df = hits.groupby(df['year']).sum().reindex(years, fill_value=0).astype(int)
trend_df.index.name = "Year"

trend_df.reset_index(inplace=True)
print(trend_df)

4. Check per month. The dataset has no missing values for time, so there is no need to impute any values. There are 1,636,496 rows of data. I chose to analyse 4 main trends I noticed when I briefly skimmed the dataset (e.g., denim jacket, boots, chanel bag, zara dress). 

In [None]:
import pandas as pd

# Count, per calendar month, how many tags mention each trend phrase.
file_path = "/mnt/data/SFT_clean_data.csv"
df = pd.read_csv(file_path)

# Rows with an unparseable timestamp are dropped before the month key is
# built: .astype(str) would otherwise turn NaT into the literal string "NaT",
# which dropna() cannot remove and which would appear as a bogus month.
df['parsed_time'] = pd.to_datetime(df['time'], errors='coerce')
df = df.dropna(subset=['parsed_time']).copy()
df['month'] = df['parsed_time'].dt.to_period('M').astype(str)

trends = ["zara dress", "boots", "chanel bag", "denim jacket"]

df['tags'] = df['tags'].fillna("").str.lower()

# The cleaned file is written after explode_tags, i.e. one row per
# (post, tag) with the full `tags` string repeated on every row. Matching on
# `tags` would therefore count a post once for every tag it carries, so the
# per-row `tag` column is used whenever it is present.
match_col = 'tag' if 'tag' in df.columns else 'tags'
tag_text = df[match_col].fillna("").str.lower()

months = sorted(df['month'].unique())

# Evaluate each trend once over the whole column (literal match, text is
# already lower-cased) and let groupby do the per-month summation.
hits = pd.DataFrame(
    {trend: tag_text.str.contains(trend, regex=False) for trend in trends}
)
trend_df = hits.groupby(df['month']).sum().reindex(months, fill_value=0).astype(int)
trend_df.index.name = "Month"

trend_df.reset_index(inplace=True)
print(trend_df)

5. After computing a new dataset, I made line plots for all four trends to compare them for the descriptive statistics part of the results section. Keep in mind I only need 2 trends to train both the ARIMA and the LSTM, so I am comparing the line curves of all four to check which ones resemble the product lifecycle curve most accurately.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Build the monthly trend-count table, save it, and plot all four trends.
file_path = "SFT_clean_data.csv"
df = pd.read_csv(file_path)

# Rows with an unparseable timestamp are dropped before the month key is
# built: .astype(str) would otherwise turn NaT into the literal string "NaT",
# which dropna() cannot remove and which would appear as a bogus month.
df['parsed_time'] = pd.to_datetime(df['time'], errors='coerce')
df = df.dropna(subset=['parsed_time']).copy()
df['month'] = df['parsed_time'].dt.to_period('M').astype(str)

trends = ["zara dress", "boots", "chanel bag", "denim jacket"]

df['tags'] = df['tags'].fillna("").str.lower()

# The cleaned file is written after explode_tags, i.e. one row per
# (post, tag) with the full `tags` string repeated on every row. Matching on
# `tags` would therefore count a post once for every tag it carries, so the
# per-row `tag` column is used whenever it is present.
match_col = 'tag' if 'tag' in df.columns else 'tags'
tag_text = df[match_col].fillna("").str.lower()

months = sorted(df['month'].unique())

# Evaluate each trend once over the whole column (literal match, text is
# already lower-cased) and let groupby do the per-month summation.
hits = pd.DataFrame(
    {trend: tag_text.str.contains(trend, regex=False) for trend in trends}
)
trend_df = hits.groupby(df['month']).sum().reindex(months, fill_value=0).astype(int)
trend_df.index.name = "Month"
trend_df.reset_index(inplace=True)
trend_df.to_csv("trend_counts_over_time.csv", index=False)

# One line per trend over the month labels.
plt.figure(figsize=(12, 6))
for trend in trends:
    plt.plot(trend_df["Month"], trend_df[trend], label=trend, linewidth=2)

plt.title("Fashion Trends Over Time")
plt.xlabel("Month")
plt.ylabel("Mentions in Tags")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.grid(True)
plt.savefig("fashion_trends_plot.png")
plt.close()

print("Trend counts saved to 'trend_counts_over_time.csv'")
print("Plot saved as 'fashion_trends_plot.png'")

6. descriptive statistics

In [None]:
import pandas as pd

# Summary statistics for the two trends used for modelling.
file_path = "trend_counts_over_time.csv"  # Update this path if needed
df = pd.read_csv(file_path)

selected_trends = ['zara dress', 'chanel bag']

# describe() is transposed so each trend is a row and each statistic a column.
desc_stats = df[selected_trends].describe().T

desc_stats['median'] = df[selected_trends].median()
# mode() can return several rows when there are ties; the first is kept.
desc_stats['mode'] = df[selected_trends].mode().iloc[0]

desc_stats = desc_stats[['count', 'mean', 'std', 'min', '25%', '50%', 'median', 'mode', '75%', 'max']]

desc_stats = desc_stats.round(2)

# The index carries the trend names; it must be written out, otherwise the
# rows of the saved table cannot be told apart.
desc_stats.to_csv("descriptive_stats.csv", index_label="trend")

print("Descriptive Statistics Table:")
print(desc_stats)


7. Run the ADF test to check stationarity and then difference the data (also fixed the format of month)

In [None]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Monthly trend counts, indexed by month.
df = pd.read_csv("trend_counts_over_time.csv")
df = df.assign(Month=pd.to_datetime(df['Month'])).set_index('Month')


def adf_test(series, name):
    """Run an Augmented Dickey-Fuller test on *series* and print the result.

    Missing values are dropped first and the lag length is chosen by AIC.
    The series is reported as stationary when the p-value is at most 0.05.

    Args:
        series: The time series to test.
        name: Label used in the printed report.
    """
    outcome = adfuller(series.dropna(), autolag='AIC')
    adf_stat, p_value, critical_values = outcome[0], outcome[1], outcome[4]
    verdict = 'Stationary' if p_value <= 0.05 else 'Non-stationary'

    print(f'ADF Test for "{name}"')
    print(f'  ADF Statistic: {adf_stat:.4f}')
    print(f'  p-value: {p_value:.4f}')
    for level, threshold in critical_values.items():
        print(f'     Critical Value ({level}): {threshold:.4f}')
    print('  =>', verdict)
    print('-' * 40)


# Test every trend column in turn.
for column_name, column_values in df.items():
    adf_test(column_values, column_name)

In [None]:
import pandas as pd

# Monthly trend counts, indexed by month.
df = pd.read_csv("trend_counts_over_time.csv")
df = df.assign(Month=pd.to_datetime(df['Month'])).set_index('Month')

# First-order differencing; the leading row has no predecessor and is dropped
# together with any other row containing NaN.
first_differences = df.diff()
df_diff = first_differences.dropna()

df_diff.to_csv("trend_counts_differenced.csv")

print("Differenced dataset saved to 'trend_counts_differenced.csv'")
print(df_diff.head())

8. Finding the outfits' most popular locations

In [None]:
import pandas as pd

# Rank the three most frequent photo locations for each trend.
df = pd.read_csv('SFS_metadata.csv')

df['tags'] = df['tags'].fillna('').str.lower()

# Missing locations must stay missing: casting them to str would turn them
# into the literal "nan", which value_counts() then ranks like a real place.
# Locations are trimmed and title-cased (the same normalisation
# load_and_clean_data applies) so spelling variants are counted together,
# and blank strings are treated as missing.
location = df['location'].str.strip().str.title()
df['location'] = location.where(location != '')

zara_rows = df[df['tags'].str.contains('zara dress', regex=False)]
chanel_rows = df[df['tags'].str.contains('chanel bag', regex=False)]

# value_counts() skips NaN by default, so only real locations are ranked.
top_locations_zara = zara_rows['location'].value_counts().head(3)
top_locations_chanel = chanel_rows['location'].value_counts().head(3)

print("Zara dress photo locations (Top 3):")
print(top_locations_zara)

print("Chanel bag photo locations (Top 3):")
print(top_locations_chanel)

9. Clean weather dataset for California and Aggregate the columns temperature and rainfall to get an average monthly number

In [None]:
import pandas as pd

# Aggregate the weather export to monthly means.
file_path = 'California_weather.csv'

df = pd.read_csv(file_path, skiprows=1, delimiter=';')

# Remove the trailing " UTC" marker so the timestamps can be parsed.
timestamps_without_suffix = df['dt_iso'].str.replace(' UTC', '', regex=False)
df['dt_iso'] = pd.to_datetime(timestamps_without_suffix)

# A missing rain reading is treated as no rain.
df['rain_1h'] = df['rain_1h'].fillna(0)

df['year_month'] = df['dt_iso'].dt.to_period('M')

# Mean temperature and mean rain reading per month, named in one step.
monthly_df = (
    df.groupby('year_month')
    .agg(avg_monthly_temp=('temp', 'mean'), avg_monthly_rain=('rain_1h', 'mean'))
    .reset_index()
)

monthly_df.to_csv('monthly_temp_rain.csv', index=False)

print(monthly_df.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot the monthly temperature and rain averages on one axis.
monthly_df = pd.read_csv('monthly_temp_rain.csv')

monthly_df['year_month'] = monthly_df['year_month'].astype(str)

plt.figure(figsize=(14, 6))

# (column, legend label, marker) for each line.
line_specs = [
    ('avg_monthly_temp', 'Avg Monthly Temp (°C)', 'o'),
    ('avg_monthly_rain', 'Avg Monthly Rain (mm)', 's'),
]
for column, legend_label, marker in line_specs:
    plt.plot(monthly_df['year_month'], monthly_df[column], label=legend_label, marker=marker)

plt.title('Average Monthly Temperature and Rainfall in California')
plt.xlabel('Month')
plt.ylabel('Value')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()

plt.show()

10. 4-month moving average to smooth out the noise in Google Trends data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Smooth both Google Trends series with a 4-row rolling mean and plot them.
df = pd.read_csv('google_trends.csv', skiprows=1, delimiter=';')
df['Month'] = pd.to_datetime(df['Month'])

# (raw column, smoothed column, raw label, smoothed label, colour)
series_specs = [
    ('Zara Dress', 'Zara_MA', 'Zara Dress (Original)', 'Zara Dress (4-Month MA)', 'blue'),
    ('Chanel Bag', 'Chanel_MA', 'Chanel Bag (Original)', 'Chanel Bag (4-Month MA)', 'green'),
]

for raw_col, smooth_col, _, _, _ in series_specs:
    df[smooth_col] = df[raw_col].rolling(window=4).mean()

plt.figure(figsize=(16, 8))

# Faint raw series with the bold smoothed series on top, per search term.
for raw_col, smooth_col, raw_label, smooth_label, colour in series_specs:
    plt.plot(df['Month'], df[raw_col], label=raw_label, alpha=0.3, color=colour)
    plt.plot(df['Month'], df[smooth_col], label=smooth_label, linewidth=2.5, color=colour)

plt.title('Google Trends: Zara Dress vs Chanel Bag (4-Month Moving Average)', fontsize=16, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Search Interest')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()

plt.show()

11. Merged all of the datasets

In [None]:
import pandas as pd

# Google Trends search interest, keyed by month.
gt_df = pd.read_csv('google_trends.csv', skiprows=1, delimiter=';')
gt_df['Month'] = pd.to_datetime(gt_df['Month'])
gt_df = gt_df.rename(columns={'Zara Dress': 'Zara Dress Search Interest', 'Chanel Bag': 'Chanel Bag Search Interest'})

# Monthly weather averages; the key column is renamed to match the others.
weather_df = pd.read_csv('monthly_temp_rain.csv')
weather_df['year_month'] = pd.to_datetime(weather_df['year_month'].astype(str))
weather_df = weather_df.rename(columns={'year_month': 'Month'})

# Monthly trend counts from the fashion dataset.
sft_df = pd.read_csv('trend_counts_over_time.csv')
sft_df['Month'] = pd.to_datetime(sft_df['Month'])

# Inner joins keep only the months present in all three sources.
merged_df = gt_df.merge(weather_df, on='Month', how='inner')
merged_df = merged_df.merge(sft_df, on='Month', how='inner')

merged_df.to_csv('merged_data.csv', index= False)

print(merged_df.head())
print("Saved as 'merged_data.csv'")

In [None]:
import pandas as pd

# Build the merged table WITHOUT weather: Google Trends + trend counts only.
gt_df = pd.read_csv('google_trends.csv', skiprows=1, delimiter=';')
sft_df = pd.read_csv('trend_counts_over_time.csv')

gt_df['Month'] = pd.to_datetime(gt_df['Month'])
sft_df['Month'] = pd.to_datetime(sft_df['Month'])

gt_df.rename(columns={'Zara Dress': 'Zara Dress Search Interest', 'Chanel Bag': 'Chanel Bag Search Interest'}, inplace=True)

# Merge the two freshly loaded frames. Re-using a `merged_df` left over from
# an earlier cell would keep the weather columns and duplicate the trend
# columns with _x/_y suffixes, or fail with a NameError in a fresh kernel.
merged_df = pd.merge(gt_df, sft_df, on='Month', how='inner')

merged_df.to_csv('merged_data_no_weather.csv', index= False)

print(merged_df.head())
print("Saved as 'merged_data_no_weather.csv'")

12. LSTM on all merged data (Zara Dress)

In [None]:

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the merged monthly table and index it by month.
df = pd.read_csv("merged_data.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# Predictor columns and the series to forecast.
features = ['Zara Dress Search Interest', 'Chanel Bag Search Interest', 'avg_monthly_temp', 'avg_monthly_rain']
target = 'zara dress'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2013-10-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (Merged Data)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on Merged Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on merged data: {rmse:.2f}")

13. LSTM on merged data with no weather (Zara dress)

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the merged table without weather columns and index it by month.
df = pd.read_csv("merged_data_no_weather.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# Predictor columns and the series to forecast.
features = ['Zara Dress Search Interest', 'Chanel Bag Search Interest']
target = 'zara dress'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2013-10-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (Merged Data)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on Merged Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on merged data: {rmse:.2f}")

14. LSTM on only SFS dataset (Zara dress)

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the monthly trend counts and index them by month.
df = pd.read_csv("trend_counts_over_time.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# The target column is also one of the predictors; the windowing step only
# feeds the model values from rows that precede the predicted row.
features = ['zara dress', 'chanel bag']
target = 'zara dress'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2013-10-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast. This cell models the
# trend counts alone (trend_counts_over_time.csv), so the figure is labelled
# as SFS data rather than as the merged dataset.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (SFS Dataset)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on SFS Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on data: {rmse:.2f}")

15. LSTM on all merged data (Chanel bag)

In [None]:

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the merged monthly table and index it by month.
df = pd.read_csv("merged_data.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# Predictor columns and the series to forecast.
features = ['Zara Dress Search Interest', 'Chanel Bag Search Interest', 'avg_monthly_temp', 'avg_monthly_rain']
target = 'chanel bag'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2014-05-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (Merged Data)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on Merged Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on merged data: {rmse:.2f}")

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the merged table without weather columns and index it by month.
df = pd.read_csv("merged_data_no_weather.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# Predictor columns and the series to forecast.
features = ['Zara Dress Search Interest', 'Chanel Bag Search Interest']
target = 'chanel bag'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2014-05-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (Merged Data)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on Merged Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on merged data: {rmse:.2f}")

16. LSTM on merged data with no weather (Chanel bag)

17. LSTM on only SFS dataset (Chanel bag)

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# Seed every random number generator used below so repeated runs start from
# the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Load the monthly trend counts and index them by month.
df = pd.read_csv("trend_counts_over_time.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Conform the index to month-start frequency.
# NOTE(review): a month missing from the file would become an all-NaN row
# here - confirm the series has no gaps.
df = df.asfreq('MS')

# The target column is also one of the predictors; the windowing step only
# feeds the model values from rows that precede the predicted row.
features = ['zara dress', 'chanel bag']
target = 'chanel bag'

# Chronological split: rows before the cutoff train, the rest test.
cutoff_date = pd.Timestamp('2014-05-01')
train_df = df[df.index < cutoff_date]
test_df = df[df.index >= cutoff_date]

# Scalers are fitted on the training rows only and then applied to the test
# rows, so the test period does not influence the scaling.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(train_df[features])
y_train_scaled = scaler_y.fit_transform(train_df[[target]])
X_test_scaled = scaler_X.transform(test_df[features])
y_test_scaled = scaler_y.transform(test_df[[target]])

def create_sequences(X, y, seq_length=6):
    """Cut aligned arrays into sliding input windows with next-step labels.

    Window ``k`` contains rows ``k`` to ``k + seq_length - 1`` of *X* and is
    paired with row ``k + seq_length`` of *y*.

    Returns:
        ``(windows, labels)`` as NumPy arrays, one entry for each of the
        ``len(X) - seq_length`` window start positions.
    """
    window_starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in window_starts]
    labels = [y[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

# Turn the scaled arrays into 6-step input windows with next-step targets.
seq_length = 6
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_length)
# Test windows are cut from the test rows alone, so the earliest predicted
# row is the one seq_length positions into test_df.
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, seq_length)

# One 32-unit LSTM layer feeding a single-output dense layer, fitted with
# Adam on mean squared error.
model = Sequential()
model.add(LSTM(32, activation='relu', input_shape=(seq_length, len(features))))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(X_train_seq, y_train_seq, epochs=100, batch_size=8, verbose=1)

# Predict the test windows, then undo the target scaling on both the
# predictions and the true values.
y_pred_scaled = model.predict(X_test_seq)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_seq)

# Index labels of the predicted rows (the first seq_length test rows only
# serve as input context).
forecast_dates = test_df.index[seq_length: seq_length + len(y_pred)]

# Plot the full observed series against the forecast. This cell models the
# trend counts alone (trend_counts_over_time.csv), so the figure is labelled
# as SFS data rather than as the merged dataset.
plt.figure(figsize=(12, 5))
plt.plot(df.index, df[target], label='Actual (SFS Dataset)', color='black')
plt.plot(forecast_dates, y_pred.flatten(), label='LSTM Forecast', linestyle='--', color='red')
plt.title(f"LSTM Forecast on SFS Dataset: {target.title()}")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Root mean squared error in the target's original units.
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)
print(f"LSTM RMSE on data: {rmse:.2f}")

18. ARIMA on Zara dress

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from io import StringIO

# Load the monthly trend counts on a month-start index.
df = pd.read_csv("trend_counts_over_time.csv")
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.asfreq('MS')

target = 'zara dress'

# Train on everything up to the month before the cutoff; test from the
# cutoff onward (label-based slices are inclusive at both ends).
cutoff_date = pd.Timestamp('2013-10-01')
train = df[target][:cutoff_date - pd.DateOffset(months=1)]
test = df[target][cutoff_date:]

model = ARIMA(train, order=(1, 1, 1))
fitted = model.fit()

# Forecast as many steps as there are test observations.
forecast = fitted.forecast(steps=len(test))

y_test_actual = test
y_pred = forecast

rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
mae = mean_absolute_error(y_test_actual, y_pred)

print(f"ARIMA RMSE: {rmse:.2f}")
print(f"ARIMA MAE:  {mae:.2f}")

# Export the coefficient table (second table of the summary) via its HTML
# rendering. The file is named after this cell's target so it is not
# overwritten by the Chanel-bag run, which writes arima_output_chanel.csv.
summary_df = fitted.summary().tables[1]
html_str = summary_df.as_html()
summary_df_as_df = pd.read_html(StringIO(html_str), header=0, index_col=0)[0]
summary_df_as_df.to_csv("arima_output_zara.csv")

print(fitted.summary())

# Plot the full observed series against the forecast.
plt.figure(figsize=(10, 5))
plt.plot(df[target], label='Actual (SFT Dataset)', color='black')
plt.plot(test.index, forecast, label='ARIMA Forecast', linestyle='--', color='red')
plt.title(f"Forecasting Saturation: {target.title()} (ARIMA)")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
SARIMAX Results                                
==============================================================================
Dep. Variable:             zara dress   No. Observations:                   67
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -344.639
Date:                Wed, 30 Apr 2025   AIC                            695.277
Time:                        12:57:51   BIC                            701.846
Sample:                    03-01-2008   HQIC                           697.873
                         - 09-01-2013                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0427      0.197      0.217      0.828      -0.343       0.429
ma.L1         -0.5734      0.155     -3.707      0.000      -0.877      -0.270
sigma2      1999.0356    388.290      5.148      0.000    1238.002    2760.069
===================================================================================
Ljung-Box (L1) (Q):                   0.08   Jarque-Bera (JB):                 1.13
Prob(Q):                              0.78   Prob(JB):                         0.57
Heteroskedasticity (H):               4.15   Skew:                            -0.30
Prob(H) (two-sided):                  0.00   Kurtosis:                         2.81
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).


In [None]:
19. ARIMA on Chanel bag

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from io import StringIO

# Monthly trend counts on a month-start index.
df = pd.read_csv("trend_counts_over_time.csv")
df = df.assign(Month=pd.to_datetime(df['Month'])).set_index('Month')
df = df.asfreq('MS')

target = 'chanel bag'
cutoff_date = pd.Timestamp('2014-05-01')

# Training data ends one month before the cutoff; testing starts at it.
last_train_month = cutoff_date - pd.DateOffset(months=1)
target_series = df[target]
train = target_series[:last_train_month]
test = target_series[cutoff_date:]

model = ARIMA(train, order=(1, 1, 1))
fitted = model.fit()

# One forecast step per test observation.
horizon = len(test)
forecast = fitted.forecast(steps=horizon)

y_test_actual, y_pred = test, forecast

mae = mean_absolute_error(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))

print(f"ARIMA RMSE: {rmse:.2f}")
print(f"ARIMA MAE:  {mae:.2f}")

# Save the coefficient table (second summary table) by round-tripping it
# through its HTML rendering.
summary_df = fitted.summary().tables[1]
html_str = summary_df.as_html()
parsed_tables = pd.read_html(StringIO(html_str), header=0, index_col=0)
summary_df_as_df = parsed_tables[0]
summary_df_as_df.to_csv("arima_output_chanel.csv")

print(fitted.summary())

# Observed series versus forecast.
plt.figure(figsize=(10, 5))
plt.plot(df[target], label='Actual (SFT Dataset)', color='black')
plt.plot(test.index, forecast, label='ARIMA Forecast', linestyle='--', color='red')
plt.title(f"Forecasting Saturation: {target.title()} (ARIMA)")
plt.xlabel("Date")
plt.ylabel("Trend Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
                               SARIMAX Results                                
==============================================================================
Dep. Variable:             chanel bag   No. Observations:                   74
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -378.091
Date:                Tue, 06 May 2025   AIC                            762.182
Time:                        12:51:06   BIC                            769.053
Sample:                    03-01-2008   HQIC                           764.920
                         - 04-01-2014                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.3972      0.195     -2.041      0.041      -0.779      -0.016
ma.L1         -0.1486      0.176     -0.846      0.398      -0.493       0.196
sigma2      1838.2442    299.737      6.133      0.000    1250.770    2425.719
===================================================================================
Ljung-Box (L1) (Q):                   0.24   Jarque-Bera (JB):                 0.09
Prob(Q):                              0.62   Prob(JB):                         0.95
Heteroskedasticity (H):               7.36   Skew:                            -0.01
Prob(H) (two-sided):                  0.00   Kurtosis:                         3.17
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
