In [63]:
DATA_PATH = "/home/david/code/datathon24-personal/data/skylab_instagram_datathon_dataset.csv"
# DATA_PATH = "/Users/flohmann/Documents/ETH/FS2024/datathon24/skylab_instagram_datathon_dataset.csv"
import pandas as pd 
import numpy as np

df = pd.read_csv(DATA_PATH, sep=";")


def video_to_picture_ratio(df, videos_col, pictures_col):
    df['video_picture_ratio'] = df[videos_col] / df[pictures_col]
    return df


def calculate_rolling_average_per_brand(data, brand_column, window_size=7):
    """
    Calculate the rolling average of 'engagement' for each brand in a DataFrame.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        brand_column (str): The column name which identifies the brand.
        window_size (int): The number of observations used for calculating the rolling average.

    Returns:
        pd.DataFrame: The DataFrame with a new column for the rolling average of 'engagement' calculated per brand.
    """
    # Group by brand and calculate rolling average within each group
    data['rolling_avg_engagement'] = data.groupby(brand_column)['engagement'].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    return data


def calculate_exponential_moving_average_per_brand(data, brand_column, span=7):
    """
    Calculate the exponential moving average of 'engagement' for each brand in a DataFrame.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        brand_column (str): The column name which identifies the brand.
        span (int): The decay in terms of the span of the exponential window.

    Returns:
        pd.DataFrame: The DataFrame with a new column for the exponential moving average of 'engagement' calculated per brand.
    """
    # Group by brand and calculate EMA within each group
    data['ewma_engagement'] = data.groupby(brand_column)['engagement'].transform(lambda x: x.ewm(span=span, adjust=False).mean())
    return data


def calculate_brand_wise_growth_rates(data, column_names, brand_column):
    """
    Calculate the growth rate for specified columns in a DataFrame, grouped by brand.
    Replace NaN values with 0 where growth rate cannot be calculated.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        column_names (list of str): A list of column names to calculate the growth rate.
        brand_column (str): The column name which identifies the brand.

    Returns:
        pd.DataFrame: The DataFrame with new columns for each specified column's growth rate, calculated for each brand and NaN replaced by 0.
    """
    for column in column_names:
        # Calculate growth rate within each brand group
        data[f'growth_rate_{column}'] = data.groupby(brand_column)[column].pct_change() * 100
        # Replace NaN values with 0
        data[f'growth_rate_{column}'].fillna(0, inplace=True)
    return data

def calculate_brand_rolling_statistics(data, column_name, brand_column, window_size=7):
    """
    Calculate rolling statistics for a specified column in a DataFrame, grouped by brand,
    and replace NaN values with 0.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        column_name (str): The column name to calculate rolling statistics.
        brand_column (str): The column name which identifies the brand.
        window_size (int): The number of observations used for calculating the rolling statistic.

    Returns:
        pd.DataFrame: The DataFrame with new columns for each rolling statistic of the specified column, calculated for each brand.
    """
    grouped = data.groupby(brand_column)[column_name]
    data[f'{column_name}_rolling_min'] = grouped.transform(lambda x: x.rolling(window=window_size, min_periods=1).min()).fillna(0)
    data[f'{column_name}_rolling_max'] = grouped.transform(lambda x: x.rolling(window=window_size, min_periods=1).max()).fillna(0)
    data[f'{column_name}_rolling_std'] = grouped.transform(lambda x: x.rolling(window=window_size, min_periods=1).std()).fillna(0)

    return data

def create_brand_lag_features(data, column_name, brand_column, lag_periods):
    """
    Create lag features for a specified column in a DataFrame, grouped by brand,
    and replace NaN values with 0.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        column_name (str): The column name to create lag features for.
        brand_column (str): The column name which identifies the brand.
        lag_periods (int): The number of lag periods.

    Returns:
        pd.DataFrame: The DataFrame with new columns for each lag feature of the specified column, calculated for each brand.
    """
    for i in range(1, lag_periods + 1):
        data[f'{column_name}_lag_{i}'] = data.groupby(brand_column)[column_name].shift(i).fillna(0)

    return data


def preprocess_data(df, missing_values_cutoff=0.7, test_fraction=0.2):

    df = df.drop(columns=["period", "calculation_type", "compset", "compset_group", "legal_entity_name", "ultimate_parent_legal_entity_name", "primary_exchange_name"])
    df["period_end_date"] = pd.to_datetime(df["period_end_date"])

    df = df.rename(columns={'business_entity_doing_business_as_name': 'brand', 'period_end_date': 'date'})

    # Remove specific brands, invalid or with too much missing data
    df = df[df['brand'] != "All Brands"]
    df = df[df['brand'] != "Boca"]
    df = df[df['brand'] != "Bulgari Beauty"]

    df = df.groupby(['brand', 'date']).first().reset_index()

    def correct_country_name(name):
        country_map = {
            "Hong Kong": "China",
            "China;Hong Kong": "China",
            ";France": "France",
            ";": None,
            "Belgium;": "Belgium"
        }
        if name in country_map.keys():
            return country_map[name]
        return name

    df['domicile_country_name'] = df['domicile_country_name'].apply(correct_country_name)


    categorical_features = ["domicile_country_name"]
    for feature in categorical_features:
        df = pd.get_dummies(df, columns=[feature], prefix=feature, dummy_na=True, dtype=int)

    # TODO: remove bad data
    na_frac = df[['brand', 'followers', 'pictures',
        'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))

    bad_brands = list(na_frac[na_frac.max(axis=1) > missing_values_cutoff].index)

    df = df[~df['brand'].isin(bad_brands)]


    # TODO: add additional features
    df['engagement'] = df['comments'] + df['likes']
    df['engagement_rate'] = df['engagement']/df['followers']
    df['engagement_rate_per_post'] =  df['engagement_rate']/(df['videos'] + df['pictures'])


    # df = video_to_picture_ratio(df, "videos", "pictures")
    df = calculate_rolling_average_per_brand(df, "brand", window_size=4)
    df = calculate_exponential_moving_average_per_brand(df, "brand", span=4)
    # df = calculate_brand_wise_growth_rates(df, ["comments", "likes", "followers", "pictures", "videos"], "brand")
    for c in ["comments", "likes", "followers", "pictures", "videos"]:
        df = calculate_brand_rolling_statistics(df, c, "brand", window_size=4)
        df = create_brand_lag_features(df, c, "brand", 4)

    
    # Step 1: Sort the dataframe by time
    df_sorted = df.sort_values(by='date')

    # Step 2: Group the dataframe by 'Brand'
    grouped = df_sorted.groupby('brand')

    # Step 3: Define an empty dataframe for train and test sets
    train_df = pd.DataFrame(columns=df.columns)  # Columns same as original dataframe
    test_df = pd.DataFrame(columns=df.columns)   # Columns same as original dataframe

    # Step 4: Iterate over each group and split into train and test sets
    for _, group in grouped:
        n_rows = len(group)
        n_test = int(test_fraction * n_rows)  # 20% of rows for test set

        # Add last 20% of rows to test set
        test_df = pd.concat([test_df, group.iloc[-n_test:]])

        # Add remaining rows to train set
        train_df = pd.concat([train_df, group.iloc[:-n_test]])

    # Step 5: Reset index for both train and test dataframes
    train_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)


    # normalize values
    normalize_cols = [c for c in train_df.columns if c not in ['brand', 'date'] and not c.startswith("domicile_country_name")]
    for col in normalize_cols:
        m = train_df[col].mean()
        s = train_df[col].std()
        train_df[col] = (train_df[col] - m)/s
        test_df[col] = (test_df[col] - m)/s


  
    # impute missing values
    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    train_df = train_df.reset_index(drop=True)

    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    train_df = train_df.reset_index(drop=True)


    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    test_df = test_df.reset_index(drop=True)

    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    test_df = test_df.reset_index(drop=True)



    return train_df, test_df


def prepare_data_lstm(df, sequence_length=10, prediction_dist=4, missing_values_cutoff=0.7, test_fraction=0.2):

    df = df.drop(columns=["period", "calculation_type", "compset", "compset_group", "legal_entity_name", "ultimate_parent_legal_entity_name", "primary_exchange_name"])
    df["period_end_date"] = pd.to_datetime(df["period_end_date"])

    df = df.rename(columns={'business_entity_doing_business_as_name': 'brand', 'period_end_date': 'date'})

    df = df[df['brand'] != "All Brands"]
    df = df[df['brand'] != "Boca"]
    df = df[df['brand'] != "Bulgari Beauty"]

    df = df.groupby(['brand', 'date']).first().reset_index()

    def correct_country_name(name):
        country_map = {
            "Hong Kong": "China",
            "China;Hong Kong": "China",
            ";France": "France",
            ";": None,
            "Belgium;": "Belgium"
        }
        if name in country_map.keys():
            return country_map[name]
        return name
    
    df = df.drop(columns=['domicile_country_name'])


    # remove bad data
    na_frac = df[['brand', 'followers', 'pictures',
        'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))

    bad_brands = list(na_frac[na_frac.max(axis=1) > missing_values_cutoff].index)

    df = df[~df['brand'].isin(bad_brands)]


    # compute labels
    df['engagement'] = df['comments'] + df['likes']
    df['engagement_rate'] = df['engagement']/df['followers']
    df['engagement_rate_per_post'] =  df['engagement_rate']/(df['videos'] + df['pictures'])


    for c in ["comments", "likes", "followers", "pictures", "videos"]:
        df = calculate_brand_rolling_statistics(df, c, "brand", window_size=4)
        df = create_brand_lag_features(df, c, "brand", 4)

    # Sort the dataframe by time
    df_sorted = df.sort_values(by='date')

    # Group the dataframe by 'Brand'
    grouped = df_sorted.groupby('brand')

    # Define an empty dataframe for train and test sets
    train_df = pd.DataFrame(columns=df.columns)  # Columns same as original dataframe
    test_df = pd.DataFrame(columns=df.columns)   # Columns same as original dataframe

    # Iterate over each group and split into train and test sets
    for _, group in grouped:
        n_rows = len(group)
        n_test = int(test_fraction * n_rows)  # 20% of rows for test set

        # Add last 20% of rows to test set
        test_df = pd.concat([test_df, group.iloc[-n_test:]])

        # Add remaining rows to train set
        train_df = pd.concat([train_df, group.iloc[:-n_test]])

    # Step 5: Reset index for both train and test dataframes
    train_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)


    # normalize values

    normalize_cols = [c for c in train_df.columns if c not in ['brand', 'date'] and not c.startswith("domicile_country_name")]
    for col in normalize_cols:
        m = train_df[col].mean()
        s = train_df[col].std()
        train_df[col] = (train_df[col] - m)/s
        test_df[col] = (test_df[col] - m)/s


    # impute missing values
    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    train_df = train_df.reset_index(drop=True)

    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    train_df = train_df.reset_index(drop=True)


    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    test_df = test_df.reset_index(drop=True)

    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    test_df = test_df.reset_index(drop=True)


    label_col = "engagement_rate_per_post"

    train_sequences = []
    train_labels = []
    test_sequences = []
    test_labels = []

    test_brands = []

    for bi, brand in enumerate(df['brand'].unique()):

        brand_train_df = train_df[train_df['brand'] == brand]

        cols = [c for c in train_df.columns if (c != "brand" and c != "date")]
        brand_train_df = brand_train_df[cols]

        for i in range(len(brand_train_df) - (sequence_length + prediction_dist)):
            

            sequence_labels = brand_train_df.iloc[i+sequence_length+prediction_dist][label_col]
            train_labels.append(sequence_labels)

            sequence = brand_train_df.iloc[i:i + sequence_length]

            # remove labels
            sequence = sequence.drop(columns=["engagement", "engagement_rate", "engagement_rate_per_post"])
            
            sequence = sequence.values
            train_sequences.append(sequence)
        
        brand_test_df = test_df[test_df['brand'] == brand]

        brand_test_df = brand_test_df[cols]


        for i in range(len(brand_test_df) - (sequence_length + prediction_dist)):

            sequence_labels = brand_test_df.iloc[i+sequence_length+prediction_dist][label_col]

            sequence = brand_test_df.iloc[i:i + sequence_length]

            # remove labels
            sequence = sequence.drop(columns=["engagement", "engagement_rate", "engagement_rate_per_post"])

            sequence = sequence.values
            if (np.isnan(sequence).any()):
                continue

            if np.isnan(sequence_labels).any():
                continue 
            
            test_sequences.append(sequence)

            test_labels.append(sequence_labels)

            test_brands.append(brand)


    return train_sequences, train_labels, test_sequences, test_labels, 



In [64]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

# Global device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.backends.mps.is_available():

    torch.device('mps')

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.fc2 = nn.Linear(hidden_size//2, output_size)
        self.relu = nn.ReLU()
        # self.bn = nn.BatchNorm1d(hidden_size//2)



    def forward(self, x):
        # Forward pass through LSTM layer
        # x shape: (batch, sequence_length, input_size)
        out, (h_t, c_t) = self.lstm(x)

        # We use the last hidden state to predict the output
        # h_t shape: (num_layers, batch, hidden_size)
        # Take the last layer's hidden state
        h_t_last_layer = h_t[-1]

        # Forward pass through the fully connected layer
        x = self.relu(self.fc1(h_t_last_layer))
        # x = self.bn(x)
        out = self.fc2(x)


        return out

def initialize_data_loader(X, y, batch_size):
    # Convert list of numpy arrays to a list of torch tensors
    X_tensors = [torch.tensor(sequence, dtype=torch.float32) for sequence in X]
    
    # Since there is no need to pad, we can assume all sequences are of equal length or
    # handling variable length isn't required. We'll directly stack them into a single tensor.
    # If sequences are of different lengths and you decide later to handle it, adjustments will be needed here.
    X_tensor = torch.stack(X_tensors)
    
    # Convert list of single values into a torch tensor
    y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)  # Reshape y to be a column vector
    
    # Create tensor dataset
    dataset = TensorDataset(X_tensor, y_tensor)
    
    # Create data loader
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def train_model(model, data_loader, criterion, optimizer, num_epochs, device):
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}\n")
        model.train()
        total_loss = 0

        num_samples = len(data_loader)

        for i, (x_batch, y_batch) in enumerate(data_loader):
            print(f"\rTraining: {int(100*i/num_samples)}%  ", end="")
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # if (epoch + 1) % 10 == 0:
        print(f'\nEpoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(data_loader):.4f}\n')

def test_model(model, data_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No gradients needed
        for x_batch, y_batch in data_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()

    print(f'Test Loss: {total_loss / len(data_loader):.4f}')



In [58]:
df = pd.read_csv(DATA_PATH, sep=";")

X_train, y_train, X_test, y_test = prepare_data_lstm(df)

  'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))
  test_df = pd.concat([test_df, group.iloc[-n_test:]])
  train_df = pd.concat([train_df, group.iloc[:-n_test]])
  train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
  train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
  test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
  test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))


In [65]:

# Hyperparameters
input_size = X_train[0].shape[1]
hidden_size = 64
num_layers =2
output_size = 1
learning_rate = 0.001
num_epochs = 20
batch_size = 64

# Model setup
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
model = model.to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Initialize DataLoader
data_loader = initialize_data_loader(X_train , y_train , batch_size)

# Train the model
train_model(model, data_loader, criterion, optimizer, num_epochs, device)





Epoch 1

Training: 99%  
Epoch [1/20], Loss: 0.9185


Epoch 2

Training: 99%  
Epoch [2/20], Loss: 0.8510


Epoch 3

Training: 99%  
Epoch [3/20], Loss: 0.8248


Epoch 4

Training: 99%  
Epoch [4/20], Loss: 0.8002


Epoch 5

Training: 99%  
Epoch [5/20], Loss: 0.7884


Epoch 6

Training: 99%  
Epoch [6/20], Loss: 0.7779


Epoch 7

Training: 99%  
Epoch [7/20], Loss: 0.7622


Epoch 8

Training: 99%  
Epoch [8/20], Loss: 0.7620


Epoch 9

Training: 99%  
Epoch [9/20], Loss: 0.7571


Epoch 10

Training: 99%  
Epoch [10/20], Loss: 0.7512


Epoch 11

Training: 99%  
Epoch [11/20], Loss: 0.7405


Epoch 12

Training: 99%  
Epoch [12/20], Loss: 0.7329


Epoch 13

Training: 99%  
Epoch [13/20], Loss: 0.7302


Epoch 14

Training: 99%  
Epoch [14/20], Loss: 0.7243


Epoch 15

Training: 99%  
Epoch [15/20], Loss: 0.7179


Epoch 16

Training: 99%  
Epoch [16/20], Loss: 0.7531


Epoch 17

Training: 99%  
Epoch [17/20], Loss: 0.7195


Epoch 18

Training: 99%  
Epoch [18/20], Loss: 0.7079


Epoch 19


In [66]:

# Initialize DataLoader for test data
test_data_loader = initialize_data_loader(X_test, y_test, batch_size)


test_model(model, data_loader, criterion, device)

Test Loss: 0.7803


In [67]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

X_train_reg = [x[-1] for x in X_train]

model.fit(X_train_reg, y_train)

from sklearn.metrics import mean_squared_error

X_test_reg = [x[-1] for x in X_test]

y_hat = model.predict(X_test_reg)

mean_squared_error(y_hat, y_test)