# Observations

- period and calculation_type features all the same value, can drop
- split time series into batches of shorter sequences
- how to impute intermediate missing values? -> take last? interpolate?
- add year, month, absolute int as features for date

- one hot encoding of categorical features: too many, not important enough... drop? keep as single numerical feature?

- duplicate datapoints, same brand, same date, same values - two different compsets...

# TODO:
- align the data by brand
- are there weeks missing in between?
- normalize values


# Discussion
- should we predict by brand or by legal entity?


- standardization in the end after adding additional metrics

In [1]:
import pandas as pd 

In [171]:
df = pd.read_csv(DATA_PATH, sep=";")


In [174]:
df = df[df['business_entity_doing_business_as_name'] != 'All Brands']

In [175]:
df['followers'].max()

430176998.0

In [177]:
df[df['followers'] == 430176998.0]

Unnamed: 0,period,period_end_date,compset_group,compset,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,calculation_type,followers,pictures,videos,comments,likes
499780,Weekly,2023-09-16,Sportswear & Athleisure,Sportswear,Nike,Nike,United States of America,Nike,New York Stock Exchange,Metric Value,430176998.0,196.0,139.0,42514.0,9194724.0
499961,Weekly,2023-09-16,Sportswear & Athleisure,US Softlines Analyst Interest List,Nike,Nike,United States of America,Nike,New York Stock Exchange,Metric Value,430176998.0,196.0,139.0,42514.0,9194724.0
500042,Weekly,2023-09-16,Sportswear & Athleisure,Sportswear & Athleisure,Nike,Nike,United States of America,Nike,New York Stock Exchange,Metric Value,430176998.0,196.0,139.0,42514.0,9194724.0


In [179]:
df[df['business_entity_doing_business_as_name'] == 'Nine West']

Unnamed: 0,period,period_end_date,compset_group,compset,business_entity_doing_business_as_name,legal_entity_name,domicile_country_name,ultimate_parent_legal_entity_name,primary_exchange_name,calculation_type,followers,pictures,videos,comments,likes
68827,Weekly,2016-08-27,Luxury & Premium & Mainstream,Footwear,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,,282.0,11.0,3154.0,162063.0
68828,Weekly,2016-03-19,Luxury & Premium & Mainstream,Luxury & Premium & Mainstream,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,,278.0,5.0,4547.0,123483.0
68829,Weekly,2023-08-26,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1966375.0,315.0,34.0,4235.0,17044.0
68830,Weekly,2018-11-03,Luxury & Premium & Mainstream,Footwear,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1101163.0,307.0,24.0,5819.0,197415.0
68831,Weekly,2019-03-16,Luxury & Premium & Mainstream,Luxury & Premium & Mainstream,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1196891.0,382.0,23.0,5812.0,237443.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70187,Weekly,2023-07-29,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1963930.0,361.0,40.0,3148.0,21942.0
70188,Weekly,2023-06-24,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1961784.0,370.0,41.0,425.0,37852.0
70189,Weekly,2019-12-07,Luxury & Premium & Mainstream,US Softlines Analyst Interest List,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,1456414.0,505.0,40.0,4015.0,171113.0
70190,Weekly,2018-03-24,Luxury & Premium & Mainstream,Footwear,Nine West,Premier Brands,,Sycamore Partners,,Metric Value,955823.0,350.0,17.0,4481.0,174004.0


In [33]:
DATA_PATH = "/home/david/code/datathon24-personal/data/skylab_instagram_datathon_dataset.csv"

df = pd.read_csv(DATA_PATH, sep=";")


def preprocess_data(df, missing_values_cutoff=0.7):

    df = df.drop(columns=["period", "calculation_type", "compset", "compset_group", "legal_entity_name", "ultimate_parent_legal_entity_name", "primary_exchange_name"])
    df["period_end_date"] = pd.to_datetime(df["period_end_date"])

    df = df.rename(columns={'business_entity_doing_business_as_name': 'brand', 'period_end_date': 'date'})

    df = df[df['brand'] != "All Brands"]
    df = df[df['brand'] != "Boca"]

    df = df.groupby(['brand', 'date']).first().reset_index()

    def correct_country_name(name):
        country_map = {
            "Hong Kong": "China",
            "China;Hong Kong": "China",
            ";France": "France",
            ";": None,
            "Belgium;": "Belgium"
        }
        if name in country_map.keys():
            return country_map[name]
        return name

    df['domicile_country_name'] = df['domicile_country_name'].apply(correct_country_name)


    categorical_features = ["domicile_country_name"]
    for feature in categorical_features:
        df = pd.get_dummies(df, columns=[feature], prefix=feature, dummy_na=True, dtype=int)

    # TODO: remove bad data
    na_frac = df[['brand', 'followers', 'pictures',
        'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))

    bad_brands = list(na_frac[na_frac.max(axis=1) > missing_values_cutoff].index)

    df = df[~df['brand'].isin(bad_brands)]


    # TODO: add additional features
    df['engagement'] = df['comments'] + df['likes']
    df['engagement_rate'] = df['engagement']/df['followers']
    df['engagement_rate_per_post'] =  df['engagement_rate']/(df['videos'] + df['pictures'])



    # TODO: normalize values
    normalize_cols = ["followers", "pictures", "videos", "comments", "likes", "engagement", "engagement_rate", "engagement_rate_per_post"]

    for col in normalize_cols:
        m = df[col].mean()
        s = df[col].std()
        df[col] = (df[col] - m)/s

    # TODO: impute missing values
  
    # TODO: impute missing values
    df = df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    df = df.reset_index(drop=True)

    df = df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    df = df.reset_index(drop=True)


    return df


def prepare_data_lstm(df, sequence_length=10, prediction_dist=4, missing_values_cutoff=0.7, test_fraction=0.2):

    df = df.drop(columns=["period", "calculation_type", "compset", "compset_group", "legal_entity_name", "ultimate_parent_legal_entity_name", "primary_exchange_name"])
    df["period_end_date"] = pd.to_datetime(df["period_end_date"])

    df = df.rename(columns={'business_entity_doing_business_as_name': 'brand', 'period_end_date': 'date'})

    df = df[df['brand'] != "All Brands"]
    df = df[df['brand'] != "Boca"]

    df = df.groupby(['brand', 'date']).first().reset_index()

    def correct_country_name(name):
        country_map = {
            "Hong Kong": "China",
            "China;Hong Kong": "China",
            ";France": "France",
            ";": None,
            "Belgium;": "Belgium"
        }
        if name in country_map.keys():
            return country_map[name]
        return name

    df['domicile_country_name'] = df['domicile_country_name'].apply(correct_country_name)


    categorical_features = ["domicile_country_name"]
    for feature in categorical_features:
        df = pd.get_dummies(df, columns=[feature], prefix=feature, dummy_na=True, dtype=int)

    # TODO: remove bad data
    na_frac = df[['brand', 'followers', 'pictures',
        'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))

    bad_brands = list(na_frac[na_frac.max(axis=1) > missing_values_cutoff].index)

    df = df[~df['brand'].isin(bad_brands)]


    # TODO: add additional features
    df['engagement'] = df['comments'] + df['likes']
    df['engagement_rate'] = df['engagement']/df['followers']
    df['engagement_rate_per_post'] =  df['engagement_rate']/(df['videos'] + df['pictures'])



    # Step 1: Sort the dataframe by time
    df_sorted = df.sort_values(by='date')

    # Step 2: Group the dataframe by 'Brand'
    grouped = df_sorted.groupby('brand')

    # Step 3: Define an empty dataframe for train and test sets
    train_df = pd.DataFrame(columns=df.columns)  # Columns same as original dataframe
    test_df = pd.DataFrame(columns=df.columns)   # Columns same as original dataframe

    # Step 4: Iterate over each group and split into train and test sets
    for _, group in grouped:
        n_rows = len(group)
        n_test = int(test_fraction * n_rows)  # 20% of rows for test set

        # Add last 20% of rows to test set
        test_df = pd.concat([test_df, group.iloc[-n_test:]])

        # Add remaining rows to train set
        train_df = pd.concat([train_df, group.iloc[:-n_test]])

    # Step 5: Reset index for both train and test dataframes
    train_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)


    # TODO: normalize values
    normalize_cols = ["followers", "pictures", "videos", "comments", "likes", "engagement", "engagement_rate", "engagement_rate_per_post"]

    for col in normalize_cols:
        m = train_df[col].mean()
        s = train_df[col].std()
        train_df[col] = (train_df[col] - m)/s
        test_df[col] = (test_df[col] - m)/s


    # TODO: impute missing values
  
    # TODO: impute missing values
    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    train_df = train_df.reset_index(drop=True)

    train_df = train_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    train_df = train_df.reset_index(drop=True)


    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
    test_df = test_df.reset_index(drop=True)

    test_df = test_df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
    test_df = test_df.reset_index(drop=True)


    # TODO: create sequences

    # TODO: split df into brands
    # sequence per brand
    # label

    label_col = "engagement_rate_per_post"

    train_sequences = []
    train_labels = []
    test_sequences = []
    test_labels = []


    for bi, brand in enumerate(df['brand'].unique()):

        brand_train_df = train_df[train_df['brand'] == brand]

        cols = [c for c in train_df.columns if (c != "brand" and c != "date")]
        brand_train_df = brand_train_df[cols]

        for i in range(len(brand_train_df) - (sequence_length + prediction_dist)):
            sequence = brand_train_df.iloc[i:i + sequence_length].values
            train_sequences.append(sequence)

            sequence_labels = brand_train_df.iloc[i+sequence_length+prediction_dist][label_col]
            train_labels.append(sequence_labels)
        
        brand_test_df = test_df[test_df['brand'] == brand]

        brand_test_df = brand_test_df[cols]


        for i in range(len(brand_test_df) - (sequence_length + prediction_dist)):
            sequence = brand_test_df.iloc[i:i + sequence_length].values
            test_sequences.append(sequence)

            sequence_labels = brand_test_df.iloc[i+sequence_length+prediction_dist][label_col]
            test_labels.append(sequence_labels)


    return train_sequences, train_labels, test_sequences, test_labels

# X_train, y_train, X_test, y_test = prepare_data_lstm(df)


In [34]:
preprocess_data(df)

  'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))
  df = df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  df = df.groupby('brand').apply(lambda group: group.fillna(method='ffill'))
  df = df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))
  df = df.groupby('brand').apply(lambda group: group.fillna(method='bfill'))


Unnamed: 0,brand,date,followers,pictures,videos,comments,likes,domicile_country_name_Australia,domicile_country_name_Belgium,domicile_country_name_Brazil,...,domicile_country_name_Singapore,domicile_country_name_Spain,domicile_country_name_Sweden,domicile_country_name_Switzerland,domicile_country_name_United Kingdom of Great Britain and Northern Ireland,domicile_country_name_United States of America,domicile_country_name_nan,engagement,engagement_rate,engagement_rate_per_post
0,24S,2017-05-06,-0.286451,-0.429305,-0.302278,-0.177231,-0.251962,0,0,0,...,0,0,0,0,0,0,0,-0.253701,2.843436,0.996963
1,24S,2017-05-13,-0.286451,-0.429305,-0.302278,-0.177231,-0.251962,0,0,0,...,0,0,0,0,0,0,0,-0.253701,2.843436,0.996963
2,24S,2017-05-20,-0.286451,-0.429305,-0.302278,-0.177231,-0.251962,0,0,0,...,0,0,0,0,0,0,0,-0.253701,2.843436,0.996963
3,24S,2017-05-27,-0.286451,-0.429305,-0.302278,-0.177231,-0.251962,0,0,0,...,0,0,0,0,0,0,0,-0.253701,2.843436,0.996963
4,24S,2017-06-03,-0.286451,-0.362194,-0.302278,-0.176280,-0.251120,0,0,0,...,0,0,0,0,0,0,0,-0.252847,2.843436,0.996963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289787,shopDisney,2023-08-19,0.029807,0.540083,0.369886,-0.121598,-0.148823,0,0,0,...,0,0,0,0,0,1,0,-0.150272,-0.450343,-0.576640
289788,shopDisney,2023-08-26,0.030007,0.536355,0.493344,-0.114374,-0.129334,0,0,0,...,0,0,0,0,0,1,0,-0.130798,-0.408335,-0.572330
289789,shopDisney,2023-09-02,0.030318,0.547540,0.493344,-0.106583,-0.119321,0,0,0,...,0,0,0,0,0,1,0,-0.120706,-0.386775,-0.570085
289790,shopDisney,2023-09-09,0.030706,0.513984,0.424756,-0.110607,-0.121882,0,0,0,...,0,0,0,0,0,1,0,-0.123329,-0.392793,-0.569314
