# Observations

- period and calculation_type features all the same value, can drop
- split time series into batches of shorter sequences
- how to impute intermediate missing values? -> take last? interpolate?
- add year, month, absolute int as features for date

- one hot encoding of categorical features: too many, not important enough... drop? keep as single numerical feature?

- duplicate datapoints, same brand, same date, same values - two different compsets...

# TODO:
- align the data by brand
- are there weeks missing in between?
- normalize values


# Discussion
- should we predict by brand or by legal entity?


- standardization in the end after adding additional metrics

In [89]:
import pandas as pd 

In [163]:
DATA_PATH = "/home/david/code/datathon24-personal/data/skylab_instagram_datathon_dataset.csv"

df = pd.read_csv(DATA_PATH, sep=";")

def prepare_data(df, sequence_length=10, prediction_dist=4, missing_values_cutoff=0.7, test_fraction=0.2):

    df = df.drop(columns=["period", "calculation_type", "compset", "compset_group", "legal_entity_name", "ultimate_parent_legal_entity_name", "primary_exchange_name"])
    df["period_end_date"] = pd.to_datetime(df["period_end_date"])

    df = df.rename(columns={'business_entity_doing_business_as_name': 'brand', 'period_end_date': 'date'})

    df = df[df['brand'] != "All Brands"]

    df = df.groupby(['brand', 'date']).first().reset_index()

    def correct_country_name(name):
        country_map = {
            "Hong Kong": "China",
            "China;Hong Kong": "China",
            ";France": "France",
            ";": None,
            "Belgium;": "Belgium"
        }
        if name in country_map.keys():
            return country_map[name]
        return name

    df['domicile_country_name'] = df['domicile_country_name'].apply(correct_country_name)


    categorical_features = ["domicile_country_name"]
    for feature in categorical_features:
        df = pd.get_dummies(df, columns=[feature], prefix=feature, dummy_na=True, dtype=int)

    # TODO: remove bad data
    na_frac = df[['brand', 'followers', 'pictures',
        'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))

    bad_brands = list(na_frac[na_frac.max(axis=1) > missing_values_cutoff].index)

    df = df[~df['brand'].isin(bad_brands)]


    # TODO: add additional features
    df['engagement'] = df['comments'] + df['likes']
    df['engagement_rate'] = df['engagement']/df['followers']
    df['engagement_rate_per_post'] =  df['engagement_rate']/(df['videos'] + df['pictures'])


    # TODO: normalize values
    normalize_cols = ["followers", "pictures", "videos", "comments", "likes", "engagement", "engagement_rate", "engagement_rate_per_post"]
    for col in normalize_cols:
        df[col] = (df[col] - df[col].mean())/df[col].std()

    # TODO: impute missing values
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    

    # TODO: create sequences

    # TODO: split df into brands
    # sequence per brand
    # label

    label_col = "engagement_rate_per_post"

    train_sequences = []
    train_labels = []
    test_sequences = []
    test_labels = []


    for bi, brand in enumerate(df['brand'].unique()):
        branddf = df[df['brand'] == brand]
        n_test = int(len(branddf) * test_fraction)
        n_train = len(branddf) - n_test

        traindf = branddf[:n_train]

        cols = [c for c in traindf.columns if (c != "brand" and c != "date")]
        traindf = traindf[cols]

        for i in range(len(traindf) - (sequence_length + prediction_dist)):
            sequence = traindf.iloc[i:i + sequence_length].values
            train_sequences.append(sequence)

            sequence_labels = traindf.iloc[i+sequence_length+prediction_dist][label_col]
            train_labels.append(sequence_labels)
        

        testdf = branddf[n_train:]
        testdf = testdf[cols]

        

        for i in range(len(testdf) - (sequence_length + prediction_dist)):
            sequence = testdf.iloc[i:i + sequence_length].values
            test_sequences.append(sequence)

            sequence_labels = testdf.iloc[i+sequence_length+prediction_dist][label_col]
            test_labels.append(sequence_labels)


    return train_sequences, train_labels, test_sequences, test_labels

X_train, y_train, X_test, y_test = prepare_data(df)


  'videos', 'comments', 'likes']].groupby('brand').apply(lambda x: x.iloc[:,1:].isna().sum()/len(x))
  df = df.fillna(method='ffill')
  df = df.fillna(method='bfill')
