# Data Exploration

- interactive meaningful visualizations
- some descriptive metrics of the dataset

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read data

In [None]:
# Location of the dataset. Set the DATATHON_DATA_PATH environment variable to
# point at your own copy of the CSV; the original author's local path is kept
# only as a fallback so existing setups keep working.
PATH = os.environ.get(
    "DATATHON_DATA_PATH",
    "/Users/flohmann/Documents/ETH/FS2024/datathon24/skylab_instagram_datathon_dataset.csv",
)

# the file is read with ';' as the field delimiter
X = pd.read_csv(PATH, sep=';')

## General properties

In [None]:
# number of distinct values per column (missing values are not counted)
X.nunique(axis=0, dropna=True)

We can remove calculation_type and period since they do not provide any information

In [None]:
# some basic dataset statistics
nrow = len(X.index)
nbrand = X.business_entity_doing_business_as_name.nunique()
start_date = X.period_end_date.min()
end_date = X.period_end_date.max()

# number of distinct brands in each compset group, computed once
group_sizes = X.groupby('compset_group').business_entity_doing_business_as_name.nunique()

# smallest and largest compset groups
# (the two names used to hold each other's value; the printed text is unchanged)
ngroup = X.compset_group.nunique()
min_group = group_sizes.min()
max_group = group_sizes.max()

print(f'The dataset contains {nrow} observations from {nbrand} brands that were recorded in the period from {start_date} to {end_date}.')
print(f'Brands are grouped into {ngroup} main competitive sets that vary in size from {min_group} brands to {max_group} brands')

In [None]:
# distinct brands per compset group, largest group first
brands_per_group = (
    X.groupby('compset_group')['business_entity_doing_business_as_name']
    .nunique()
    .sort_values(ascending=False)
)

group_ax = brands_per_group.plot.bar()
group_ax.set_ylabel('# brands')
group_ax.set_xlabel('group name')
group_ax.set_title('Main competitive group sizes')
plt.show()

## Distribution of data over time

Are the dates equally distributed for every brand?
This is relevant for applying LSTM models.

In [None]:
# Distinct "last recorded date" values across brands. The statement that all
# brands stop on the same date only holds if exactly one value remains, so
# check that instead of blindly printing the first entry.
last_dates = X.groupby('business_entity_doing_business_as_name')['period_end_date'].max().unique()

if len(last_dates) == 1:
    print(f"Recording stops on the same date for all brands : {last_dates[0]}")
else:
    print(f"Recording stops on {len(last_dates)} different dates across brands : {sorted(last_dates)}")

In [None]:
# earliest recorded date of every brand, i.e. where its time series begins
first_dates = (
    X.groupby('business_entity_doing_business_as_name')['period_end_date']
    .min()
    .sort_values()
)

fig, ax = plt.subplots()
first_dates.hist(ax=ax, bins=20, xrot=90, grid=False)

# keep only every fifth tick (and its label) on the x axis
all_labels = ax.get_xticklabels()
all_positions = ax.get_xticks()
ax.set_xticks(all_positions[::5], all_labels[::5])

ax.set_ylabel('# brands')
ax.set_title('Distribution of starting dates')

fig.set_size_inches(7, 5)  # width, height
fig.tight_layout()

print('Recording of different brands starts at different dates, but the majority starts on the same date')

In [None]:
# first recorded date per brand; count those later than the 2015-01-03 cutoff
brand_start = X.groupby('business_entity_doing_business_as_name')['period_end_date'].min()
n_late = (brand_start > '2015-01-03').sum()
print(f"{n_late} brands start at a later date")

## Duplicate values

In [None]:
# every column except the two compset columns; rows that agree on all of
# these differ only in the compset / compset_group they are listed under
cols = [c for c in X.columns if c not in ('compset_group', 'compset')]

# rows flagged as repeats of an earlier row are exactly the ones drop_duplicates would remove
n_duplicates = int(X.duplicated(subset=cols).sum())

print(f'{n_duplicates} datapoints occurr as duplicates in multiple compsets or compset_groups')

In [None]:
# keep the first occurrence of every row that is identical on `cols`
X_uniq = X.loc[~X.duplicated(subset=cols)]

In [None]:
# distinct compsets per compset group, summed over all groups
# (a compset that appears in several groups is counted once per group)
X.groupby('compset_group')['compset'].nunique().sum()

In [None]:
# compset_group values grouped by compset
groups_by_compset = X.groupby('compset')['compset_group']

# True for compsets that are listed under more than one compset group
msk = groups_by_compset.nunique().gt(1)

# the group names each of those compsets appears in
groups_by_compset.unique()[msk]

In [None]:
# number of compset groups for each compset flagged by `msk`
X.groupby('compset')['compset_group'].nunique().loc[msk]

In [None]:
# names of the columns that hold at least one missing value
nan_cols = X.columns[X.isna().any(axis=0)].tolist()
print(f'Columns with nan values are {nan_cols}')

## Investigate nan values

In [None]:
# columns in which every value is present
X.columns[X.notna().all(axis=0)].tolist()

All brands are either traded at the same stock exchange over the entire recording period or at no exchange at all

In [None]:
# number of brands that are recorded with more than one distinct stock exchange
exchanges_per_brand = X.groupby('business_entity_doing_business_as_name')['primary_exchange_name'].nunique()
exchanges_per_brand.gt(1).sum()

Are stock prices a viable metric for validation? We need to check whether all brands are registered at some stock exchange

In [None]:
# brands with at least one row lacking an exchange name, out of all brands
brand_names = X['business_entity_doing_business_as_name']
n_without_exchange = brand_names[X['primary_exchange_name'].isna()].nunique()
print(f'{n_without_exchange}/{brand_names.nunique()} brands have no stock exchange recorded')

Columns that do not contain any nans can potentially be used to construct a unique identifier. We use `period_end_date` and `business_entity_doing_business_as_name`.

### Nans in numeric columns

Now we look at the rows that contain missing values in the numeric columns to better understand where these occur.

In [None]:
# missing values per column over the last five columns of the de-duplicated data
# NOTE(review): presumably these are the numeric metric columns — confirm column order
X_uniq.iloc[:, -5:].isna().sum()

In [None]:
# True for rows with a missing value in any of the last five columns
msk_nanrows = X_uniq.iloc[:, -5:].isna().any(axis=1)

# how many such rows exist
msk_nanrows.sum()

In [None]:
# number of distinct brands that own at least one of the rows flagged by msk_nanrows
X_uniq.loc[msk_nanrows, 'business_entity_doing_business_as_name'].nunique()

In [None]:
# for every brand, the number of rows flagged by msk_nanrows
nan_rows_per_brand = X_uniq.loc[msk_nanrows, 'business_entity_doing_business_as_name'].value_counts()

# histogram: how many brands have how many flagged rows
nan_ax = nan_rows_per_brand.hist(bins=100)
nan_ax.set_xlabel('# nan values')
nan_ax.set_ylabel('# brands')

Are there brands that always have nans for some attribute?

In [None]:
# Per-brand fraction of missing values in each numeric metric
# (rows = brands, columns = metrics).
#
# The metric columns are selected by name rather than by position: inside
# groupby().apply() the grouping column is not guaranteed to be the first
# column of each chunk (newer pandas versions leave it out), so slicing with
# `iloc[:, 1:]` could silently drop `followers`. The mean of the boolean
# isna() mask equals (number of missing values) / (number of rows) per brand.
metric_cols = ['followers', 'pictures', 'videos', 'comments', 'likes']
na_frac = (
    X_uniq[metric_cols]
    .isna()
    .groupby(X_uniq['business_entity_doing_business_as_name'])
    .mean()
)

In [None]:
# brands for which at least one column of na_frac exceeds 0.7
na_frac.loc[na_frac.max(axis=1).gt(0.7)]

### Country names
Some country names contain semicolons; we want to strip these extra characters.

In [None]:
# distinct values of the country column, shown for visual inspection
X['domicile_country_name'].unique()