# Post impression regression

## Notebook set-up

In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from scipy import stats

## 1. Data loading

In [2]:
# Read the posts table from the CSV file in the sibling data directory
data_df = pd.read_csv(filepath_or_buffer='../data/posts.csv')

# Preview the first five rows (the default row count of head())
data_df.head(n=5)

In [3]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage)
data_df.info()

## 2. EDA
### 2.1. Feature distributions
#### 2.1.1. Categorical features

In [None]:
# Categorical columns to summarise, one bar chart per column
categorical_features = ['external_link', 'media', 'post_day']

fig, axs = plt.subplots(1, 3, figsize=(9, 3.5))

fig.suptitle('Categorical Features Distribution')

for i, categorical_feature in enumerate(categorical_features):

    # Count each level once and reuse the result for both bars and tick labels
    level_counts = data_df[categorical_feature].value_counts()

    axs[i].bar(level_counts.index, level_counts.values, color='black')

    if categorical_feature == 'post_day':
        # One tick per day label, in the same (frequency-sorted) order as the bars
        axs[i].set_xticks(range(len(level_counts)))
        axs[i].set_xticklabels(level_counts.index, rotation=45)

    else:
        # NOTE(review): assumes these flags are encoded as 0/1 - confirm against the data
        axs[i].set_xticks([0, 1])
        axs[i].set_xticklabels(['No', 'Yes'])

    axs[i].set_title(categorical_feature)
    axs[i].set_ylabel('Counts')

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

#### 2.1.2. Numerical features

In [5]:
# Numerical columns to summarise, one 30-bin histogram per column
numerical_features = ['impressions', 'word_count', 'n_tags']

fig, axs = plt.subplots(1, 3, figsize=(9, 3))

fig.suptitle('Numerical Features Distribution')

for i, numerical_feature in enumerate(numerical_features):
    axs[i].hist(data_df[numerical_feature], bins=30, color='black')
    axs[i].set_title(numerical_feature)
    axs[i].set_xlabel(numerical_feature)
    axs[i].set_ylabel('Counts')

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

### 2.2. Feature interactions

#### 2.2.1. Categorical-categorical

In [None]:
# Every pairing of the categorical features: [row variable, column variable]
categorical_feature_pairs = [
    ['external_link', 'media'],
    ['external_link', 'post_day'],
    ['media', 'post_day']
]

# Calendar order used to arrange the post_day columns of the heatmaps
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

fig, axs = plt.subplots(1, 3, figsize=(12, 4))

fig.suptitle('Categorical feature contingency tables')

for i, feature_pair in enumerate(categorical_feature_pairs):

    # Rows come from the first feature of the pair, columns from the second
    contingency_table = pd.crosstab(data_df[feature_pair[0]], data_df[feature_pair[1]])

    # post_day only ever appears as the second (column) feature, so that is
    # the position to test; crosstab sorts its labels alphabetically otherwise
    if feature_pair[1] == 'post_day':

        # Keep only the days actually present, then append any unexpected
        # labels so no column is dropped and no KeyError is raised
        ordered_days = [day for day in weekday_order if day in contingency_table.columns]
        other_levels = [level for level in contingency_table.columns if level not in weekday_order]
        contingency_table = contingency_table[ordered_days + other_levels]

    # Chi-squared test of independence; element 1 of the result is the p-value
    # (the statistic does not depend on the column order chosen above)
    chisquared_result = stats.chi2_contingency(contingency_table)

    axs[i].set_title(f'{feature_pair[0]} x {feature_pair[1]}\n$Chi^2$ p={chisquared_result[1]:.4f}')
    sns.heatmap(contingency_table, annot=True, fmt='d', cmap='Blues', ax=axs[i])

fig.tight_layout()

# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

#### 2.2.2. Numerical-categorical