### 1. Import dependencies

In [1]:
import pandas as pd
import seaborn as sns

### 2. Loading Data

In [3]:
# Read the outlier-handled dataset produced earlier in the pipeline and
# preview the first rows to confirm the expected columns are present.
source_csv = 'data/processed/Outliers_Handled.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low
4,Instagram,#Challenge,Shorts,Australia,1323566,136282,86979,47129,Low


### 3. Creating new features

#### 3.1 Engagement rates 

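##### We calculate per-interaction rates (`Like_Rate`, `Share_Rate`, `Comment_Rate`) and an overall `Engagement_Rate`, each relative to `Views`, to measure how much viewers engage with a post relative to its reach.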


In [9]:
# Each rate is the raw interaction count divided by the post's view count.
# The mapping is ordered so the new columns are appended as
# Like_Rate, Share_Rate, Comment_Rate.
rate_sources = {
    'Like_Rate': 'Likes',
    'Share_Rate': 'Shares',
    'Comment_Rate': 'Comments',
}
for rate_col, count_col in rate_sources.items():
    df[rate_col] = df[count_col] / df['Views']
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096
4,Instagram,#Challenge,Shorts,Australia,1323566,136282,86979,47129,Low,0.102966,0.065716,0.035608


In [10]:
# Overall interaction count per post and its ratio to views.
# NOTE: the row-wise sum skips missing values, whereas the `+` used for the
# share-free variant below propagates them.
interaction_cols = ['Likes', 'Shares', 'Comments']
df['Total_Engagement'] = df[interaction_cols].sum(axis=1)
df['Engagement_Rate'] = df['Total_Engagement'].div(df['Views'])

# Share-free variants: when Shares is the prediction target, features must not
# contain Shares themselves, otherwise the target leaks into the inputs.
likes_plus_comments = df['Likes'] + df['Comments']
df['Total_Engagement_wo_Shares'] = likes_plus_comments
df['Engagement_Rate_wo_Shares'] = likes_plus_comments / df['Views']
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,Total_Engagement,Engagement_Rate,Total_Engagement_wo_Shares,Engagement_Rate_wo_Shares
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647,411912,0.098935,358777,0.086173
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554,308339,0.074192,242479,0.058345
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988,402789,0.109865,363366,0.099112
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096,175618,0.191315,163931,0.178584
4,Instagram,#Challenge,Shorts,Australia,1323566,136282,86979,47129,Low,0.102966,0.065716,0.035608,270390,0.204289,183411,0.138573


#### 3.2 Platform-Normalized Metrics

##### Normalize reach per platform. This accounts for the fact that, e.g., TikTok may have naturally higher views than Twitter.

In [11]:
# Views_norm = Views / mean(Views of all posts on the same Platform).
# A value of 1.0 means the post reached exactly its platform's average.
# The built-in 'mean' transform broadcasts each platform's average back onto
# its rows, so the division is one vectorized operation instead of a Python
# lambda invoked once per group.
platform_mean_views = df.groupby('Platform')['Views'].transform('mean')
df['Views_norm'] = df['Views'] / platform_mean_views
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,Total_Engagement,Engagement_Rate,Total_Engagement_wo_Shares,Engagement_Rate_wo_Shares,Views_norm
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647,411912,0.098935,358777,0.086173,1.57157
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554,308339,0.074192,242479,0.058345,1.622473
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988,402789,0.109865,363366,0.099112,1.383053
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096,175618,0.191315,163931,0.178584,0.347293
4,Instagram,#Challenge,Shorts,Australia,1323566,136282,86979,47129,Low,0.102966,0.065716,0.035608,270390,0.204289,183411,0.138573,0.516718


In [12]:
# Persist the feature-engineered table; index=False keeps the row index out
# of the written file.
output_csv = 'data/processed/feature_engineered.csv'
df.to_csv(output_csv, index=False)