### 1. Import dependencies

In [1]:
import pandas as pd
import seaborn as sns

### 2. Loading Data

In [2]:
# Read the dataset produced by the missing-value handling step and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index written out by an earlier save — confirm whether it should be dropped
# before these columns are used as model features.
df = pd.read_csv(filepath_or_buffer='data/processed/Missing_Values_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low
4,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium


### 3. Creating new features

#### 3.1 Engagement rates 

##### We calculate `engagement_rate` (likes + shares + comments per view), along with the individual like, share, and comment rates, to measure how much viewers engage with a post relative to its reach.


In [3]:
# Per-view engagement rates.
# A row with zero (or negative) Views has no meaningful rate; dividing by it
# would produce inf, which is then written to the output CSV. Mask those
# denominators to NaN so such rows surface as missing values instead.
# Rows with positive Views are computed exactly as before.
valid_views = df['Views'].where(df['Views'] > 0)

df['Like_Rate'] = df['Likes'] / valid_views
df['Share_Rate'] = df['Shares'] / valid_views
df['Comment_Rate'] = df['Comments'] / valid_views

# Combined rate: all interactions (likes + shares + comments) per view.
# Rates are not capped at 1 — a row whose Likes exceed its Views gives a value above 1.
df['engagement_rate'] = (df['Likes'] + df['Shares'] + df['Comments']) / valid_views
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,engagement_rate
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647,0.098935
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554,0.074192
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988,0.109865
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096,0.191315
4,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium,2.641769,1.072688,0.098295,3.812752


#### 3.2 Interaction Ratios (Relationships Between Actions)

##### These describe relative audience behavior and help the model understand what type of engagement dominates.


In [4]:
# Ratios between interaction types. Each denominator is shifted by 1 so a row
# with zero comments (or zero likes) does not trigger a division by zero.
#   like_to_comment_ratio = Likes  / (Comments + 1)
#   share_to_like_ratio   = Shares / (Likes + 1)
df['like_to_comment_ratio'] = df['Likes'].div(df['Comments'].add(1))
df['share_to_like_ratio'] = df['Shares'].div(df['Likes'].add(1))
df.head()


Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,engagement_rate,like_to_comment_ratio,share_to_like_ratio
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647,0.098935,17.544374,0.156541
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554,0.074192,7.901615,0.305983
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988,0.109865,9.031112,0.120507
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096,0.191315,3.453827,0.091932
4,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium,2.641769,1.072688,0.098295,3.812752,26.871727,0.406047


#### 3.3 Platform-Normalized Metrics

##### Normalize reach per platform by dividing each post's views by the mean views of its platform. This accounts for the fact that, e.g., TikTok may have naturally higher views than Twitter.

In [5]:
# Views_norm = Views / mean(Views) within the same Platform, so a value of 1.0
# means "average reach for that platform".
# The built-in 'mean' aggregation is used instead of a per-group Python lambda:
# pandas broadcasts each platform's mean back onto its rows, and the division
# is then a single vectorized operation over the whole column.
platform_mean_views = df.groupby('Platform')['Views'].transform('mean')
df['Views_norm'] = df['Views'] / platform_mean_views
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,engagement_rate,like_to_comment_ratio,share_to_like_ratio,Views_norm
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High,0.081526,0.012762,0.004647,0.098935,17.544374,0.156541,1.655443
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium,0.051791,0.015847,0.006554,0.074192,7.901615,0.305983,1.728703
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium,0.089232,0.010753,0.00988,0.109865,9.031112,0.120507,1.462971
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low,0.138488,0.012732,0.040096,0.191315,3.453827,0.091932,0.360596
4,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium,2.641769,1.072688,0.098295,3.812752,26.871727,0.406047,0.025791


In [6]:
# Persist the feature-engineered dataset; index=False keeps the DataFrame's
# row index out of the written file.
df.to_csv(path_or_buf='data/processed/feature_engineered.csv', index=False)