# Hack The Feed: Insights From Social Media Data

### 🎯 Project Brief

Playhouse Communication is one of Nigeria's leading digital marketing agencies. They combine design and media planning with cutting-edge tech solutions to reimagine what marketing is all about. Their client roster is a mix of global juggernauts and nimble SMEs, each redefining their sectors.

We are rolling out the ultimate arena for innovation in data and setting the stage for up-and-coming data scientists and analysts to showcase their skills, win huge cash prizes, and boost their careers. The "Hack the Feed" hackathon is a showdown where data analytics meets creative prowess.

Your mission? To decode a treasure trove of social media data for one of our high-profile clients and transform it into game-changing insights.

In a rare move, we're handing you the keys to a vault of exclusive social media data to let you dig deep, get creative, and strike gold with actionable insights that could redefine the future of digital marketing.  This isn't just a hackathon; it's your chance to shape the future of digital engagement. 🚀

Key Deliverables:
Participants are expected to:

    Create a comprehensive and reproducible report detailing their findings.
    Propose actionable recommendations based on the insights.
    Create a simple and engaging visualisation of your results & analysis.


Evaluation Criteria:
Submissions will be evaluated based on the following:

    Innovativeness:
        Originality and novelty of the insights.
    Actionability:
        Practicality and feasibility of the recommendations.
    Presentation Quality:
        Clarity and effectiveness in conveying findings in writing and visual form.
    Depth of Analysis:
        How thoroughly the data has been explored and understood.

In [None]:
import os
import gc
import shutil
import copy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import random
from collections import defaultdict
from typing import Union, List, Literal, Dict, Callable, Tuple, Optional
from tqdm import tqdm

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, KFold

In [None]:
import nltk
from IPython.display import display
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from collections import Counter
nltk.download('stopwords')
nltk.download('vader_lexicon')

import string
import re

In [None]:
# Switch the working directory to the data folder so the relative file names
# used by the loading cells resolve against it.
# NOTE(review): '/content/drive/...' looks like a Colab Google Drive mount —
# confirm/adjust before running in another environment.
data_dir = '/content/drive/MyDrive/Hack The Feed Hackathon'
os.chdir(data_dir)

In [None]:
# Load the four per-network "Post Performance" exports (Facebook, Twitter,
# Instagram, LinkedIn) from the current working directory.
# low_memory=False makes pandas read each file in a single pass instead of in
# chunks, which avoids per-chunk mixed-dtype inference.
fb_data = pd.read_csv('Post Performance (Stanbic IBTC) January 1, 2013 - July 13, 2023_Facebook.csv', low_memory=False)
x_data = pd.read_csv('Post Performance (Stanbic IBTC) January 1, 2013 - July 13, 2023_Twitter.csv', low_memory=False)
ig_data = pd.read_csv('Post Performance (Stanbic IBTC) January 1, 2013 - July 13, 2023_Instagram.csv', low_memory=False)
ld_data = pd.read_csv('Post Performance (Stanbic IBTC) January 1, 2013 - July 13, 2023_LinkedIn.csv', low_memory=False)

## Twitter Data

In [None]:
x_data.info()

In [None]:
# Keep only the columns in which at most half of the rows are missing.
missing_per_column = x_data.isnull().sum()
too_sparse = missing_per_column > 0.5 * x_data.shape[0]
x_cols = x_data.columns[~too_sparse.to_numpy()]

In [None]:
cleaned_data = x_data[x_cols]

In [None]:
# Bucket the text (object) columns by how many distinct values they hold:
# fewer than 5 -> low, more than 20 -> high, anything in between -> medium.
cat_cols = cleaned_data.select_dtypes(include=['object']).columns
low_cardinality_columns, medium_cardinality_columns, high_cardinality_columns = [], [], []

for col in cat_cols:
  distinct_values = cleaned_data[col].nunique()
  if distinct_values > 20:
    bucket = high_cardinality_columns
  elif distinct_values >= 5:
    bucket = medium_cardinality_columns
  else:
    bucket = low_cardinality_columns
  bucket.append(col)

In [None]:
cleaned_data[low_cardinality_columns].head(3)

 ### HIGH CARDINALITY COLUMNS

In [None]:
cleaned_data[high_cardinality_columns[:5]].head(5)

In [None]:
# Select the impression/reach metrics by their position in the
# high-cardinality column list and preview them.
# NOTE(review): positions 4-6 presumably hold "Impressions", "Organic Impressions"
# and "Potential Reach" (the names used by later cells) — confirm, because the
# slice silently changes meaning if the column order of the export changes.
impression_reach_cols = high_cardinality_columns[4:7]
cleaned_data[impression_reach_cols].head(5)

In [None]:
cleaned_data[impression_reach_cols].isnull().sum()

In [None]:
# Discard every post that is missing any impression/reach metric, then
# renumber the remaining rows from 0.
cleaned_impressions_reach = cleaned_data.dropna(subset=impression_reach_cols).reset_index(drop=True)

In [None]:
# The metrics arrive as strings with thousands separators (e.g. "1,234").
# Strip the commas and cast to int one column at a time (apply's default axis):
# this is vectorised per column instead of building a temporary Series per post.
# regex=False treats ',' as a literal on every pandas version.
cleaned_impressions_reach[impression_reach_cols] = cleaned_impressions_reach[impression_reach_cols].apply(
    lambda column: column.str.replace(',', '', regex=False).astype("int"))

In [None]:
def get_time_period(hour):
    """Map an hour of the day to 'Morning', 'Afternoon', 'Evening' or 'Night'.

    Morning is 6-11, Afternoon 12-16, Evening 17-20; every other value
    (including hours before 6 and from 21 onwards) is 'Night'.
    """
    bands = ((6, 12, 'Morning'), (12, 17, 'Afternoon'), (17, 21, 'Evening'))
    for start, end, label in bands:
        if start <= hour < end:
            return label
    return 'Night'

def get_quarter(month):
    """Map a month number to its calendar quarter label ('Q1'..'Q4').

    Months 1-3 give 'Q1', 4-6 'Q2', 7-9 'Q3'; any other value gives 'Q4'.
    """
    quarter_bounds = (('Q1', 1, 3), ('Q2', 4, 6), ('Q3', 7, 9))
    for label, first_month, last_month in quarter_bounds:
        if first_month <= month <= last_month:
            return label
    return 'Q4'

def get_season(month):
    """Map a month number to a season name.

    Months 3-5 give 'Spring', 6-8 'Summer', 9-11 'Autumn'; any other value
    (12, 1, 2 and anything out of range) gives 'Winter'.
    """
    season_bounds = (('Spring', 3, 5), ('Summer', 6, 8), ('Autumn', 9, 11))
    for season, first_month, last_month in season_bounds:
        if first_month <= month <= last_month:
            return season
    return 'Winter'

def get_day_period(day_of_week):
    """Classify a day-of-week number as 'Weekday' (0-4, Monday to Friday) or 'Weekend' (anything else)."""
    is_weekday = 0 <= day_of_week <= 4
    return 'Weekday' if is_weekday else 'Weekend'

In [None]:
# Load the holiday spreadsheet and derive a "MM-DD" key from its Date column.
# The year is dropped, so holidays are later matched on calendar day only.
holidays = pd.read_excel('NigerianHolidays.xlsx')
holidays['month-day'] = pd.to_datetime(holidays.Date).dt.strftime('%m-%d')

In [None]:
# Index the holidays by their "MM-DD" key and build two lookup dicts from that
# key: one to the holiday's Name, one to its Type.
# NOTE(review): if several rows share a month-day, only one of them survives in
# each dict (presumably the last) — confirm this is acceptable.
holidays = holidays.set_index('month-day')
holiday_names = holidays['Name'].to_dict()
holiday_types = holidays['Type'].to_dict()

In [None]:
# Parse the Date column once and derive every calendar feature from that single
# parsed Series (parsing is the expensive step; there is no need to repeat it
# for each feature).
_post_dates = pd.to_datetime(cleaned_impressions_reach.Date)
cleaned_impressions_reach['year'] = _post_dates.dt.year
cleaned_impressions_reach['month_name'] = _post_dates.dt.month_name()
cleaned_impressions_reach['month'] = _post_dates.dt.month
cleaned_impressions_reach['day_name'] = _post_dates.dt.day_name()
cleaned_impressions_reach['day'] = _post_dates.dt.day
cleaned_impressions_reach['hour'] = _post_dates.dt.hour
cleaned_impressions_reach['minute'] = _post_dates.dt.minute
cleaned_impressions_reach['day_of_week'] = _post_dates.dt.dayofweek
# "MM-DD" key used to look the post date up in the holiday dictionaries.
cleaned_impressions_reach['month-day'] = _post_dates.dt.strftime('%m-%d')

In [None]:
# Derive the bucketed calendar features: each new column is produced by
# running one helper over one numeric date-part column.
derived_features = (
    ('time_period', 'hour', get_time_period),
    ('quarter', 'month', get_quarter),
    ('season', 'month', get_season),
    ('day_period', 'day_of_week', get_day_period),
)
for feature, source, bucketer in derived_features:
    cleaned_impressions_reach[feature] = cleaned_impressions_reach[source].apply(bucketer)

# Look the "MM-DD" key up in the holiday dictionaries; dates without an entry
# get the "Regular ..." placeholders.
month_day_key = cleaned_impressions_reach['month-day']
cleaned_impressions_reach['holiday_names'] = month_day_key.map(holiday_names).fillna("Regular Day")
cleaned_impressions_reach['holiday_types'] = month_day_key.map(holiday_types).fillna("Regular Type")

In [None]:
# Turn day_name and time_period into ordered categoricals so that grouping and
# plotting follow the natural order rather than alphabetical order.
category_orders = {
    'day_name': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday'],
    'time_period': ['Morning', 'Afternoon', 'Evening', 'Night'],
}
for column, order in category_orders.items():
    cleaned_impressions_reach[column] = pd.Categorical(
        cleaned_impressions_reach[column], categories=order, ordered=True)

In [None]:
plt.style.use('seaborn-v0_8-darkgrid')

plot_cols = ['year','time_period','day_name','day_period','quarter','holiday_types']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20,15))
axs = axs.ravel()

# One panel per grouping column, showing how many posts fall in each group.
for ax, col in zip(axs, plot_cols):
  # 'Regular Type' is the placeholder used for non-holidays in holiday_types;
  # rows carrying it are excluded so that panel shows holiday posts only.
  is_regular = cleaned_impressions_reach[col] == 'Regular Type'
  posts_per_group = cleaned_impressions_reach[~is_regular].groupby(col)[['Post']].count()
  ax.plot(posts_per_group.index, posts_per_group.values, marker='o')

  ax.set_xlabel(col)
  ax.set_ylabel('Count')
  ax.set_title(f'Trend of Posts Across {col}')

plt.show()

In [None]:
cleaned_impressions_reach[impression_reach_cols].describe().T

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
ss = StandardScaler()
mm = MinMaxScaler()

In [None]:
def plot_bars(df, plot_cols, cols, scale=True, barWidth=0.25, length=20, width=15):
  """Draw one grouped bar chart per grouping column, summing `cols` within each group.

  Args:
    df: DataFrame holding every column named in `plot_cols` and `cols`.
    plot_cols: grouping columns; each one gets its own subplot.
    cols: numeric columns that are summed per group and drawn side by side.
    scale: when True, min-max scale `cols` with the module-level `mm` scaler
      before summing, so metrics of different magnitude are comparable.
    barWidth: width of a single bar.
    length, width: figure size in inches.
  """
  top_df = df.copy()
  if scale:
    # Replace the columns outright (rather than writing through .loc) so that
    # integer columns can take the scaled float values.
    top_df[cols] = mm.fit_transform(df[cols])

  # Three panels per row, at least two rows, growing when more are needed.
  n_rows = max(2, -(-len(plot_cols) // 3))
  fig, axs = plt.subplots(nrows=n_rows, ncols=3, figsize=(length, width))
  axs = axs.ravel()
  colors = ['red','green','blue','yellow','skyblue', 'olive', 'gold', 'purple', 'orange', 'brown']

  for i, column in enumerate(plot_cols):
    grouped = top_df.groupby([column])[cols].sum()
    group_positions = np.arange(len(grouped))
    for j, col in enumerate(cols):
        # Shift each series by one bar width so the bars of a group sit side by side.
        axs[i].bar(group_positions + j*barWidth, grouped[col], color=colors[j % len(colors)],
                   width=barWidth, edgecolor='grey', label=col)

    axs[i].set_xlabel(column, fontweight='bold')
    # Put the tick under the middle of each group, whatever the number of series.
    axs[i].set_xticks(group_positions + barWidth * (len(cols) - 1) / 2, grouped.index)
    axs[i].legend()

  # Hide panels that received no chart.
  for spare_axis in axs[len(plot_cols):]:
    spare_axis.set_visible(False)

  plt.show()



In [None]:
def plot_bars_quantile(df, plot_cols, cols, quantile=0.99, scale=True, barWidth=0.25, length=20, width=15):
  """Like a grouped bar chart per grouping column, restricted to the top-performing posts.

  Only rows whose value in the LAST column of `cols` lies strictly above the
  given quantile of that column are kept; `cols` are then summed per group and
  the groups are ordered by that last column, largest first.

  Args:
    df: DataFrame holding every column named in `plot_cols` and `cols`.
    plot_cols: grouping columns; each one gets its own subplot.
    cols: numeric columns drawn side by side; the last one drives the
      quantile filter and the ordering.
    quantile: quantile threshold in [0, 1].
    scale: when True, min-max scale `cols` (on the filtered rows) with the
      module-level `mm` scaler before summing.
    barWidth: width of a single bar.
    length, width: figure size in inches.
  """
  last_quantile = df[cols[-1]].quantile(quantile)
  top_df = df[df[cols[-1]] > last_quantile].copy()
  if scale and not top_df.empty:
    # Replace the columns outright (rather than writing through .loc) so that
    # integer columns can take the scaled float values.
    top_df[cols] = mm.fit_transform(top_df[cols])

  # Three panels per row, at least two rows, growing when more are needed.
  n_rows = max(2, -(-len(plot_cols) // 3))
  fig, axs = plt.subplots(nrows=n_rows, ncols=3, figsize=(length, width))
  axs = axs.ravel()
  colors = ['red','green','blue','yellow','skyblue', 'olive', 'gold', 'purple', 'orange', 'brown']

  for i, column in enumerate(plot_cols):
    grouped = top_df.groupby([column])[cols].sum().sort_values(by=[cols[-1]], ascending=False)
    group_positions = np.arange(len(grouped))
    for j, col in enumerate(cols):
        # Shift each series by one bar width so the bars of a group sit side by side.
        axs[i].bar(group_positions + j*barWidth, grouped[col], color=colors[j % len(colors)],
                   width=barWidth, edgecolor='grey', label=col)

    axs[i].set_xlabel(column, fontweight='bold')
    # Put the tick under the middle of each group, whatever the number of series.
    axs[i].set_xticks(group_positions + barWidth * (len(cols) - 1) / 2, grouped.index)
    axs[i].legend()

  # Hide panels that received no chart.
  for spare_axis in axs[len(plot_cols):]:
    spare_axis.set_visible(False)

  plt.show()



In [None]:
# Call the function
plot_cols = ['year','time_period','day_name','day_period','quarter','holiday_types']
plot_bars_quantile(cleaned_impressions_reach, plot_cols, impression_reach_cols, quantile=0.99)

In [None]:
# Engagement metrics, selected by position in the high-cardinality column list.
engage_rls_cols = high_cardinality_columns[8:12]
# Strip thousands separators and cast to int one column at a time (apply's
# default axis) instead of building a temporary Series per post.
cleaned_impressions_reach[engage_rls_cols] = cleaned_impressions_reach[engage_rls_cols].apply(
    lambda column: column.str.replace(',', '', regex=False).astype("int"))

In [None]:
cleaned_impressions_reach[engage_rls_cols].head(5)

In [None]:
# Engagement rate as a percentage: engagements per impression times 100,
# stored under 'Engagement Rate (per Impression)'.
# NOTE(review): a post with 0 impressions would yield inf/NaN here — confirm
# none exist, or mask them before averaging/correlating.
cleaned_impressions_reach['Engagement Rate (per Impression)'] = (cleaned_impressions_reach['Engagements'] / cleaned_impressions_reach['Impressions']) * 100

In [None]:
cleaned_impressions_reach[engage_rls_cols].describe().T

In [None]:
plot_bars_quantile(cleaned_impressions_reach, plot_cols, engage_rls_cols[::-1], quantile=0.99, scale=False, barWidth=0.15)

In [None]:
click_cols = high_cardinality_columns[12:16]
cleaned_impressions_reach[click_cols].head(5)

In [None]:
# Missing click counts are treated as zero clicks.
cleaned_impressions_reach[click_cols] = cleaned_impressions_reach[click_cols].fillna("0")
# Strip thousands separators and cast to int one column at a time (apply's
# default axis) instead of building a temporary Series per post.
cleaned_impressions_reach[click_cols] = cleaned_impressions_reach[click_cols].apply(
    lambda column: column.str.replace(',', '', regex=False).astype("int"))

In [None]:
cleaned_impressions_reach[click_cols].describe().T

In [None]:
click_cols = ['Post Detail Expand Clicks','Post Media Clicks', 'Other Post Clicks','Post Clicks (All)']

In [None]:
plot_bars_quantile(cleaned_impressions_reach, plot_cols, click_cols, quantile=0.99, scale=False, barWidth=0.15)

#### WHOLE DATASET

In [None]:
plot_bars(cleaned_impressions_reach, plot_cols, impression_reach_cols)

In [None]:
plot_bars(cleaned_impressions_reach, plot_cols, engage_rls_cols, scale=False, barWidth=0.15)

In [None]:
plot_bars(cleaned_impressions_reach, plot_cols, click_cols, scale=False, barWidth=0.15)

In [None]:
def heatmap(df, length=10, width=5, cmap='rocket'):
  """Show an annotated heatmap of the pairwise correlations of df's numeric columns.

  Args:
    df: DataFrame whose numeric columns are correlated.
    length, width: figure size in inches.
    cmap: colour map name handed to seaborn.
  """
  correlations = df.corr(numeric_only=True)
  _, axis = plt.subplots(figsize=(length, width))
  sns.heatmap(correlations, annot=True, cmap=cmap, ax=axis)
  plt.show()


In [None]:
corr_matrix = cleaned_impressions_reach[high_cardinality_columns[4:16]]
heatmap(corr_matrix, length=15, width=10)

In [None]:
def time_series_analysis(df, metrics, duration, fill=False, length=15, width=10):
  """Plot the mean of each metric per resampling period on a shared time axis.

  Args:
    df: DataFrame with a 'Date' column and one column per metric; it is not modified.
    metrics: names of the columns to plot.
    duration: pandas resampling rule (the callers use 'Y', 'M' and '7D').
    fill: when True, shade the band between each curve and its mirror image
      below zero and draw a thin zero line.
    length, width: figure size in inches.
  """
  indexed = df.copy()
  indexed['Date'] = pd.to_datetime(indexed['Date'])
  indexed = indexed.set_index('Date')

  fig, ax = plt.subplots(figsize=(length, width), dpi=120)

  for metric in metrics:
    period_mean = indexed[metric].resample(duration).mean()
    period_mean.plot(ax=ax, label=metric)
    if not fill:
      continue
    # Mirror the curve around zero to get a symmetric shaded band.
    ax.fill_between(period_mean.index, y1=period_mean.values, y2=-period_mean.values,
                    alpha=0.5, linewidth=2, color='seagreen')

  ax.set_xlabel('Time')
  ax.set_ylabel('Value')
  ax.set_title('Trend Analysis')
  ax.legend()

  if fill:
    first_date, last_date = np.min(indexed.index), np.max(indexed.index)
    ax.hlines(y=0, xmin=first_date, xmax=last_date, linewidth=.5)

  plt.show()

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Impressions","Organic Impressions"],
                     duration='Y',length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Potential Reach"],
                     duration='Y',length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     engage_rls_cols,
                     duration='Y',length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     click_cols,
                     duration='Y',length=12,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Impressions","Organic Impressions"],
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Potential Reach"],
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     engage_rls_cols,
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     click_cols,
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Impressions","Organic Impressions"],
                     duration='7D', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ["Potential Reach"],
                     duration='7D', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     engage_rls_cols,
                     duration='7D', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     click_cols,
                     duration='7D', fill=True,length=16,width=5)

In [None]:
medium_cardinality_columns

In [None]:
# A 'Sent by' value consisting of a single space means the sender was not
# recorded; label those posts 'Unknown' and leave every other value untouched.
cleaned_impressions_reach['Sent by'] = cleaned_impressions_reach['Sent by'].replace(' ', 'Unknown')

In [None]:
df_grouped = cleaned_impressions_reach.groupby('Sent by')[['Post']].count().sort_values(by=['Post'],ascending=False)
df_grouped

In [None]:
df_grouped = cleaned_impressions_reach.groupby('Sent by')[engage_rls_cols].sum().sort_values(by=['Engagements'],ascending=False)
df_grouped

### Numerical Columns

In [None]:
num_cols = list(cleaned_data.select_dtypes(include=['float64']).columns)
num_cols

In [None]:
cleaned_impressions_reach[num_cols].head(5)

In [None]:
corr_matrix = cleaned_impressions_reach[impression_reach_cols[:2]+engage_rls_cols[:3]+click_cols+num_cols[:4]+num_cols[6:]]
heatmap(corr_matrix,length=25, width=20)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ['Comments'],
                     duration='Y', fill=False,length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ['Comments'],
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     ['Comments'],
                     duration='7D', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[1:3],
                     duration='Y', fill=False,length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[1:3],
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[1:3],
                     duration='7D', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[6:],
                     duration='Y', fill=False,length=8,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[6:],
                     duration='M', fill=True,length=16,width=5)

In [None]:
time_series_analysis(cleaned_impressions_reach,
                     num_cols[6:],
                     duration='7D', fill=True,length=16,width=5)

### Posts

In [None]:
data = cleaned_impressions_reach.copy()
data[['Post']].head(5)

#### Preprocessing

In [None]:
def remove_url(x):
    """Return str(x) with every run starting at 'http' up to the next whitespace removed."""
    return re.sub(r'http\S+', '', str(x))


def to_lower(x):
    """Return the lower-cased version of the string x."""
    return x.lower()


def remove_puncs(x):
    """Return x with every character of string.punctuation deleted."""
    return x.translate(str.maketrans('', '', string.punctuation))

# Extra stop words added on top of NLTK's English list (common pronouns,
# auxiliaries, prepositions and similar function words).
more_words=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him",
            "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
            "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then",
            "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some",
            "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
            "should", "now"]

stop_words=set(stopwords.words('english')) # English stop words shipped with NLTK
stop_words.update(more_words)

# Drop stop words from a string: split on whitespace, keep the words not in
# stop_words, and re-join them with single spaces (str.join is a built-in str method).
remove_words=lambda x: ' '.join([word for word in x.split() if word not in stop_words])

def preprocess_text(texts):
  """Run the basic cleaning steps over a Series of posts and return the cleaned Series.

  The steps are applied element-wise in this order: remove URLs, lower-case,
  strip punctuation, remove stop words.
  """
  for cleaning_step in (remove_url, to_lower, remove_puncs, remove_words):
    texts = texts.apply(cleaning_step)
  return texts

def clean_text(text):
    """Strip markup-like noise from a single post.

    In order, removes: text in square brackets, links (http(s)://... and
    www....), angle-bracket tags, punctuation, newline characters, and any
    word that contains a digit. Removed pieces are deleted without inserting
    a replacement, so surrounding whitespace is left as it was.

    The patterns are raw strings so that regex escapes such as ``\\S`` or
    ``\\d`` are not parsed as (invalid) Python string escapes.
    """
    text = re.sub(r'\[.*?\]', '', text)               # [bracketed] text
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # links
    text = re.sub(r'<.*?>+', '', text)                 # <tags>
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)                     # newlines
    text = re.sub(r'\w*\d\w*', '', text)               # words containing digits
    return text

# function to remove emoticons, symbols or flags by their codes
# Emoji and pictographic symbol ranges. Each range is limited to symbol blocks;
# a single catch-all range would also span the CJK, Hangul and other letter
# blocks and delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F000-\U0001FAFF"  # supplementary-plane pictographs: emoticons, symbols, transport, flags, enclosed characters
                            "\U000025A0-\U000027BF"  # geometric shapes, miscellaneous symbols, dingbats
                            "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, circles, ...)
                            "\U000024C2"             # circled letter M
                            "\U0000FE0F"             # variation selector-16 (emoji presentation)
                            "]+", flags=re.UNICODE)


# function to remove emoticons, symbols or flags by their codes
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed; letters of any script are kept."""
    return _EMOJI_PATTERN.sub(r'', text)


In [None]:
# Full cleaning pipeline for the post text: basic preprocessing first, then
# markup/number stripping, then emoji removal.
cleaned_posts = preprocess_text(data['Post']).apply(clean_text).apply(remove_emoji)

In [None]:
filtered_data = data.copy()
filtered_data['Post'] = cleaned_posts

In [None]:
# Flatten the cleaned posts into one list of words.
words_list = []
for line in cleaned_posts:
    words_list.extend(line.split())

# Data frame and bar chart of the 50 most common words with their frequency.
word_counts = Counter(words_list).most_common(50)
word_df = pd.DataFrame(word_counts)
word_df.columns = ['word', 'frq']
display(word_df.head(5))

fig = plt.figure(figsize=(15, 7))

plt.bar(word_df['word'], word_df['frq'])
plt.xticks(rotation=90)
plt.xlabel('word')
plt.ylabel('frq')
plt.title('Most common words')
plt.show()

The chart shows the 50 most frequent words in our dataset, including `itcanbe` (a hashtag), `get`, `stanbic`, `ibtc`, `us`, `visit`, `email`, `call`, `click`, and so on.

In [None]:
from wordcloud import WordCloud

In [None]:
# Build one word cloud from all cleaned posts joined into a single string.
cut_text = " ".join(filtered_data['Post'])
max_words=100
# The cloud ignores the same stop words used during cleaning and shows at most
# max_words words; random_state pins the layout so reruns look the same.
word_cloud = WordCloud(
                    background_color='white',
                    stopwords=set(stop_words),
                    max_words=max_words,
                    max_font_size=30,
                    scale=5,
                    colormap='magma',
                    random_state=1).generate(cut_text)
fig = plt.figure(1, figsize=(10,10))
plt.axis('off')
plt.title('Word Cloud for Top '+str(max_words)+' words from Twitter Posts\n', fontsize=10,color='blue')
# NOTE(review): top=2.3 lies outside the usual 0-1 figure fraction; presumably
# used to enlarge the rendered image — confirm intent.
fig.subplots_adjust(top=2.3)
plt.imshow(word_cloud)
plt.show()

Annotations/Sentiment Analysis

In [None]:
# Score every cleaned post with NLTK's VADER analyzer; each element of
# sentiment_scores is the result returned by polarity_scores for one post.
sid=SentimentIntensityAnalyzer()
ps=lambda x:sid.polarity_scores(x)
sentiment_scores=filtered_data['Post'].apply(ps)

In [None]:
# One row per post holding the polarity scores produced by the analyzer.
sentiment_df = pd.DataFrame(data=list(sentiment_scores))


def labelize(x):
    """Turn a compound score into 'neutral' (exactly 0), 'positive' (> 0) or 'negative'."""
    if x == 0:
        return 'neutral'
    return 'positive' if x > 0 else 'negative'


sentiment_df['sentiment_label'] = sentiment_df.compound.apply(labelize)

# Attach the label to the posts; the join is on the row index.
filtered_data = filtered_data.join(sentiment_df['sentiment_label'])

In [None]:
filtered_data['sentiment_label'].value_counts().plot(kind='barh',title="Bar Plot of Posts Sentiments");

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import scipy.stats as stats

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

In [None]:
def topic_modelling(text, n=10, n_components=8):
  """Fit a bag-of-words LDA topic model and print the top words of every topic.

  Args:
    text: iterable of documents (strings).
    n: number of top words reported per topic.
    n_components: number of topics to fit (8 by default).

  Returns:
    A tuple (fitted CountVectorizer, fitted LatentDirichletAllocation,
    topic_summaries) where topic_summaries holds, per topic, its top `n`
    words joined by spaces, strongest word first.
  """
  # Ignore words present in more than 95% of documents or in fewer than 2.
  cvectorizer = CountVectorizer(max_df=0.95, min_df=2,stop_words='english',decode_error='ignore')
  lda_model = LatentDirichletAllocation(n_components=n_components,learning_method='online',max_iter=20,random_state=42)
  # Only the fitted model is needed here, so fit without also transforming the corpus.
  lda_model.fit(cvectorizer.fit_transform(text))

  vocab = np.array(cvectorizer.get_feature_names_out())
  topic_summaries = []

  for i, topic_dist in enumerate(lda_model.components_):
    # argsort is ascending; the reversed tail slice yields the n heaviest words.
    topic_words = vocab[np.argsort(topic_dist)][:-(n+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))

  return cvectorizer, lda_model, topic_summaries

In [None]:
## Get LDA Topics

vectorizer, lda_model, topic_summaries = topic_modelling(filtered_data['Post'], n=15)



- Topic 0: **Promotions and Contests**: This topic seems to be about various offers and incentives that the bank provides to its customers, such as winning prizes, watching sessions, and using easewallet.
- Topic 1: **Nigeria and Africa**: This topic seems to be about the bank's presence and impact in Nigeria and Africa, as well as its involvement in the local economy, industry, and society.
- Topic 2: **Email and Investment**: This topic seems to be about the bank's email communication and investment services, such as sending details, visiting websites, and providing education and insurance.
- Topic 3: **App and Mobile Banking**: This topic seems to be about the bank's app and mobile banking features, such as downloading, moving forward, and opening accounts.
- Topic 4: **Live Events and Online Discussions**: This topic seems to be about the bank's participation and hosting of live events and online discussions, such as smwlagos, motivationmonday, and business series.
- Topic 5: **Pension and Savings**: This topic seems to be about the bank's pension and savings products, such as helping customers make their dreams come true, dialing for airtime, and visiting for more information.
- Topic 6: **Card and Women Empowerment**: This topic seems to be about the bank's card services and women empowerment initiatives, such as paying with cards, emailing for details, and supporting women's development.
- Topic 7: **Registration and Social Media**: This topic seems to be about the bank's registration process and social media presence, such as registering for events, saving for needs, and sharing top reads.

In [None]:
# Human-readable name for each LDA topic. The list is indexed by topic number,
# so its order must match the topic order printed by the model (Topic 0 first).
topic_labels = ['Promotions and Contests','Nigeria and Africa','Email and Investment','App and Mobile Banking',
                'Live Events and Online Discussions','Pension and Savings','Card and Women Empowerment','Registration and Social Media']

def assign_topic_to_text(text, cvectorizer, lda_model, topic_labels):
    """Return the label of the topic that the fitted LDA model weights highest for one text.

    Args:
        text: a single document (string).
        cvectorizer: fitted vectorizer used to turn the text into word counts.
        lda_model: fitted topic model exposing `transform`.
        topic_labels: sequence of labels indexed by topic number.
    """
    bag_of_words = cvectorizer.transform([text])
    topic_weights = lda_model.transform(bag_of_words)[0]
    return topic_labels[np.argmax(topic_weights)]

# Label every post with its dominant topic. All posts are vectorised and scored
# in one batch: each row of the result is one post's topic distribution, and
# the index of its largest entry selects the label. This avoids one
# vectorise + transform call per post.
_topic_weights = lda_model.transform(vectorizer.transform(filtered_data['Post']))
filtered_data['topic_label'] = [topic_labels[best_topic] for best_topic in _topic_weights.argmax(axis=1)]

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
def named_entity_recognition(text):
  """Return the entity-type label of every named entity spaCy finds in `text`, in document order."""
  return [entity.label_ for entity in nlp(text).ents]

def part_of_speech_tagging(text):
  """Return the coarse part-of-speech tag of every token spaCy finds in `text`, in document order."""
  return [token.pos_ for token in nlp(text)]

# Run the spaCy pipeline once per post and read both the entity labels and the
# part-of-speech tags off the same parsed document, instead of parsing every
# post twice. nlp.pipe streams the posts through the pipeline in batches.
_ner_labels, _pos_labels = [], []
for _doc in nlp.pipe(filtered_data['Post']):
    _ner_labels.append([entity.label_ for entity in _doc.ents])
    _pos_labels.append([token.pos_ for token in _doc])

filtered_data['ner_label'] = _ner_labels
filtered_data['pos_label'] = _pos_labels

In [None]:
# Number of named entities and number of tokens found in each post.
filtered_data['ner_count'] = filtered_data['ner_label'].apply(len)
filtered_data['pos_count'] = filtered_data['pos_label'].apply(len)

In [None]:
# Per post, count how often each entity type of interest occurs; the result
# goes into one '<TAG>_count' column per type.
ner_tags = ['CARDINAL','DATE','GPE','LOC','MONEY','ORDINAL','ORG','PERSON','TIME']
for tag in ner_tags:
  filtered_data[tag + '_count'] = [labels.count(tag) for labels in filtered_data['ner_label']]

In [None]:
# Per post, count how often each part-of-speech tag of interest occurs; the
# result goes into one '<TAG>_count' column per tag.
pos_tags = ['VERB', 'ADV', 'ADJ', 'NUM', 'NOUN', 'SPACE', 'PROPN']
for tag in pos_tags:
    filtered_data[tag + '_count'] = [labels.count(tag) for labels in filtered_data['pos_label']]

### Let's Delve in for Deeper Analysis

Let's see the distribution of words for each year

In [None]:
def get_top_n_words(cleaned_posts, ax, n:int=20):
    """Draw a bar chart of the `n` most frequent words on the given axes.

    Args:
        cleaned_posts: iterable of already-cleaned post strings; words are
            obtained by splitting each post on whitespace.
        ax: matplotlib axes to draw on.
        n: number of words to show.
    """
    word_counts = Counter(word for line in cleaned_posts for word in line.split()).most_common(n)
    # Naming the columns in the constructor also works when there are no words at all.
    word_df = pd.DataFrame(word_counts, columns=['word', 'frq'])

    ax.bar(word_df['word'], word_df['frq'])
    # Rotate the tick labels created by the bar call; this leaves tick
    # positions and label texts as matplotlib placed them.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('word')
    ax.set_ylabel('frq')

def word_cloud(texts, ax, n:int=100):
    """Render a word cloud of at most `n` words from `texts` on the given axes.

    Args:
        texts: iterable of strings; they are joined with spaces into one text.
        ax: matplotlib axes to draw on (its axis frame is switched off).
        n: maximum number of words in the cloud.
    """
    cloud_builder = WordCloud(
        background_color='white',
        stopwords=set(stop_words),
        max_words=n,
        max_font_size=30,
        scale=1,
        colormap='magma',
        random_state=42)
    rendered_cloud = cloud_builder.generate(" ".join(texts))
    ax.axis('off')
    ax.imshow(rendered_cloud)

def sentiment_analyzer(text, ax):
    """Plot, on `ax`, how many posts VADER rates positive, negative or neutral.

    Args:
        text: Series of post strings.
        ax: matplotlib axes that receives the horizontal bar chart.
    """
    analyzer = SentimentIntensityAnalyzer()
    scores = pd.DataFrame(data=[analyzer.polarity_scores(post) for post in text])

    def to_label(compound):
        # Exactly 0 is neutral; the sign decides between positive and negative.
        if compound == 0:
            return 'neutral'
        return 'positive' if compound > 0 else 'negative'

    scores['label'] = scores.compound.apply(to_label)
    scores['label'].value_counts().plot(kind='barh', ax=ax)

In [None]:
def visualize_data(df, column, column_value, metric="Impression"):
    """Show top words, a word cloud and the sentiment split for one slice of the posts.

    Args:
        df: DataFrame with a 'Post' column (already restricted to the posts
            of interest, e.g. the top 1% by some metric).
        column: column used to pick the slice.
        column_value: rows where `column` equals this value are visualised.
        metric: name of the metric the top 1% refers to; only used in the
            panel titles.
    """
    subset = df[df[column] == column_value]
    if subset.empty:
        # Nothing to count or draw; the plotting helpers cannot handle an empty slice.
        print(f"No posts where {column} == {column_value!r}; nothing to plot.")
        return

    fig, axs = plt.subplots(2, 2, figsize=(15, 10))

    get_top_n_words(subset['Post'].values, axs[0, 0], n=10)
    axs[0, 0].set_title(f"Top 10 Words in the Top 1% {metric} {column_value}")

    word_cloud(subset['Post'].values, axs[0, 1], n=50)
    axs[0, 1].set_title(f"Word Cloud for Top 50 words in the Top 1% {metric} {column_value} period")

    sentiment_analyzer(subset['Post'], axs[1, 1])
    axs[1, 1].set_title(f"Sentiment Analyzer of Tweets in the Top 1% {metric} {column_value} period")

    # The bottom-left panel is not used; hide it instead of showing empty axes.
    axs[1, 0].set_visible(False)

    plt.tight_layout()
    plt.show()


def plot_pos_labels(df, column, pos_tags, rows=2, cols=3, plot=False):
    """Summarise part-of-speech tag counts per group of `column`.

    Args:
        df: DataFrame with one '<TAG>_count' column per tag plus 'pos_count'.
        column: grouping column.
        pos_tags: tags whose '<TAG>_count' columns are summed per group.
        rows, cols: subplot grid used when plotting; rows grow automatically
            if there are more groups than grid cells.
        plot: when False, display a table of per-group sums ordered by
            'pos_count' (largest first); when True, draw one pie chart per group.
    """
    pos_cols = [f"{pos}_count" for pos in pos_tags]

    if not plot:
        display(df.groupby(column)[pos_cols+['pos_count']].sum().sort_values(by=['pos_count'],ascending=False))
        return

    df_grouped = df.groupby(column)[pos_cols].sum()
    # A group whose counts are all zero has no shares to draw as a pie.
    df_grouped = df_grouped[df_grouped.sum(axis=1) > 0]

    # Make sure every group gets a panel.
    if len(df_grouped) > rows * cols:
        rows = -(-len(df_grouped) // cols)

    fig, axs = plt.subplots(rows, cols, figsize=(15, 10))
    fig.patch.set_facecolor('black')

    # atleast_1d also covers a 1x1 grid, where subplots returns a single Axes.
    axs = np.atleast_1d(axs).flatten()

    for ax, (group, row) in zip(axs, df_grouped.iterrows()):
        ax.pie(row, labels=row.index, autopct='%1.1f%%', colors=['#ff9999','#66b3ff','#99ff99','#ffcc99'], textprops={'color':'#ffffff'})
        ax.set_title(f'POS Tag Distribution in {group}', color='white')

    # Hide panels that received no pie.
    for spare_axis in axs[len(df_grouped):]:
        spare_axis.set_visible(False)

    plt.tight_layout()
    plt.show()

def plot_ner_labels(df, column, ner_tags, rows=2, cols=3, plot=False):
    """Summarise named-entity tag counts per group of `column`.

    Args:
        df: DataFrame with one '<TAG>_count' column per tag plus 'ner_count'.
        column: grouping column.
        ner_tags: tags whose '<TAG>_count' columns are summed per group.
        rows, cols: subplot grid used when plotting; rows grow automatically
            if there are more groups than grid cells.
        plot: when False, display a table of per-group sums ordered by
            'ner_count' (largest first); when True, draw one pie chart per group.
    """
    ner_cols = [f"{ner}_count" for ner in ner_tags]

    if not plot:
        display(df.groupby(column)[ner_cols+['ner_count']].sum().sort_values(by=['ner_count'],ascending=False))
        return

    df_grouped = df.groupby(column)[ner_cols].sum()
    # A group whose counts are all zero has no shares to draw as a pie.
    df_grouped = df_grouped[df_grouped.sum(axis=1) > 0]

    # Make sure every group gets a panel.
    if len(df_grouped) > rows * cols:
        rows = -(-len(df_grouped) // cols)

    fig, axs = plt.subplots(rows, cols, figsize=(15, 10))
    fig.patch.set_facecolor('black')

    # atleast_1d also covers a 1x1 grid, where subplots returns a single Axes.
    axs = np.atleast_1d(axs).flatten()

    for ax, (group, row) in zip(axs, df_grouped.iterrows()):
        ax.pie(row, labels=row.index, autopct='%1.1f%%', colors=['#ff9999','#66b3ff','#99ff99','#ffcc99'], textprops={'color':'#ffffff'})
        ax.set_title(f'NER Tag Distribution in {group}', color='white')

    # Hide panels that received no pie.
    for spare_axis in axs[len(df_grouped):]:
        spare_axis.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
def visualize_labels(df, column, column_values, colors, rows=1, cols=3, length=15, width=10):
  """Draw, for each requested value of `column`, a bar chart of topic-label counts.

  Args:
    df: DataFrame with a 'topic_label' column and the grouping `column`.
    column: column used to select the posts of each panel.
    column_values: values of `column` to visualise, one panel each.
    colors: bar colours, one per panel (reused cyclically if there are fewer).
    rows, cols: subplot grid; it must offer at least len(column_values) panels.
    length, width: figure size in inches.
  """
  fig, axs = plt.subplots(rows, cols, figsize=(length, width))
  # atleast_1d also covers a 1x1 grid, where subplots returns a single Axes.
  axs = np.atleast_1d(axs).ravel()

  for i, value in enumerate(column_values):
    topic_counts = df.loc[df[column] == value, 'topic_label'].value_counts()
    # An empty selection cannot be drawn as a bar chart; keep the titled, empty panel.
    if not topic_counts.empty:
      topic_counts.plot(kind='barh', color=colors[i % len(colors)], ax=axs[i])
    axs[i].set_title(f'{value} Topic Labels')

  # Hide panels that received no chart.
  for spare_axis in axs[len(column_values):]:
    spare_axis.set_visible(False)

  plt.tight_layout()
  plt.show()


In [None]:
def barplot_labels(data, plotted_cols, colors, rows=2, cols=4, length=25, width=20):
  """Draw, per metric, a horizontal bar chart of its mean value for each topic label.

  Args:
    data: DataFrame with a 'topic_label' column and the metric columns.
    plotted_cols: metric columns, one panel each.
    colors: bar colours, one per panel.
    rows, cols: subplot grid.
    length, width: figure size in inches.
  """
  fig, axs = plt.subplots(nrows=rows, ncols=cols, figsize=(length, width))
  panels = axs.ravel()
  by_topic = data.groupby(['topic_label'])

  for position, metric in enumerate(plotted_cols):
    # Smallest mean first, so the largest bar ends up at the top of the panel.
    topic_means = by_topic[metric].mean().sort_values(ascending=True)
    panel = panels[position]
    panel.barh(topic_means.index, topic_means.values, color=colors[position])

    panel.set_xlabel(metric)
    panel.set_ylabel('Topic Label')
    panel.set_title(f'Distribution of Twitter {metric} across Topic Labels', fontsize=10)

  plt.tight_layout()
  plt.show()

In [None]:
def stack_barchart(data, plot_col, length=15, width=10):
  """Draw a stacked bar chart of post counts per topic label, stacked by `plot_col`.

  Args:
    data: DataFrame with a 'topic_label' column and the `plot_col` column.
    plot_col: column whose values form the stacked segments (and the legend).
    length, width: figure size in inches.
  """
  # Count the posts for every (plot_col value, topic label) pair.
  counts = data.groupby([plot_col, 'topic_label'])[['topic_label']].count().rename(columns={'topic_label':'topic_count'})
  counts = counts.reset_index()

  # One row per topic label, one column per plot_col value; missing pairs count as 0.
  pivot_df = counts.pivot(index='topic_label', columns=plot_col, values='topic_count').fillna(0)

  # Draw on one explicitly created figure so no separate empty figure is
  # opened, and so the requested size is actually applied.
  fig, ax = plt.subplots(figsize=(length, width))
  pivot_df.plot(kind='bar', stacked=True, ax=ax)

  ax.set_xlabel('Topic Label')
  ax.set_ylabel('Frequency')
  ax.set_title(f'Distribution of Topic Labels Across Each {plot_col}')
  ax.legend(title=plot_col)

  plt.show()


In [None]:
colors = ['skyblue', 'olive', 'gold', 'purple', 'red', 'green', 'orange', 'brown']

#### Distribution of Whole Dataset

In [None]:
filtered_data['topic_label'].value_counts().plot(kind='bar',title="Distribution of Topic Models");

In [None]:
pos_cols = [f"{pos}_count" for pos in pos_tags]
ner_cols = [f"{ner}_count" for ner in ner_tags]

filtered_data.groupby(['topic_label'])[pos_cols+['pos_count']].mean().sort_values(by=['pos_count'],ascending=False)

In [None]:
filtered_data.groupby(['topic_label'])[ner_cols+['ner_count']].mean().sort_values(by=['ner_count'],ascending=False)

In [None]:
barplot_labels(filtered_data, impression_reach_cols, colors,rows=1,cols=3,length=15,width=10)

In [None]:
barplot_labels(filtered_data, engage_rls_cols, colors,rows=2,cols=2,length=15,width=10)

In [None]:
barplot_labels(filtered_data, click_cols, colors,rows=2,cols=2,length=15,width=10)

In [None]:
nm_cols = num_cols[:3]+num_cols[-2:]
barplot_labels(filtered_data, nm_cols, colors,rows=3,cols=2,length=20,width=15)

In [None]:
stack_barchart(filtered_data, "year")

In [None]:
stack_barchart(filtered_data, "time_period")

In [None]:
stack_barchart(filtered_data, "day_name")

In [None]:
stack_barchart(filtered_data, "day_period")

In [None]:
stack_barchart(filtered_data, "quarter")

In [None]:
wrt = filtered_data[filtered_data['holiday_types']!='Regular Type']
stack_barchart(wrt, "holiday_types")

#### Top 1% Impressions

In [None]:
# Posts whose Impressions lie in the top 1%.
last_quantile = filtered_data['Impressions'].quantile(0.99)
# Build the mask from filtered_data itself, so the selection never depends on
# another DataFrame happening to share the same row index.
df = filtered_data[filtered_data['Impressions'] > last_quantile]

visualize_data(df, "year", 2017)

In [None]:
visualize_data(df, "year", 2016)

In [None]:
visualize_data(df, "year", 2022)

In [None]:
visualize_labels(df, "year", [2017, 2016, 2022], colors, rows=2, cols=2, length=15, width=10)

In [None]:
plot_pos_labels(df, "year",  pos_tags, plot=True)

In [None]:
plot_ner_labels(df, "year",  ner_tags, plot=False)

In [None]:
visualize_data(df, "time_period", "Morning")

In [None]:
visualize_data(df, "time_period", "Afternoon")

In [None]:
visualize_data(df, "time_period", "Evening")

In [None]:
visualize_labels(df, "time_period", ["Morning", "Afternoon", "Evening"], colors, length=15, width=10)

In [None]:
plot_pos_labels(df, "time_period", pos_tags, rows=2, cols=2, plot=True)

In [None]:
plot_ner_labels(df, "time_period", ner_tags, plot=False)

In [None]:
visualize_data(df, "day_name", "Wednesday")

In [None]:
visualize_data(df, "day_name", "Thursday")

In [None]:
visualize_data(df, "day_name", "Tuesday")

In [None]:
visualize_labels(df, "day_name", ["Wednesday","Thursday", "Tuesday", "Monday"], colors, rows=2, cols=2, length=25, width=20)

In [None]:
plot_pos_labels(df, "day_name", pos_tags, rows=2, cols=3, plot=True)

In [None]:
plot_ner_labels(df, "day_name", ner_tags, plot=False)

#### Top 1% Engagements

In [None]:
# Posts whose Engagements lie in the top 1%.
last_quantile = filtered_data['Engagements'].quantile(0.99)
# Build the mask from filtered_data itself, so the selection never depends on
# another DataFrame happening to share the same row index.
df = filtered_data[filtered_data['Engagements'] > last_quantile]

visualize_data(df, "year", 2020)

In [None]:
visualize_data(df, "year", 2019)

In [None]:
visualize_data(df, "year", 2023)

In [None]:
visualize_labels(df, "year", [2020, 2019, 2023], colors, rows=1, cols=3, length=15, width=10)