## Main Analysis

### Set up environment and load data

In [None]:
pip install transformers sentencepiece accelerate safetensors datasets torchvision beautifulsoup4

In [2]:
import pandas as pd
import numpy as np
from transformers import pipeline 
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
import pandas as pd

# Load data collected in R (wherever it was saved, either Google Drive or locally)

# Drop from each user timeline every post whose id also appears in the
# matching low-trust tweet set, to avoid leakage between the two sets
user_timelines_covid = user_timelines_covid.loc[
    ~user_timelines_covid['id'].isin(covid_lowtrust_tweets['id'])
]
user_timelines_climate = user_timelines_climate.loc[
    ~user_timelines_climate['id'].isin(climate_lowtrust_tweets['id'])
]

###  Stratification I - Sentiment Classification

#### Load FLAN T5-XXL 

In [None]:
# Fetch the FLAN-T5 XXL tokenizer and weights from HuggingFace
from transformers import T5ForConditionalGeneration, T5Tokenizer

FLAN_T5_CHECKPOINT = "google/flan-t5-xxl"

tokenizer = T5Tokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(
    FLAN_T5_CHECKPOINT,
    device_map="auto",
)

#### Check Model Accuracy using Sentiment140 Dataset


In [None]:
from datasets import load_dataset 
import pandas as pd

# Sentiment140 encodes sentiment numerically (0 is negative, 2 is neutral and
# 4 is positive); this table translates the codes into text labels
sentiment_mapping = {
    0: 'negative',
    2: 'neutral',
    4: 'positive'
}

# Download the benchmarking dataset and keep its 'train' split as a DataFrame
twtsent_benchmark = pd.DataFrame(load_dataset("sentiment140")['train'])

# Swap the numeric codes for their text labels
twtsent_benchmark['sentiment'] = twtsent_benchmark['sentiment'].replace(sentiment_mapping)

In [None]:
import time
import torch 

# Start the wall-clock timer; the elapsed time is printed after the benchmark run below
start_time = time.time()

def sentiment_analysis_benchmark(dataset, model, tokenizer, batch_size=128):
    """Label every row of a benchmark frame with the model's zero-shot sentiment.

    Args:
        dataset: pandas DataFrame with a ``text`` column (the tweet) and a
            ``sentiment`` column (the reference label).
        model: seq2seq model exposing
            ``generate(input_ids=..., attention_mask=...)``.
        tokenizer: tokenizer matching ``model``; it is called on a list of
            prompts and must also provide ``decode``.
        batch_size: number of tweets sent to the model per ``generate`` call.

    Returns:
        DataFrame with one row per input row and the columns ``index``
        (positional row number), ``zs_label`` (the decoded model output,
        special tokens included) and ``original_label``.
    """
    prompt = "Classify the sentiment of the following tweet as positive, negative or neutral. Tweet: "
    # Same device as before whenever a GPU is present; CPU fallback otherwise.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    results = []
    with torch.no_grad():
        for start in range(0, len(dataset), batch_size):
            batch = dataset.iloc[start:start + batch_size]
            input_texts = [prompt + text for text in batch['text']]
            encoded = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
            # Batches are padded to a common length, so the attention mask is
            # passed along explicitly to tell the model which positions are padding.
            outputs = model.generate(
                input_ids=encoded.input_ids.to(device),
                attention_mask=encoded.attention_mask.to(device),
            )
            for offset, (output, original) in enumerate(zip(outputs, batch['sentiment'])):
                results.append({
                    "index": start + offset,
                    "zs_label": tokenizer.decode(output),
                    "original_label": original,
                })
    return pd.DataFrame(results)


# Label the benchmark frame, then report the wall-clock time since start_time
twtsent_benchmark_speedtest = sentiment_analysis_benchmark(
    twtsent_benchmark,
    model,
    tokenizer,
    batch_size=128,
)

elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

In [None]:
# Flan's decoded output wraps the label in special tokens (e.g. "<pad> positive</s>"),
# here we remove them to only keep the raw label

def extract_middle_word(text):
    """Return the label contained in a decoded model output string.

    The special tokens ``<pad>``, ``</s>`` and ``<unk>`` are removed wherever
    they occur and surrounding whitespace is trimmed, so padded batch outputs
    such as ``"<pad> negative</s><pad><pad>"`` and outputs without any wrapper
    are both handled instead of raising ``IndexError``.

    Args:
        text: decoded output string.

    Returns:
        The remaining label text; an empty string if nothing is left.
    """
    for token in ("<pad>", "</s>", "<unk>"):
        text = text.replace(token, " ")
    return " ".join(text.split())

# Replace each decoded model output with the bare label it contains
twtsent_benchmark_speedtest['zs_label'] = twtsent_benchmark_speedtest['zs_label'].map(extract_middle_word)

In [None]:
from sklearn.metrics import matthews_corrcoef

# Share of benchmark tweets whose zero-shot label equals the reference label
label_matches = twtsent_benchmark_speedtest["zs_label"] == twtsent_benchmark_speedtest["original_label"]
raw_accuracy = label_matches.mean()
print("Accuracy:", raw_accuracy)

# Matthews correlation between reference labels and zero-shot labels
matthews_corr = matthews_corrcoef(
    twtsent_benchmark_speedtest['original_label'],
    twtsent_benchmark_speedtest['zs_label'],
)
print("Matthews Correlation Coefficient:", matthews_corr)

#### Apply Sentiment Analysis to Twitter Data

In [None]:
import pandas as pd
import re

# Patterns are compiled once here instead of being re-evaluated for every row.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_HASHTAG_PATTERN = re.compile(r'#[a-zA-Z0-9_]+')
_MENTION_PATTERN = re.compile(r'@[a-zA-Z0-9_]+')


def _clean_tweet_text(text):
    """Return one tweet with URLs, hashtags, mentions and double quotes removed."""
    if not isinstance(text, str):
        # Missing values become an empty string instead of crashing re.sub;
        # any other non-string value is cleaned in its string form.
        text = '' if pd.isna(text) else str(text)

    # Order matters: URLs go first so that "#" and "@" inside them do not
    # leave fragments behind.
    text = _URL_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = text.replace('"', '')

    # Collapse runs of whitespace; a text that is a single word is blanked.
    words = text.split()
    return '' if len(words) == 1 else ' '.join(words)


def preprocess_text(df):
    """Clean the ``text`` column of ``df`` in place and return ``df``.

    Each text has URLs, hashtags, @mentions and double-quote characters
    removed and whitespace collapsed. Texts that end up as exactly one word
    are replaced by an empty string. Missing texts become an empty string.

    Args:
        df: pandas DataFrame with a ``text`` column.

    Returns:
        The same DataFrame object, with ``text`` overwritten.
    """
    df['text'] = df['text'].apply(_clean_tweet_text)
    return df

In [None]:
# Clean the text column of all four tweet collections, in the order listed
(
    covid_lowtrust_tweets,
    climate_lowtrust_tweets,
    user_timelines_covid,
    user_timelines_climate,
) = (
    preprocess_text(frame)
    for frame in (
        covid_lowtrust_tweets,
        climate_lowtrust_tweets,
        user_timelines_covid,
        user_timelines_climate,
    )
)

In [None]:
# This function differs from the benchmarking one: we have no benchmarking labels here, so we only keep the label output by the T5 model. It should run smoothly on an 80 GB GPU, but the batch size can be changed if needed. It is also possible to run it on a 40 GB GPU with smaller batch sizes and a longer computing time.

import time
import torch

def sentiment_classifier(dataset, model, tokenizer, batch_size):
    """Generate a zero-shot sentiment label for every tweet in ``dataset``.

    Args:
        dataset: pandas DataFrame with an ``id`` column and a ``text`` column.
        model: seq2seq model exposing
            ``generate(input_ids=..., attention_mask=...)``.
        tokenizer: tokenizer matching ``model``; it is called on a list of
            prompts and must also provide ``decode``.
        batch_size: number of tweets sent to the model per ``generate`` call.

    Returns:
        DataFrame with the columns ``id`` and ``sentiment_label`` (the decoded
        model output, special tokens included), one row per input row and in
        the same order as ``dataset``.
    """
    prompt = "Classify the sentiment of the following tweet as positive, negative or neutral. Tweet: "
    # Same device as before whenever a GPU is present; CPU fallback otherwise.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    results = []
    with torch.no_grad():
        for start in range(0, len(dataset), batch_size):
            batch = dataset.iloc[start:start + batch_size]
            input_texts = [prompt + text for text in batch['text']]
            encoded = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
            # Batches are padded to a common length, so the attention mask is
            # passed along explicitly to tell the model which positions are padding.
            outputs = model.generate(
                input_ids=encoded.input_ids.to(device),
                attention_mask=encoded.attention_mask.to(device),
            )
            # Ids are read straight from the column (not via a row lookup), so
            # they keep the column's own dtype.
            for tweet_id, output in zip(batch['id'], outputs):
                results.append({"id": tweet_id, "sentiment_label": tokenizer.decode(output)})
    return pd.DataFrame(results)

In [None]:
# After defining the sentiment classifier, we can now apply it to the four datasets

def classify_sentiment(dataset, model, tokenizer, batch_size):
    """Return ``dataset`` with a cleaned ``sentiment_label`` column attached.

    Runs ``sentiment_classifier`` over the frame, strips the special tokens
    from the decoded outputs and attaches the labels row by row. Prints the
    elapsed wall-clock time.

    Args:
        dataset: pandas DataFrame with ``id`` and ``text`` columns.
        model: model passed through to ``sentiment_classifier``.
        tokenizer: tokenizer passed through to ``sentiment_classifier``.
        batch_size: batch size passed through to ``sentiment_classifier``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, holding the rows of
        ``dataset`` in their original order plus ``sentiment_label``.
    """
    start_time = time.time()

    def extract_middle_word(text):
        # Strip special tokens wherever they occur, so padded or unwrapped
        # outputs do not raise IndexError.
        for token in ("<pad>", "</s>", "<unk>"):
            text = text.replace(token, " ")
        return " ".join(text.split())

    results_df = sentiment_classifier(dataset, model, tokenizer, batch_size)

    # For an empty dataset the classifier returns a frame without columns.
    raw_labels = results_df.get('sentiment_label', pd.Series(dtype=object))
    labels = raw_labels.apply(extract_middle_word)

    # The classifier emits exactly one result per row, in row order, so the
    # labels are attached by position. Joining on "id" would multiply rows
    # whenever an id occurs more than once in the dataset.
    merged_df = dataset.reset_index(drop=True)
    merged_df['sentiment_label'] = labels.to_numpy()

    print("--- %s seconds ---" % (time.time() - start_time))
    return merged_df

# Each call replaces the frame with the version returned by classify_sentiment,
# which carries an added 'sentiment_label' column. The frames are processed one
# after another, 64 tweets per batch, so a frame that has finished keeps its
# labels even if a later call fails.
covid_lowtrust_tweets = classify_sentiment(covid_lowtrust_tweets, model, tokenizer, batch_size=64)
climate_lowtrust_tweets = classify_sentiment(climate_lowtrust_tweets, model, tokenizer, batch_size=64)
user_timelines_covid = classify_sentiment(user_timelines_covid, model, tokenizer, batch_size=64)
user_timelines_climate = classify_sentiment(user_timelines_climate, model, tokenizer, batch_size=64)

### Sample Stratification II - Engagement Level

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def label_engagement_kmeans(lowtrust_tweets, user_timelines):
    """Label every post as low, mid or high engagement using k-means.

    Both frames are pooled so that one clustering is fitted on their combined
    engagement values. The four engagement counts are weighted, Box-Cox
    transformed and summed into ``total_engagement``; the standardised total
    is clustered into three groups, which are named by their mean total.

    The input frames are not modified.

    Args:
        lowtrust_tweets: DataFrame with the columns ``retweet_count``,
            ``reply_count``, ``like_count`` and ``quote_count``.
        user_timelines: DataFrame with the same four columns.

    Returns:
        Tuple ``(lowtrust_tweets, user_timelines)``: copies of the inputs with
        the added columns ``type``, ``total_engagement`` and
        ``engagement_level``. The four count columns hold the weighted,
        Box-Cox-transformed values, not the raw counts.
    """
    target_cols = ['retweet_count', 'reply_count', 'like_count', 'quote_count']
    # NOTE(review): weights are applied as given; confirm their source.
    weights = {'retweet_count': 2, 'reply_count': 54, 'like_count': 1, 'quote_count': 2}

    # Tag each row with its origin on copies of the inputs (assign does not
    # touch the caller's frames), then pool the two sets.
    merged_df = pd.concat([
        lowtrust_tweets.assign(type='lowtrust_tweets'),
        user_timelines.assign(type='user_timelines'),
    ])

    # Convert target columns to numeric and replace missing values with 0
    merged_df[target_cols] = merged_df[target_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Weight each engagement type, then Box-Cox transform it
    for col in target_cols:
        weighted = merged_df[col] * weights[col]
        transformed_data, _ = boxcox(weighted + 1)  # add 1 to handle 0 values
        merged_df[col] = transformed_data

    # Calculate total weighted engagement for each post
    merged_df['total_engagement'] = merged_df[target_cols].sum(axis=1)

    # Scale the total_engagement data
    scaler = StandardScaler()
    scaled_engagement = scaler.fit_transform(merged_df[['total_engagement']])

    # n_init is pinned because the library default changed between
    # scikit-learn releases; 10 is the value of the earlier default.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(scaled_engagement)
    merged_df['engagement_level'] = kmeans.labels_

    # Name the clusters by their mean total engagement
    mean_engagement = merged_df.groupby('engagement_level')['total_engagement'].mean()
    lowest_mean = mean_engagement.min()
    highest_mean = mean_engagement.max()
    label_map = {}
    for cluster_id, cluster_mean in mean_engagement.items():
        if cluster_mean <= lowest_mean:
            label_map[cluster_id] = 'low_engagement'
        elif cluster_mean >= highest_mean:
            label_map[cluster_id] = 'high_engagement'
        else:
            label_map[cluster_id] = 'mid_engagement'
    merged_df['engagement_level'] = merged_df['engagement_level'].map(label_map)

    # Split back by origin. Boolean masks also work when one of the two sets
    # contributed no rows, where groupby(...).get_group would raise KeyError.
    lowtrust_tweets = merged_df[merged_df['type'] == 'lowtrust_tweets'].copy()
    user_timelines = merged_df[merged_df['type'] == 'user_timelines'].copy()

    return lowtrust_tweets, user_timelines

# Climate and COVID data are clustered in two separate calls, so each topic's
# engagement levels come from a clustering fitted on that topic's posts only.
climate_lowtrust_tweets, user_timelines_climate = label_engagement_kmeans(climate_lowtrust_tweets, user_timelines_climate)
covid_lowtrust_tweets, user_timelines_covid = label_engagement_kmeans(covid_lowtrust_tweets, user_timelines_covid)

### Stratify Sample and Compare Impressions Metric

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

def scale_data(dataframe):
    """Robust-scale ``impressions_count`` and shift it so that no value is negative.

    The column is overwritten on the frame that is passed in. It is scaled
    with a ``RobustScaler`` over the 25th-75th percentile range; if the
    smallest scaled value is below zero, its magnitude is added to every row.

    Args:
        dataframe: DataFrame with an ``impressions_count`` column.

    Returns:
        The same DataFrame object.
    """
    robust_scaler = RobustScaler(quantile_range=(25, 75))
    scaled_values = robust_scaler.fit_transform(dataframe[['impressions_count']])
    dataframe['impressions_count'] = scaled_values

    smallest = dataframe['impressions_count'].min()
    shift = abs(min(smallest, 0))
    dataframe['impressions_count'] = dataframe['impressions_count'] + shift
    return dataframe

def calculate_average_impressions_by_user(user_timelines):
    """Compute each author's baseline impressions within every stratum.

    A stratum is the sentiment label joined by ``.`` to the lower-cased
    engagement level. Despite the column name, the baseline is the median of
    ``impressions_count`` per (author, stratum) pair. The input frame is not
    modified.

    Args:
        user_timelines: DataFrame with the columns ``author_id``,
            ``sentiment_label``, ``engagement_level`` and ``impressions_count``.

    Returns:
        DataFrame with the columns ``author_id``, ``stratum`` and
        ``average_impressions``.
    """
    stratum = user_timelines['sentiment_label'] + '.' + user_timelines['engagement_level'].str.lower()
    labelled = user_timelines.assign(stratum=stratum)
    per_author = labelled.groupby(['author_id', 'stratum'])['impressions_count'].median()
    return per_author.reset_index().rename(columns={'impressions_count': 'average_impressions'})

def performance_computation(lowtrust_tweets, user_timelines):
    """Compare each low-trust tweet's impressions with its author's baseline.

    Both frames are copied and scaled with ``scale_data``. Each low-trust
    tweet is then left-joined, on author and stratum, to the baseline from
    ``calculate_average_impressions_by_user``. Zeros in either impressions
    column are replaced by that column's smallest positive value, which
    avoids dividing by zero. The percentage difference is stored as
    ``impressions_performance`` and only rows between its 0.5th and 99.5th
    percentiles are kept.

    Args:
        lowtrust_tweets: DataFrame with ``author_id``, ``sentiment_label``,
            ``engagement_level`` and ``impressions_count``.
        user_timelines: DataFrame with the same columns.

    Returns:
        The joined DataFrame with the added columns ``stratum``,
        ``average_impressions`` and ``impressions_performance``.
    """
    scaled_lowtrust = scale_data(lowtrust_tweets.copy())
    scaled_timelines = scale_data(user_timelines.copy())
    baseline = calculate_average_impressions_by_user(scaled_timelines)

    scaled_lowtrust['stratum'] = (
        scaled_lowtrust['sentiment_label'] + '.' + scaled_lowtrust['engagement_level'].str.lower()
    )
    merged = scaled_lowtrust.merge(baseline, on=['author_id', 'stratum'], how='left')

    # Smallest strictly positive value per column, taken before any replacement
    floors = {}
    for column in ('average_impressions', 'impressions_count'):
        positive_values = merged.loc[merged[column] > 0, column]
        floors[column] = positive_values.min()

    # Zeros are lifted to that floor
    for column, floor in floors.items():
        merged[column] = np.where(merged[column] == 0, floor, merged[column])

    difference = merged['impressions_count'] - merged['average_impressions']
    merged['impressions_performance'] = (difference / merged['average_impressions']) * 100

    # Trim the extreme tails; rows without a baseline (NaN) are dropped here too
    lower, upper = merged['impressions_performance'].quantile([0.005, 0.995])
    within_bounds = merged['impressions_performance'].between(lower, upper)
    return merged.loc[within_bounds]

In [None]:
# Apply the function to the covid and climate dataframes
# (the function defined in the previous cell is named performance_computation).
# NOTE(review): later cells refer to `results_covid` / `results_climate`; confirm
# whether those are meant to be the frames produced here.
covid_results = performance_computation(covid_lowtrust_tweets, user_timelines_covid)
climate_results = performance_computation(climate_lowtrust_tweets, user_timelines_climate)

### Scrape updated MBFC Bias Ratings 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_bias_ratings_for_dataframe_full(df, timeout=30):
    """Add a ``bias_rating`` column to ``df`` by scraping mediabiasfactcheck.com.

    Every distinct value of ``df['domains']`` is looked up once: the site
    search gives the first result's page, and the first known rating label
    found in that page's ``entry-content`` block is taken as the rating.
    Progress is printed per domain. A domain whose request fails is reported
    and treated as not found, so one network error does not abort the run.

    Args:
        df: pandas DataFrame with a ``domains`` column. It is modified in
            place.
        timeout: seconds to wait for each HTTP request.

    Returns:
        The same DataFrame object with ``bias_rating`` filled in; rows without
        a rating get ``'Bias Rating Not Found'``.
    """
    not_found_label = 'Bias Rating Not Found'

    # Longest labels first, so that e.g. "LEFT-CENTER" is matched before "LEFT"
    bias_ratings = sorted(
        ["LEFT", "RIGHT", "RIGHT-EXTREME", "LEFT-EXTREME", "CENTER", "FAR-RIGHT", "FAR-LEFT", "RIGHT-CENTER", "LEFT-CENTER", "FAR RIGHT", "FAR LEFT", "RIGHT EXTREME", "LEFT EXTREME", "RIGHT CENTER", "LEFT CENTER", "Far right", "far right"],
        key=len,
        reverse=True,
    )

    def fetch_soup(url, **request_kwargs):
        # Download a page and parse it; HTTP error statuses raise.
        response = requests.get(url, timeout=timeout, **request_kwargs)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    # Function to fetch the MBFC URL for a given domain
    def get_mbfc_url(domain):
        # The domain is sent as a query parameter so that it is URL-encoded.
        soup = fetch_soup("https://mediabiasfactcheck.com/", params={"s": domain})
        search_result = soup.select_one("h3.entry-title a")
        if search_result:
            return search_result["href"]
        return None

    def get_bias_rating(mbfc_url):
        soup = fetch_soup(mbfc_url)
        rating_div = soup.find("div", class_="entry-content")
        if rating_div is None:
            return None
        text = rating_div.get_text()
        return next((rating for rating in bias_ratings if rating in text), None)

    def lookup(domain):
        # Resolve one domain to a rating label, or None; prints progress.
        mbfc_url = get_mbfc_url(domain)
        if not mbfc_url:
            print(f"No MBFC URL found for domain: {domain}")
            return None
        print(f"MBFC URL found for domain: {domain}")
        bias_rating = get_bias_rating(mbfc_url)
        if bias_rating:
            print(f"Bias rating found for domain {domain}: {bias_rating}")
            return bias_rating
        print(f"No bias rating found for domain: {domain}")
        return None

    # Each distinct domain is fetched once, in order of first appearance
    domain_bias_dict = {}
    for domain in df['domains'].unique():
        if pd.isna(domain):
            continue  # nothing to search for
        print(f"Fetching bias rating for domain: {domain}")
        try:
            domain_bias_dict[domain] = lookup(domain)
        except requests.RequestException as error:
            print(f"Request failed for domain {domain}: {error}")
            domain_bias_dict[domain] = None

    # Column-wise assignment also works for an empty frame and for frames
    # with repeated index labels.
    df['bias_rating'] = df['domains'].map(
        lambda domain: domain_bias_dict.get(domain) or not_found_label
    )

    return df


In [None]:
# Apply the function to the dataframes
# NOTE(review): `results_climate` / `results_covid` are not assigned in the visible
# cells (the impressions step stores its output as `climate_results` /
# `covid_results`) — confirm which frames are meant here. They need a 'domains' column.
results_climate = get_bias_ratings_for_dataframe_full(results_climate)
results_covid = get_bias_ratings_for_dataframe_full(results_covid)

# Some values are worded differently in MBFC, so we need to modify them
# (only the "FAR-RIGHT" and "far right" spellings are folded into 'FAR RIGHT';
# every other label is kept as scraped)
results_covid['bias_rating'] = results_covid['bias_rating'].replace(["FAR-RIGHT","far right"], 'FAR RIGHT')
results_climate['bias_rating'] = results_climate['bias_rating'].replace(["FAR-RIGHT","far right"], 'FAR RIGHT')

### Topic Modelling with Bertopic + GPT3.5 on Top Performers - Not Yet Implemented

In [None]:
import pandas as pd
import re
import openai
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.representation import OpenAI
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

def topic_modeling(data, min_cluster_size, min_samples):
    """Fit a BERTopic model on the top-performing tweets.

    Keeps the rows whose ``impressions_performance`` is at or above the 75th
    percentile, cleans their text and fits BERTopic with a MiniLM sentence
    embedding, UMAP, HDBSCAN, a bigram CountVectorizer and an OpenAI-based
    topic labeller. ``data`` itself is not modified.

    Args:
        data: DataFrame with ``impressions_performance`` and ``text`` columns.
        min_cluster_size: passed to HDBSCAN.
        min_samples: passed to HDBSCAN.

    Returns:
        Tuple ``(topics, probs, model)`` from ``BERTopic.fit_transform`` plus
        the fitted BERTopic instance.
    """
    import os

    # get 75th percentile of 'impressions_performance' column in dataframe
    pct_75 = data['impressions_performance'].quantile(0.75)

    # select all rows at or above the 75th percentile; rows without text are
    # dropped, and the copy keeps the cleaning below from writing into a
    # slice of the caller's frame
    selected_rows = data[data['impressions_performance'] >= pct_75].dropna(subset=['text']).copy()

    # define a function to clean tweet text
    def clean_tweet(tweet):
        tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
        tweet = re.sub(r'#\w+', '', tweet)  # Remove hashtags
        tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
        tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
        tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)  # Remove non-ASCII characters
        # Collapse whitespace last, so gaps left by the removals above are closed too
        tweet = re.sub(r'\s+', ' ', tweet)
        tweet = tweet.strip()  # Remove extra whitespace
        return tweet

    # apply clean_tweet function to 'text' column of selected rows
    selected_rows['text'] = selected_rows['text'].apply(clean_tweet)

    # filter out rows where 'text' column has zero length
    mask = selected_rows['text'].str.len() >= 1
    selected_rows = selected_rows.loc[mask]

    # create list of tweet text strings
    data_list = selected_rows['text'].tolist()

    # download stopwords
    nltk.download('stopwords')

    # set up stopwords list
    stopwords_list = list(stopwords.words('english')) + ['http', 'https', 'amp', 'com']

    # set up CountVectorizer with stopword list
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords_list)

    # set up OpenAI representation model; the key is read from the
    # OPENAI_API_KEY environment variable when set, so that it does not have
    # to be written into the notebook
    openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")
    prompt = """
    I have a topic that contains the following documents: 
    [DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a very short topic label in the following format:
    topic: <topic label>
    """
    representation_model = OpenAI(model="gpt-3.5-turbo", delay_in_seconds=10, chat=True, prompt=prompt)

    # set up SentenceTransformer embedding model and UMAP and HDBSCAN clustering models
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05, random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, gen_min_span_tree=True, prediction_data=True)

    # set up BERTopic model with above components and fit on tweet text data
    # (named topic_model so that it does not shadow the notebook-level `model`)
    topic_model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        top_n_words=15,
        language='english',
        calculate_probabilities=True,
        verbose=True
        )

    topics, probs = topic_model.fit_transform(data_list)
    return topics, probs, topic_model