## Preprocess the data

This notebook parses the Media Frames Corpus data and calculates, for each sentence, the bias along a defined set of microframes.

In [4]:
!pip install nltk

[0m

In [5]:
import nltk

# Fetch the complete NLTK data collection; packages that are already present
# are reported as up-to-date. The call's return value is shown as cell output.
nltk.download(info_or_id="all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [6]:
import os

# List the current working directory to confirm the notebook runs from the
# project root (the relative data paths used later depend on it).
os.listdir()

['FRISS_srl.pkl',
 'training_metrics.csv',
 'README.md',
 'notebooks',
 'chunks.pkl',
 'grid_search_metrics.csv',
 '.git',
 'assets',
 'test.csv',
 'friss',
 'models',
 '.ipynb_checkpoints',
 'data',
 '.gitignore',
 'frameaxis']

In [7]:
# Input files, relative to the working directory:
# - labels_path: per-article annotations that are read into `labels`
# - codes_path: lookup from frame code to frame name that is read into `codes`
labels_path = "data/mfc/immigration_labeled.json"
codes_path = "data/mfc/codes.json"

In [8]:
# Load the article annotations and the frame-code lookup from disk.
import json

# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale encoding (assumes the files are UTF-8, the JSON standard).
with open(labels_path, encoding="utf-8") as f:
    labels = json.load(f)

with open(codes_path, encoding="utf-8") as f:
    codes = json.load(f)

In [9]:
import pandas as pd
from nltk.tokenize import sent_tokenize

# Frame code substituted when an article has no primary frame
# (presumably the "Other" entry of the code lookup -- confirm against codes.json).
DEFAULT_FRAME_CODE = "15.0"


def _document_frame_name(raw_code, code_lookup):
    """Translate an article's raw primary-frame code into its frame name.

    Only the part before the first "." of the code is kept and ".0" is
    appended (so e.g. 5.2 becomes "5.0") before looking it up; a missing
    code (``None``) falls back to ``DEFAULT_FRAME_CODE``.

    Raises:
        KeyError: if the normalised code is not a key of ``code_lookup``.
    """
    if raw_code is None:
        key = DEFAULT_FRAME_CODE
    else:
        key = str(raw_code).split(".")[0] + ".0"
    return str(code_lookup[key])


# One record per sentence; every sentence inherits the labels of its article.
articles_list = []

for article_id, article_data in labels.items():
    irrelevant = article_data['irrelevant']
    primary_frame = _document_frame_name(article_data['primary_frame'], codes)

    # Split the article text into sentences with NLTK and emit one row each.
    articles_list.extend(
        {
            'article_id': article_id,
            'irrelevant': irrelevant,
            'text': sentence,
            'document_frame': primary_frame,
        }
        for sentence in sent_tokenize(article_data['text'])
    )

# Sentence-level frame: article id, relevance flag, sentence text, frame name.
df = pd.DataFrame(articles_list, columns=['article_id', 'irrelevant', 'text', 'document_frame'])


In [10]:
# Inspect the sentence-level frame (one row per sentence).
df

Unnamed: 0,article_id,irrelevant,text,document_frame
0,Immigration1.0-10005,0.0,IMM-10005\n\nPRIMARY\n\nImmigrants without HOP...,Quality of Life
1,Immigration1.0-10005,0.0,It mounted as students went around the room te...,Quality of Life
2,Immigration1.0-10005,0.0,Georgia Tech.,Quality of Life
3,Immigration1.0-10005,0.0,University of Georgia.,Quality of Life
4,Immigration1.0-10005,0.0,"""All I could say was, 'I'm planning to see if ...",Quality of Life
...,...,...,...,...
74463,Immigration1.0-9998,0.0,"Sue Brown, spokeswoman for the INS, said it's ...",Crime and Punishment
74464,Immigration1.0-9998,0.0,"""They love it,"" she said.",Crime and Punishment
74465,Immigration1.0-9998,0.0,"""They use these units to interview the people,...",Crime and Punishment
74466,Immigration1.0-9998,0.0,"""We do about 15 interviews a day,"" Brown said.",Crime and Punishment


In [11]:
# Keep only sentences whose article is not flagged irrelevant and drop the
# flag column. The comparison is kept as `== False` on purpose: the column
# holds numeric flags, so a boolean negation would not be equivalent.
df = df.loc[df["irrelevant"] == False, ["article_id", "text", "document_frame"]]  # noqa: E712

In [12]:
# Preview the first five rows after filtering.
df.head(5)

Unnamed: 0,article_id,text,document_frame
0,Immigration1.0-10005,IMM-10005\n\nPRIMARY\n\nImmigrants without HOP...,Quality of Life
1,Immigration1.0-10005,It mounted as students went around the room te...,Quality of Life
2,Immigration1.0-10005,Georgia Tech.,Quality of Life
3,Immigration1.0-10005,University of Georgia.,Quality of Life
4,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...",Quality of Life


In [13]:
# One-hot encode the document frame: one indicator column per frame name,
# set to 1 on rows whose `document_frame` equals that name.
frame_indicators = pd.get_dummies(df['document_frame'])
df = pd.concat([df, frame_indicators], axis="columns")

In [14]:
# Preview the first five rows including the new indicator columns.
df.head(5)

Unnamed: 0,article_id,text,document_frame,Capacity and Resources,Crime and Punishment,Cultural Identity,Economic,External Regulation and Reputation,Fairness and Equality,Health and Safety,"Legality, Constitutionality, Jurisdiction",Morality,Other,Policy Prescription and Evaluation,Political,Public Sentiment,Quality of Life,Security and Defense
0,Immigration1.0-10005,IMM-10005\n\nPRIMARY\n\nImmigrants without HOP...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Immigration1.0-10005,It mounted as students went around the room te...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Immigration1.0-10005,Georgia Tech.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,Immigration1.0-10005,University of Georgia.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...",Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [15]:
# (rows, columns) of the filtered, one-hot encoded sentence frame.
df.shape

(67480, 18)

# FrameAxis

In [13]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from multiprocessing import Pool, cpu_count
from tqdm.notebook import tqdm

In [14]:
class ContextualEmbeddingAnalyzer:
    """Score texts against antonym-pair axes using BERT embeddings.

    For every ``(pos_word, neg_word)`` pair an axis vector
    ``embedding(neg_word) - embedding(pos_word)`` is built. Each text is then
    scored per axis by the mean, over all token embeddings of the text, of
    ``1 - cosine_similarity(axis, token)`` -- i.e. a cosine *distance*, even
    though the progress messages and variable names say "similarity".
    """

    def __init__(self, antonym_pairs, model_name='bert-base-uncased'):
        """Load tokenizer and model from ``model_name``.

        Args:
            antonym_pairs: iterable of ``(pos_word, neg_word)`` pairs.
            model_name: checkpoint passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.antonym_pairs = antonym_pairs
        # Load the tokenizer from the same checkpoint as the model; it used to
        # be hard-coded, which silently ignored a non-default `model_name`.
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        if torch.cuda.is_available():
            print("Using CUDA")
            self.model.cuda()

    def process_dataframe(self, df, remove_stopwords=True):
        """Clean the ``text`` column and append one score column per axis.

        The text is tokenized with ``word_tokenize``; only alphabetic tokens
        are kept, English stop words are dropped when ``remove_stopwords`` is
        true, and the remaining tokens are re-joined with single spaces.

        Args:
            df: DataFrame with a ``text`` column. It is not modified.
            remove_stopwords: whether to drop NLTK English stop words.

        Returns:
            A new DataFrame holding the cleaned ``text`` plus one column named
            ``"{pos_word}_{neg_word}"`` per antonym pair.
        """
        print("Preprocessing DataFrame")
        stop_words = set(stopwords.words('english')) if remove_stopwords else set()

        def clean(text):
            return ' '.join(
                word for word in word_tokenize(text)
                if word.isalpha() and word.lower() not in stop_words
            )

        # `assign` returns a new frame, so the caller's frame (often a slice of
        # a larger one) is neither mutated nor triggers SettingWithCopyWarning.
        df = df.assign(text=df['text'].apply(clean))

        print("Calculating Cosine Similarities")

        return self.calculate_cosine_similarities(df)

    def calculate_cosine_similarities(self, df):
        """Return ``df`` joined with one mean cosine-distance column per axis.

        Args:
            df: DataFrame with a ``text`` column of strings.

        Returns:
            ``df`` joined (on its index) with columns named
            ``"{pos_word}_{neg_word}"``.
        """
        pairs = [tuple(pair) for pair in self.antonym_pairs]
        if not pairs:
            # No axes defined: there is nothing to add.
            return df.copy()

        axis_names = [f"{pos_word}_{neg_word}" for pos_word, neg_word in pairs]
        # The axis vectors do not depend on the row, so embed every pair once
        # up front instead of once per row.
        axis_matrix = np.stack([
            (self.get_embedding(neg_word) - self.get_embedding(pos_word)).cpu().numpy()
            for pos_word, neg_word in pairs
        ])

        def process_row(row):
            # NOTE(review): averages over every position the model returns;
            # presumably that includes the tokenizer's special tokens -- confirm
            # this is intended.
            token_matrix = self.get_embeddings(row['text']).cpu().numpy()
            # Shape (n_axes, n_tokens): distance of every token to every axis.
            distances = 1 - cosine_similarity(axis_matrix, token_matrix)
            # Building via a dict keeps the "last pair wins" behaviour should
            # two pairs produce the same column name.
            return pd.Series(dict(zip(axis_names, distances.mean(axis=1))))

        # Apply the function to each row with tqdm for progress tracking
        tqdm.pandas(desc="Calculating Cosine Similarities")
        cos_sim_columns = df.progress_apply(process_row, axis=1)

        # Join the results back to the original DataFrame
        return df.join(cos_sim_columns)

    def _last_hidden_state(self, text, **tokenizer_kwargs):
        """Run ``text`` through tokenizer and model; return the last hidden
        state with the leading batch dimension squeezed away."""
        inputs = self.tokenizer(text, return_tensors="pt", **tokenizer_kwargs)
        # Move inputs to the model's device
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.squeeze(0)

    def get_embeddings(self, text):
        """Return one embedding per token position of ``text``
        (tokenized with padding and truncation enabled)."""
        return self._last_hidden_state(text, padding=True, truncation=True)

    def get_embedding(self, word):
        """Return a single vector for ``word``: the mean of its last hidden
        state over all token positions."""
        return self._last_hidden_state(word).mean(dim=0)


In [15]:
# Read the custom axes file (tab-separated, no header row) into a DataFrame.
axes_df = pd.read_csv("frameaxis/axes/custom.tsv", sep="\t", header=None)

# Turn every row of the file into a tuple; the analyzer unpacks each tuple
# as (pos_word, neg_word).
antonym_pairs = list(map(tuple, axes_df.values))

In [16]:
# Partition the unique article ids into 5 near-equal chunks, stored as a list
# of lists, so the embedding step can be run one chunk at a time.
chunks = list(map(list, np.array_split(df["article_id"].unique(), 5)))

In [17]:
import pickle

# Persist the chunk assignment so later sessions can reload the same split.
with open("chunks.pkl", mode='wb') as chunk_file:
    pickle.dump(chunks, chunk_file)

In [18]:
import pickle

# Reload the chunk assignment written by this notebook.
# NOTE: unpickling executes code -- only load a chunks.pkl you created yourself.
with open("chunks.pkl", mode='rb') as chunk_file:
    chunks = pickle.load(chunk_file)

In [19]:
# Restrict the frame to the sentences of the articles in the first chunk.
# Note that this overwrites `df`, so the cell is not idempotent.
df = df.loc[df["article_id"].isin(chunks[0])]

In [20]:
# Number of sentences in the selected chunk.
df.shape[0]

13424

In [21]:
# Checkpoint used for the analyzer.
bert_model_path = "bert-base-uncased"

# NLTK resources for the preprocessing step, which calls word_tokenize and
# stopwords.words.
nltk.download('punkt')
nltk.download('stopwords')

# Pass the configured checkpoint instead of repeating the literal, so changing
# `bert_model_path` actually takes effect.
analyzer = ContextualEmbeddingAnalyzer(antonym_pairs, model_name=bert_model_path)
frameaxis_df = analyzer.process_dataframe(df)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using CUDA
Preprocessing DataFrame
Calculating Cosine Similarities


Calculating Cosine Similarities:   0%|          | 0/13424 [00:00<?, ?it/s]

In [None]:
# Write the scores of the chunk processed above to its own CSV (chunk 1).
# The index is written too (pandas default); it comes back as an
# 'Unnamed: 0' column when the file is read again.
frameaxis_df.to_csv(path_or_buf="data/mfc/frameaxis_df_chunk1.csv")

In [2]:
import pandas as pd

# Re-assemble the five per-chunk result files (chunk1 ... chunk5) into one frame.
chunk_paths = [f"data/mfc/frameaxis_df_chunk{n}.csv" for n in range(1, 6)]

# Every file is read with its own 0-based index; renumber the rows while
# concatenating so the combined frame has no duplicate index labels.
frameaxis_df = pd.concat([pd.read_csv(path) for path in chunk_paths], ignore_index=True)

frameaxis_df.head()

Unnamed: 0.1,Unnamed: 0,article_id,text,document_frame,Capacity and Resources,Crime and Punishment,Cultural Identity,Economic,External Regulation and Reputation,Fairness and Equality,...,rational_irrational,objective_subjective,legitimate_illegitimate,inclusive_exclusive,productive_unproductive,professional_unprofessional,realistic_unrealistic,consistent_inconsistent,relevant_irrelevant,sophisticated_unsophisticated
0,0,Immigration1.0-10005,PRIMARY Immigrants without HOPE need help ente...,Quality of Life,0,0,0,0,0,0,...,0.999872,0.926197,1.049736,1.021506,0.957948,0.8857,0.864725,0.954054,1.024497,0.902283
1,1,Immigration1.0-10005,mounted students went around room telling Broo...,Quality of Life,0,0,0,0,0,0,...,0.965384,0.959009,1.049423,0.98196,0.991059,0.925184,0.911629,0.989337,1.001607,0.922871
2,2,Immigration1.0-10005,Georgia Tech,Quality of Life,0,0,0,0,0,0,...,0.919163,0.96647,0.946616,0.944798,0.948911,1.010055,0.98284,0.931321,1.049317,1.014321
3,3,Immigration1.0-10005,University Georgia,Quality of Life,0,0,0,0,0,0,...,0.940292,0.99476,1.01464,0.924419,1.119473,1.122521,1.117594,0.9394,1.035857,1.124428
4,4,Immigration1.0-10005,could say planning see get college situation r...,Quality of Life,0,0,0,0,0,0,...,1.00299,0.988053,1.117766,1.010197,1.059625,0.973413,0.991474,0.984446,1.009764,0.978467


In [17]:
# List all columns of the combined frame to decide which ones to drop.
frameaxis_df.columns

Index(['Unnamed: 0', 'article_id', 'text', 'document_frame',
       'Capacity and Resources', 'Crime and Punishment', 'Cultural Identity',
       'Economic', 'External Regulation and Reputation',
       'Fairness and Equality', 'Health and Safety',
       'Legality, Constitutionality, Jurisdiction', 'Morality', 'Other',
       'Policy Prescription and Evaluation', 'Political', 'Public Sentiment',
       'Quality of Life', 'Security and Defense', 'beneficial_harmful',
       'honest_dishonest', 'progressive_regressive', 'transparent_opaque',
       'constructive_destructive', 'informed_uninformed', 'ethical_unethical',
       'authentic_inauthentic', 'tolerant_intolerant',
       'responsible_irresponsible', 'rational_irrational',
       'objective_subjective', 'legitimate_illegitimate',
       'inclusive_exclusive', 'productive_unproductive',
       'professional_unprofessional', 'realistic_unrealistic',
       'consistent_inconsistent', 'relevant_irrelevant',
       'sophisticated_uns

In [18]:
# Columns that are not needed in the final output: the saved index column,
# the sentence text, the frame name and its one-hot indicator columns.
# What remains is `article_id` plus the per-axis score columns.
drop_columns = [
    'Unnamed: 0', 'text', 'document_frame',
    'Capacity and Resources', 'Crime and Punishment', 'Cultural Identity',
    'Economic', 'External Regulation and Reputation',
    'Fairness and Equality', 'Health and Safety',
    'Legality, Constitutionality, Jurisdiction', 'Morality', 'Other',
    'Policy Prescription and Evaluation', 'Political', 'Public Sentiment',
    'Quality of Life', 'Security and Defense',
]

frameaxis_df = frameaxis_df.drop(columns=drop_columns)

In [25]:
# Confirm which columns remain after the drop.
frameaxis_df.columns

Index(['article_id', 'beneficial_harmful', 'honest_dishonest',
       'progressive_regressive', 'transparent_opaque',
       'constructive_destructive', 'informed_uninformed', 'ethical_unethical',
       'authentic_inauthentic', 'tolerant_intolerant',
       'responsible_irresponsible', 'rational_irrational',
       'objective_subjective', 'legitimate_illegitimate',
       'inclusive_exclusive', 'productive_unproductive',
       'professional_unprofessional', 'realistic_unrealistic',
       'consistent_inconsistent', 'relevant_irrelevant',
       'sophisticated_unsophisticated'],
      dtype='object')

In [24]:
# Write the combined axis scores of all chunks to a single CSV, without the index.
frameaxis_df.to_csv(path_or_buf="data/frameaxis/mfc/frameaxis_df_all.csv", index=False)

In [26]:
# Read the combined file back as a check that it round-trips.
frameaxis_df_temp = pd.read_csv(filepath_or_buffer="data/frameaxis/mfc/frameaxis_df_all.csv")

In [27]:
# Preview the first five rows of the reloaded file.
frameaxis_df_temp.head(5)

Unnamed: 0,article_id,beneficial_harmful,honest_dishonest,progressive_regressive,transparent_opaque,constructive_destructive,informed_uninformed,ethical_unethical,authentic_inauthentic,tolerant_intolerant,...,rational_irrational,objective_subjective,legitimate_illegitimate,inclusive_exclusive,productive_unproductive,professional_unprofessional,realistic_unrealistic,consistent_inconsistent,relevant_irrelevant,sophisticated_unsophisticated
0,Immigration1.0-10005,1.050137,0.853,0.860336,1.019595,1.020722,0.867933,0.869395,0.902169,0.919827,...,0.999872,0.926197,1.049736,1.021506,0.957948,0.8857,0.864725,0.954054,1.024497,0.902283
1,Immigration1.0-10005,1.019979,0.905785,0.925394,1.03109,1.037501,0.927932,0.896408,0.94621,0.925442,...,0.965384,0.959009,1.049423,0.98196,0.991059,0.925184,0.911629,0.989337,1.001607,0.922871
2,Immigration1.0-10005,0.948202,0.95088,1.042683,1.071184,1.110367,0.976848,0.960112,0.944536,0.990828,...,0.919163,0.96647,0.946616,0.944798,0.948911,1.010055,0.98284,0.931321,1.049317,1.014321
3,Immigration1.0-10005,0.937088,1.106904,1.110138,1.041274,1.058978,1.092998,1.117327,1.081627,1.122703,...,0.940292,0.99476,1.01464,0.924419,1.119473,1.122521,1.117594,0.9394,1.035857,1.124428
4,Immigration1.0-10005,1.078151,0.971434,0.926846,1.027939,1.042174,0.963643,0.952304,1.036815,1.03918,...,1.00299,0.988053,1.117766,1.010197,1.059625,0.973413,0.991474,0.984446,1.009764,0.978467
