# Prepare datasets

Read the dataset, keep the relevant articles that have a primary frame, clean the article texts, split the articles into sentences, and save both the article-level and the sentence-level tables to new files.

## Media Frames Corpus (MFC) dataset

In [1]:
import pandas as pd

In [2]:
# Labeled-article JSON files of the MFC dataset, one path per issue.
# The paths are relative, so they resolve against the notebook's working directory.
immigration_path = "../../data/mfc/immigration_labeled.json"
deathpenalty_path = "../../data/mfc/deathpenalty_labeled.json"
guncontrol_path = "../../data/mfc/guncontrol_labeled.json"
samesex_path = "../../data/mfc/samesex_labeled.json"
tobacco_path = "../../data/mfc/tobacco_labeled.json"
# JSON file that is later loaded into `codes`, the lookup from code to name.
frames_code = "../../data/mfc/codes.json"

In [3]:
# json parses both the labeled corpora and the code lookup table
import json

# read labeled data
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system (e.g. Windows).
with open(immigration_path, encoding="utf-8") as f:
    immigration = json.load(f)

with open(deathpenalty_path, encoding="utf-8") as f:
    deathpenalty = json.load(f)

with open(guncontrol_path, encoding="utf-8") as f:
    guncontrol = json.load(f)

with open(samesex_path, encoding="utf-8") as f:
    samesex = json.load(f)

with open(tobacco_path, encoding="utf-8") as f:
    tobacco = json.load(f)

# read frames: `codes` is the lookup that get_frame_name / get_tone_name index by code
with open(frames_code, encoding="utf-8") as f:
    codes = json.load(f)

In [22]:
def get_frame_name(code, remove_suffix=False):
    """Look up the name stored in the module-level ``codes`` table for a frame code.

    Args:
        code: Frame code; it is converted with ``str`` before the lookup.
        remove_suffix: When True, the entry stored under ``str(code)`` is
            returned with the substrings " primary" and " primany" removed.
            When False, the entry is returned unchanged, and a code whose
            string form contains no "." is looked up under
            ``str(code) + ".2"`` instead.

    Returns:
        The (optionally shortened) name found in ``codes``.
    """
    key = str(code)

    if remove_suffix:
        # " primany" is stripped too -- presumably a misspelling that occurs in
        # the code table; verify against codes.json.
        # NOTE(review): this path never appends ".2", unlike the path below.
        return codes[key].replace(" primary", "").replace(" primany", "")

    # A code without a decimal part is resolved through its ".2" entry.
    if "." not in key:
        key += ".2"
    return codes[key]

def get_tone_name(code):
    """Translate a tone code into its name.

    The codes "17", "18" and "19" map to "Pro", "Neutral" and "Anti". Any other
    value is looked up in the module-level ``codes`` table.
    """
    fixed_tones = (("17", "Pro"), ("18", "Neutral"), ("19", "Anti"))
    for tone_code, tone_name in fixed_tones:
        if code == tone_code:
            return tone_name

    # Everything else is delegated to the code table.
    return codes[code]

In [23]:
# Report how many records each loaded corpus contains.
for _label, _corpus in (
    ("immigration data count: ", immigration),
    ("deathpenalty data count: ", deathpenalty),
    ("guncontrol data count: ", guncontrol),
    ("samesex data count: ", samesex),
    ("tobacco data count: ", tobacco),
):
    print(_label, len(_corpus))

immigration data count:  6757
deathpenalty data count:  6398
guncontrol data count:  6689
samesex data count:  10583
tobacco data count:  5274


In [55]:
def process_data(json_file):
    """Load a labeled corpus file and build a one-hot encoded frame table.

    Records that have no primary frame or that are flagged as irrelevant are
    skipped. For every remaining record the primary frame is resolved to its
    name with ``get_frame_name(..., remove_suffix=True)`` and marked with a 1
    in the matching frame column. Summary counts are printed before returning.

    Args:
        json_file: Path to a JSON file holding an object that maps article ids
            to records with the keys "text", "primary_frame" and "irrelevant".

    Returns:
        pandas.DataFrame with the columns ``article_id``, ``text`` and
        ``document_frame``, followed by one 0/1 column per frame name.
    """
    # Load JSON data from a file (explicit encoding: do not depend on the locale)
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Define the frame names as columns
    frame_columns = ['Capacity and Resources', 'Crime and Punishment', 'Cultural Identity', 'Economic',
                     'External Regulation and Reputation', 'Fairness and Equality', 'Health and Safety',
                     'Legality, Constitutionality, Jurisdiction', 'Morality', 'Other',
                     'Policy Prescription and Evaluation', 'Political', 'Public Sentiment',
                     'Quality of Life', 'Security and Defense']

    rows = []
    no_frames_counter = 0
    irrelevant_counter = 0

    for key, value in data.items():
        has_no_frame = value["primary_frame"] is None
        is_irrelevant = value['irrelevant'] == 1

        if has_no_frame or is_irrelevant:
            # The two reasons are counted independently because a record can
            # be both irrelevant and without a primary frame.
            if is_irrelevant:
                irrelevant_counter += 1
            if has_no_frame:
                no_frames_counter += 1
            continue

        frame_name = get_frame_name(value['primary_frame'], remove_suffix=True)

        row = {
            'article_id': key,
            'text': value['text'],
            'document_frame': frame_name
        }

        # Start with every frame column at 0, then flag the primary frame.
        row.update(dict.fromkeys(frame_columns, 0))
        # A name that is not listed in frame_columns is dropped further down,
        # because the DataFrame is built with an explicit column list.
        row[frame_name] = 1
        rows.append(row)

    print("No frames: ", no_frames_counter)
    print("Irrelevant: ", irrelevant_counter)
    print("Total correct: ", len(rows))
    print("Total: ", len(data))

    # Create DataFrame from list of rows
    df = pd.DataFrame(rows, columns=['article_id', 'text', 'document_frame'] + frame_columns)
    return df

In [56]:
# Build the one-hot frame table for the immigration corpus (process_data also prints the skip counts).
imm_df = process_data(immigration_path)

No frames:  824
Irrelevant:  825
Total correct:  5932
Total:  6757


In [64]:
import re

def preprocess_text(text):
    """Normalise the raw text of one article.

    Steps, in order: replace blank-line breaks with ". ", replace ".. " with
    ". ", replace double spaces with a single space (each replacement is a
    single pass), strip, remove a leading "IMM-<digits>. PRIMARY. " header,
    and strip again.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text.
    """
    substitutions = (
        ("\n\n", ". "),
        (".. ", ". "),
        ("  ", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Some texts start with an "IMM-XXXXX PRIMARY" header; drop it.
    # The dots in the pattern are unescaped, so each one matches any character.
    without_header = re.sub(r"^IMM-\d+. PRIMARY. ", "", text.strip())

    return without_header.strip()

# Clean every article text; this overwrites the "text" column of imm_df in place.
imm_df["text"] = imm_df["text"].apply(preprocess_text)

In [72]:
# Save the cleaned article-level table (one row per article, before sentence splitting).
imm_df.to_json("../../data/mfc/immigration_labeled_preprocessed_unsplitted.json")

In [66]:
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's 'punkt' package; NLTK reports "already up-to-date" when it is present.
nltk.download('punkt')

# Register tqdm with pandas so that DataFrames gain progress_apply,
# which split_sentences_in_df uses to show a progress bar.
tqdm.pandas(desc="Tokenizing sentences")

def expand_row(row):
    """Turn one article row into a DataFrame with one row per sentence.

    The 'text' field is split with ``sent_tokenize``; every other field of the
    row is repeated unchanged for each sentence.

    Args:
        row: A row of the article-level table, indexable by column name and
            exposing its column names through ``row.index``.

    Returns:
        pandas.DataFrame with the columns 'article_id', 'text' and
        'document_frame' first, followed by the remaining columns of ``row``.
    """
    sentences = sent_tokenize(row['text'])
    n_sentences = len(sentences)
    leading = ('article_id', 'text', 'document_frame')

    # The three leading columns come first so the output column order is fixed.
    columns = {
        'article_id': [row['article_id']] * n_sentences,
        'text': sentences,
        'document_frame': [row['document_frame']] * n_sentences,
    }
    # Every remaining column is copied onto each sentence row.
    for col in row.index:
        if col not in leading:
            columns[col] = [row[col]] * n_sentences

    return pd.DataFrame(columns)

def split_sentences_in_df(df):
    """Expand an article-level table into a sentence-level table.

    ``expand_row`` is applied to every row through ``progress_apply`` (which
    requires ``tqdm.pandas()`` to have been called), and the per-article
    DataFrames are concatenated with a fresh integer index.

    Args:
        df: Article-level DataFrame accepted by ``expand_row``.

    Returns:
        A single pandas.DataFrame holding all sentence rows.
    """
    per_article_frames = df.progress_apply(expand_row, axis=1)
    return pd.concat(per_article_frames.tolist(), ignore_index=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
# Expand the cleaned article-level table into one row per sentence.
df = split_sentences_in_df(imm_df)

Tokenizing sentences:   0%|          | 0/5932 [00:00<?, ?it/s]

In [69]:
# save the sentence-level table to ../../data/mfc/immigration_labeled_preprocessed.json
df.to_json("../../data/mfc/immigration_labeled_preprocessed.json")