# Prepare datasets

Read the dataset, split the articles into sentences, clean the sentences and save the cleaned sentences into a new file.

## Media Frames Corpus (MFC) dataset

### Labeled

In [1]:
import pandas as pd

In [2]:
immigration_path = "../../data/mfc/immigration_labeled.json"
deathpenalty_path = "../../data/mfc/deathpenalty_labeled.json"
guncontrol_path = "../../data/mfc/guncontrol_labeled.json"
samesex_path = "../../data/mfc/samesex_labeled.json"
tobacco_path = "../../data/mfc/tobacco_labeled.json"
frames_code = "../../data/mfc/codes.json"

In [3]:
# read unlabeled data
import json

# read labeled data
with open(immigration_path) as f:
    immigration = json.load(f)

with open(deathpenalty_path) as f:
    deathpenalty = json.load(f)

with open(guncontrol_path) as f:
    guncontrol = json.load(f)

with open(samesex_path) as f:
    samesex = json.load(f)

with open(tobacco_path) as f:
    tobacco = json.load(f)

# read frames
with open(frames_code) as f:
    codes = json.load(f)

In [4]:
def get_frame_name(code, remove_suffix=False):
    code_str = str(code)
    
    # Special handling for code 6.2 due to a typo in the codes dictionary
    if code_str == "6.2":
        if remove_suffix:
            return "Policy Prescription and Evaluation"
        else:
            return "Policy Presecription and Evaluation primary"
    
    # For codes without a dot
    if "." not in code_str:
        if remove_suffix:
            return codes.get(code_str, "").replace(" primary", "").replace(" primany", "")
        else:
            return codes.get(code_str + ".2", "")
    
    # For codes with a dot
    if remove_suffix:
        return codes.get(code_str, "").replace(" primary", "").replace(" primany", "")
    else:
        return codes.get(code_str, "")

def get_tone_name(code):
    if code == "17":
        return "Pro"
    elif code == "18":
        return "Neutral"
    elif code == "19":
        return "Anti"
    else:
        return codes[code]

In [5]:
# generate statistics count
print("immigration data count: ", len(immigration))
print("deathpenalty data count: ", len(deathpenalty))
print("guncontrol data count: ", len(guncontrol))
print("samesex data count: ", len(samesex))
print("tobacco data count: ", len(tobacco))

immigration data count:  6757
deathpenalty data count:  6398
guncontrol data count:  6689
samesex data count:  10583
tobacco data count:  5274


In [6]:
def process_data(json_file):
    # Load JSON data from a file
    with open(json_file, "r") as file:
        data = json.load(file)

    # Define the frame names as columns
    frame_columns = [
        "Capacity and Resources",
        "Crime and Punishment",
        "Cultural Identity",
        "Economic",
        "External Regulation and Reputation",
        "Fairness and Equality",
        "Health and Safety",
        "Legality, Constitutionality, Jurisdiction",
        "Morality",
        "Other",
        "Policy Prescription and Evaluation",
        "Political",
        "Public Sentiment",
        "Quality of Life",
        "Security and Defense",
    ]

    # Initialize an empty list to collect DataFrame rows
    rows = []
    no_frames_counter = 0
    irrelevant_counter = 0

    # Populate the DataFrame
    for key, value in data.items():

        if key == "Immigration1.0-36986":
            print(get_frame_name(
                value["primary_frame"], remove_suffix=True
            ))
        # if there is no primary frame or irrelevant = 1 then skip
        if value["primary_frame"] == None or value["irrelevant"] == 1:
            if [value["irrelevant"] == 1]:
                irrelevant_counter += 1
            if value["primary_frame"] == None:
                no_frames_counter += 1

            continue


        row = {
            "article_id": key,
            "text": value["text"],
            "document_frame": get_frame_name(
                value["primary_frame"], remove_suffix=True
            ),
        }

        # Initialize all frame columns to 0
        for frame in frame_columns:
            row[frame] = 0

        # Set the specific frame to 1 based on the primary_frame
        frame_name = get_frame_name(value["primary_frame"], remove_suffix=True)
        row[frame_name] = 1
        rows.append(row)

    print("No frames: ", no_frames_counter)
    print("Irrelevant: ", irrelevant_counter)
    print("Total correct: ", len(rows))
    print("Total: ", len(data))

    # Create DataFrame from list of rows
    df = pd.DataFrame(
        rows, columns=["article_id", "text", "document_frame"] + frame_columns
    )
    return df

In [7]:
imm_df = process_data(immigration_path)

Policy Prescription and Evaluation
No frames:  824
Irrelevant:  825
Total correct:  5932
Total:  6757


In [8]:
import re

def preprocess_text(text):
    text = text.replace("\n\n", ". ")
    text = text.replace(".. ", ". ")
    #text = text.replace("\t", " ")
    text = text.replace("  ", " ")
    # replace '' with "
    # text = text.replace("''", '"')
    # replace \" with "
    # text = text.replace('\\"', '"')
    text = text.strip()

    # some texts start with "IMM-XXXXX PRIMARY" remove
    text = re.sub(r"^IMM-\d+. PRIMARY. ", "", text)

    # remove leading and trailing whitespaces
    text = text.strip()

    return text

imm_df["text"] = imm_df["text"].apply(preprocess_text)

In [9]:
imm_df.to_json("../../data/mfc/immigration_labeled_preprocessed_unsplitted.json")

In [10]:
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize

# Ensure NLTK's punkt tokenizer is downloaded
nltk.download('punkt')

# Prepare tqdm for pandas
tqdm.pandas(desc="Tokenizing sentences")

def expand_row(row):
    # This function takes a row, tokenizes the 'text' field, and returns a DataFrame of the new rows
    sentences = sent_tokenize(row['text'])
    # Create a new DataFrame for each sentence with the same data as the original row
    return pd.DataFrame({
        'article_id': [row['article_id']] * len(sentences),
        'text': sentences,
        'document_frame': [row['document_frame']] * len(sentences),
        **{col: [row[col]] * len(sentences) for col in row.index if col not in ['article_id', 'text', 'document_frame']}
    })

def split_sentences_in_df(df):
    # Apply the expand_row function to each row and collect all resulting DataFrames into a list
    list_of_dataframes = df.progress_apply(expand_row, axis=1)
    # Concatenate all DataFrames from the list into a single DataFrame
    new_df = pd.concat(list_of_dataframes.tolist(), ignore_index=True)
    return new_df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
df = split_sentences_in_df(imm_df)

Tokenizing sentences:   0%|          | 0/5932 [00:00<?, ?it/s]

In [12]:
# save df to ../data/mfc/immigration_labeled_preprocessed.json
df.to_json("../../data/mfc/immigration_labeled_preprocessed.json")

#### Create Train Test Split

In [13]:
# create train and test dataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df["text"].tolist(), test_size=0.2, random_state=42)

In [14]:
from pathlib import Path

def save_articles_to_file(articles, file_path):
    try:
        # Ensure the directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)
        
        with open(file_path, "w") as file:
            for article in articles:
                # Ensure article is only one line
                cleaned_article = article.replace("\n", " ")
                file.write(cleaned_article + "\n")
    except Exception as e:
        print(f"An error occurred while writing to {file_path}: {e}")

# Paths to the output files
train_file_path = Path("../../data/mfc/train_data.txt")
test_file_path = Path("../../data/mfc/test_data.txt")

In [15]:
# Save articles to files
save_articles_to_file(train_df, train_file_path)
save_articles_to_file(test_df, test_file_path)

### Unlabeled

In [16]:
unlabeled_immigration_path = "../../data/mfc/immigration_unlabeled.json"

In [17]:
def process_unlabeled_data(json_file):
    # Load JSON data from a file
    with open(json_file, "r") as file:
        data = json.load(file)

    # Initialize an empty list to collect DataFrame rows
    rows = []

    # Populate the DataFrame
    for item in data:
        key = item["text"].split("\n\n")[0]
        text = item["text"]

        row = {
            "article_id": key,
            "text": text,
        }
        rows.append(row)

    print("Total correct: ", len(rows))
    print("Total: ", len(data))

    # Create DataFrame from list of rows
    df = pd.DataFrame(
        rows, columns=["article_id", "text"]
    )
    return df

In [18]:
imm_unlabeled_df = process_unlabeled_data(unlabeled_immigration_path)

Total correct:  41966
Total:  41966


In [19]:
imm_unlabeled_df["text"] = imm_unlabeled_df["text"].apply(preprocess_text)

In [20]:
imm_unlabeled_df.to_json("../../data/mfc/immigration_unlabeled_preprocessed_unsplitted.json")

In [21]:
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize

# Ensure NLTK's punkt tokenizer is downloaded
nltk.download('punkt')

# Prepare tqdm for pandas
tqdm.pandas(desc="Tokenizing sentences")

def expand_row(row):
    # This function takes a row, tokenizes the 'text' field, and returns a DataFrame of the new rows
    sentences = sent_tokenize(row['text'])
    # Create a new DataFrame for each sentence with the same data as the original row
    return pd.DataFrame({
        'article_id': [row['article_id']] * len(sentences),
        'text': sentences
    })

def split_sentences_in_df_unlabeled(df):
    # Apply the expand_row function to each row and collect all resulting DataFrames into a list
    list_of_dataframes = df.progress_apply(expand_row, axis=1)
    # Concatenate all DataFrames from the list into a single DataFrame
    new_df = pd.concat(list_of_dataframes.tolist(), ignore_index=True)
    return new_df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
df = split_sentences_in_df_unlabeled(imm_unlabeled_df)

Tokenizing sentences:   0%|          | 0/41966 [00:00<?, ?it/s]

In [23]:
# save df to ../data/mfc/immigration_labeled_preprocessed.json
df.to_json("../../data/mfc/immigration_unlabeled_preprocessed.json")

In [40]:
# Calculate the number of rows per batch
num_batches = 5
batch_size = len(df) // num_batches

# Save each batch to a separate JSON file
for i in range(num_batches):
    start_idx = i * batch_size
    if i == num_batches - 1:  # Handle the last batch, which may have more rows due to integer division
        end_idx = len(df)
    else:
        end_idx = (i + 1) * batch_size
    
    batch_df = df.iloc[start_idx:end_idx]
    batch_df.to_json(f"../../data/mfc/immigration_unlabeled_preprocessed_batch_{i+1}.json")

print("Completed saving all batches.")

Completed saving all batches.


## SemEval 2023 Dataset

In [24]:
import pandas as pd
import os

In [25]:
# File paths
labels_file = '../../data/semeval/data/en/train-labels-subtask-2.txt'
articles_dir = '../../data/semeval/data/en/train-articles-subtask-2'
type_file = '../../data/semeval/data/en/train-labels-subtask-1.txt'

In [26]:
def parse_labels(labels_file):
    """
    Parse the labels file to create a dictionary mapping article id to frames.
    """
    labels_dict = {}
    with open(labels_file, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            article_id = parts[0]
            frames = parts[1].split(',')
            labels_dict[article_id] = frames
    return labels_dict

def parse_types(types_file):
    """
    Parse the types file to create a dictionary mapping article id to type.
    """
    types_dict = {}
    with open(types_file, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            article_id = parts[0]
            article_type = parts[1]
            types_dict[article_id] = article_type
    return types_dict

def parse_articles(articles_dir, labels_dict, types_dict):
    """
    Parse the articles directory and create a list of dictionaries with article id, text, frames, and type.
    """
    data = []
    for article_file in os.listdir(articles_dir):
        if article_file.startswith('article') and article_file.endswith('.txt'):
            article_id = article_file[len('article'):-len('.txt')]
            if article_id in labels_dict and article_id in types_dict:
                with open(os.path.join(articles_dir, article_file), 'r') as file:
                    article_text = file.read().strip()
                    frames = labels_dict[article_id]
                    article_type = types_dict[article_id]
                    data.append({
                        'article_id': article_id,
                        'text': article_text,
                        'frames': frames,
                        'type': article_type
                    })
    return data

def create_df(labels_file, types_file, articles_dir):
    """
    Create a dataset from the labels file and articles directory.
    """
    labels_dict = parse_labels(labels_file)
    types_dict = parse_types(types_file)
    data = parse_articles(articles_dir, labels_dict, types_dict)
    return pd.DataFrame(data)

In [27]:
# Create the dataset
df = create_df(labels_file, type_file, articles_dir)

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

def apply_one_hot_encoding(df):
    """
    Apply one-hot encoding to the frames column.
    """
    mlb = MultiLabelBinarizer()
    frames_encoded = pd.DataFrame(mlb.fit_transform(df['frames']), columns=mlb.classes_, index=df.index)
    df = df.join(frames_encoded)
    df.drop(columns=['frames'], inplace=True)
    return df

In [29]:
df = apply_one_hot_encoding(df)

In [30]:
from nltk.tokenize import sent_tokenize

def split_sentences(df):
    """
    Split the article_text into sentences and expand into multiple rows.
    """
    expanded_data = []
    for _, row in df.iterrows():
        # Replace \n\n with a period and a space to mark sentence boundaries
        text = row['text'].replace('\n\n', '. ')
        sentences = sent_tokenize(text)
        for sentence in sentences:
            expanded_data.append({
                'article_id': row['article_id'],
                'text': sentence.strip(),
                'type': row['type'],
                **{col: row[col] for col in df.columns if col not in ['text', 'type', 'article_id']}
            })
    return pd.DataFrame(expanded_data)

In [31]:
df = split_sentences(df)

In [32]:
df.iloc[0]

article_id                                                                              111111111
text                                            Next plague outbreak in Madagascar could be 's...
type                                                                                      opinion
Capacity_and_resources                                                                          0
Crime_and_punishment                                                                            0
Cultural_identity                                                                               0
Economic                                                                                        0
External_regulation_and_reputation                                                              0
Fairness_and_equality                                                                           0
Health_and_safety                                                                               1
Legality_Constitutio

In [33]:
df.iloc[0]["text"]

"Next plague outbreak in Madagascar could be 'stronger': WHO."

In [34]:
df.to_json("../../data/semeval/muse-dlf/semeval_train.json")

In [35]:
# create train and test dataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df["text"].tolist(), test_size=0.2, random_state=42)

In [36]:
from pathlib import Path

def save_articles_to_file(articles, file_path):
    try:
        # Ensure the directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)
        
        with open(file_path, "w") as file:
            for article in articles:
                # Ensure article is only one line
                cleaned_article = article.replace("\n", " ")
                file.write(cleaned_article + "\n")
    except Exception as e:
        print(f"An error occurred while writing to {file_path}: {e}")

# Paths to the output files
train_file_path = Path("../../data/semeval/muse-dlf/train_data.txt")
test_file_path = Path("../../data/semeval/muse-dlf/test_data.txt")

In [37]:
# Save articles to files
save_articles_to_file(train_df, train_file_path)
save_articles_to_file(test_df, test_file_path)

### Unlabeled

In [38]:
import os
import pandas as pd
from nltk.tokenize import sent_tokenize

# Define the file paths
articles_dir = '../../data/semeval/data/en/test-articles-subtask-2'

def parse_unlabeled_articles(articles_dir):
    """
    Parse the articles directory and create a list of dictionaries with article id and text.
    """
    data = []
    for article_file in os.listdir(articles_dir):
        if article_file.startswith('article') and article_file.endswith('.txt'):
            article_id = article_file[len('article'):-len('.txt')]
            with open(os.path.join(articles_dir, article_file), 'r') as file:
                article_text = file.read().strip()
                data.append({
                    'article_id': article_id,
                    'text': article_text
                })
    return data

def create_unlabeled_df(articles_dir):
    """
    Create a dataset from the articles directory.
    """
    data = parse_unlabeled_articles(articles_dir)
    return pd.DataFrame(data)

# Create the dataset
unlabeled_df = create_unlabeled_df(articles_dir)

def split_sentences(df):
    """
    Split the article text into sentences and expand into multiple rows.
    """
    expanded_data = []
    for _, row in df.iterrows():
        # Replace \n\n with a period and a space to mark sentence boundaries
        text = row['text'].replace('\n\n', '. ')
        sentences = sent_tokenize(text)
        for sentence in sentences:
            expanded_data.append({
                'article_id': row['article_id'],
                'text': sentence.strip()
            })
    return pd.DataFrame(expanded_data)

# Split sentences in the dataset
unlabeled_df = split_sentences(unlabeled_df)

# Save the DataFrame to a JSON file
unlabeled_df.to_json("../../data/semeval/muse-dlf/semeval_unlabeled_train.json")

# Print the first row of the DataFrame
print(unlabeled_df.iloc[0])
print(unlabeled_df.iloc[0]["text"])


article_id                                                  311
text          Journalist names obstacle to peace between Ukr...
Name: 0, dtype: object
Journalist names obstacle to peace between Ukraine and Russia
The Ukrainian leader is “dangerous” as he’s ready to sacrifice his people to stay in power, Angelo D’Orsi claims .


In [39]:
len(unlabeled_df.groupby("article_id").count())

54