# Cleaning text and export training data (Webhose / Google News)

In [2]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
import re
import json

Loading the map of contraction substitutions

In [3]:
# Load the contraction -> expansion map used by expand_contractions().
# JSON text is UTF-8; naming the encoding keeps the load independent of the
# platform's default locale encoding.
with open('contractions.json', 'r', encoding='utf-8') as contractions_file:
    contractions_dict = json.load(contractions_file)

Loading the dataset of articles selected from the Webhose and Google News scrapes

In [4]:
# Read the selected articles; the first CSV column is used as the index.
dataset = pd.read_csv(
    "../data_processed/selected_articles_webhose_google_news.csv",
    index_col=0,
)

Helper to analyse the most common words in each news source (currently disabled: the cell below is commented out)

In [5]:
# CHECK FOR COMMON WORDS IN EACH NEWS SOURCE TO SPOT ANY REPETITION TO REMOVE FROM ARTICLES
# nltk.download('stopwords')
# stopwords = nltk.corpus.stopwords.words('english')
# def filter_by_domain(dataset, domain):
#     return dataset[dataset["domain"] == domain]

# def frequence_dist_by_domain(dataset, domain):
#     corpus = filter_by_domain(dataset, domain)["content"].sum().lower()
#     tokens = tokenizer.tokenize(corpus)
#     return nltk.FreqDist(w for w in tokens if w not in stopwords)

# for domain in domains:
#     print("===============================")
#     print("DOMAIN {}".format(domain))
#     frequence = frequence_dist_by_domain(dataset, domain)
#     print(frequence.most_common(10))
#     print("===============================")

In [6]:
# Function to expand contractions
def expand_contractions(content, contractions=None):
    """Replace every contraction found in ``content`` with its expansion.

    All keys are matched in a single left-to-right pass, longest key first.
    A long contraction such as "can't've" is therefore never partially
    consumed by a shorter key ("can't"), and text produced by one
    replacement is never re-scanned by another key. Matching remains plain,
    case-sensitive substring matching (no word boundaries are added).

    Typographic apostrophes (U+2019) are folded to "'" in both the text and
    the keys before matching, so "don\u2019t" is expanded like "don't".

    Parameters
    ----------
    content : str
        Text to process.
    contractions : dict of str -> str, optional
        Mapping from contraction to expansion. Defaults to the module-level
        ``contractions_dict``.

    Returns
    -------
    str
        The text with contractions expanded. When the mapping is empty the
        input is returned unchanged.
    """
    if contractions is None:
        contractions = contractions_dict
    if not contractions:
        return content

    # Fold the typographic apostrophe so one key covers both spellings.
    content = content.replace("\u2019", "'")
    lookup = {
        original.replace("\u2019", "'"): replacement
        for original, replacement in contractions.items()
    }
    # An empty key would match at every position of the text.
    lookup.pop("", None)
    if not lookup:
        return content

    # Alternatives are tried in order, so longest-first yields the longest
    # possible contraction at each position.
    ordered_keys = sorted(lookup, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
    return pattern.sub(lambda match: lookup[match.group(0)], content)

In [7]:
# Function with the cleaning rules that are specific to each domain
def clean_by_domain(content, domain):
    """Strip boilerplate that is specific to one news domain.

    Parameters
    ----------
    content : str
        Raw article text.
    domain : str
        Source domain of the article, e.g. "reuters.com".

    Returns
    -------
    str
        The text with that domain's boilerplate removed; text from any
        other domain is returned unchanged.
    """
    if domain == "reuters.com":
        # Drop everything up to and including the first " - " on the first line.
        return re.sub(r"^(.*?) - ", "", content)

    if domain == "cnn.com":
        return content.replace("(CNN)", "")

    if domain == "foxnews.com":
        # NOTE: the final "." of the pattern is an unescaped wildcard; it is
        # kept exactly as written so the matched spans do not change.
        without_credit = re.sub(r"\n(.*?) contributed to this report.", "", content)
        return without_credit.replace("CLICK HERE TO GET THE FOX NEWS APP", "")

    if domain == "slate.com":
        return content.replace("Get The Angle in Your Inbox Slate's daily newsletter rounds up the stories you need to read. We encountered an issue signing you up. Please try again. Please enable javascript to use form. Email address: Thanks for signing up! You can manage your newsletter subscriptions at any time.", "")

    if domain == "bbc.com":
        for boilerplate in ("Image copyright Reuters Image caption", "Related Topics"):
            content = content.replace(boilerplate, "")
        return content

    return content

In [8]:
# Function to clean the text of one article entry
def clean_text(row):
    """Return the cleaned, space-joined token string for one article row.

    Steps, in order: domain-specific cleanup, general substitutions,
    lowercasing, contraction expansion, then tokenisation with the
    module-level ``tokenizer`` and re-joining the tokens with single spaces.

    Parameters
    ----------
    row : mapping
        Must provide the "content" and "domain" entries.

    Returns
    -------
    str
        The tokens of the cleaned text joined by single spaces.
    """
    # Specific cleaning by domain
    text = clean_by_domain(row["content"], row["domain"])

    # General rules, applied in this order: "U.S." is handled before the
    # bare "U.S" so the trailing period is consumed when present.
    general_rules = (
        ("U.S.", "United States"),
        ("U.S", "United States"),
        ("Associated Press", ""),
    )
    for needle, substitute in general_rules:
        text = text.replace(needle, substitute)

    # Lowercase first, then expand contractions on the lowercased text
    expanded = expand_contractions(text.lower())

    tokens = tokenizer.tokenize(expanded)
    return ' '.join(tokens)

### Clean Text

In [9]:
# Build a new frame whose "content" column holds the cleaned text of each row;
# `dataset` itself is not modified.
dataset_cleaned = dataset.assign(content=dataset.apply(clean_text, axis=1))

### Dataset with labels (removing articles without a bias label)

In [10]:
# Keep only the rows that have a label, restricted to the text and label columns.
dataset_for_label = dataset_cleaned.loc[
    dataset_cleaned["label"].notna(),
    ["content", "label"],
]

In [15]:
# Display the (rows, columns) shape of the labelled subset.
dataset_for_label.shape

(5331, 2)

In [11]:
# Cast the label column to int by building a new frame rather than assigning
# a column into `dataset_for_label`: that frame was produced by filtering
# `dataset_cleaned`, and column assignment on it can raise SettingWithCopyWarning.
dataset_for_label = dataset_for_label.astype({"label": int})

In [12]:
# Export the labelled dataset (content + label) for training.
dataset_for_label.to_csv("../data_processed/dataset_with_bias_label.csv")

### Dataset for bias level

In [13]:
# All cleaned articles, restricted to the text and bias-level columns.
dataset_for_level = dataset_cleaned.loc[:, ["content", "level"]]

In [16]:
# Display the (rows, columns) shape of the bias-level dataset.
dataset_for_level.shape

(6343, 2)

In [14]:
# Export the bias-level dataset (content + level) for training.
dataset_for_level.to_csv("../data_processed/dataset_with_bias_level.csv")