In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import io
import re
import glob
import dask.dataframe as dd
import nltk

# access dataset from xml file

Read the XML files provided by SemEval (articles plus their ground-truth labels) and load them into pandas DataFrames.

In [2]:
# One empty frame per dataset split; each is filled by etparse() further down.
test_df, train_df, byarticle_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [3]:
# Each split is an (articles file, ground-truth file) pair.

# by-publisher validation split
test_path, test_path_truth = (
    'data/articles-validation-bypublisher-20181122.xml',
    'data/ground-truth-validation-bypublisher-20181122.xml',
)

# by-publisher training split
train_path, train_path_truth = (
    'data/articles-training-bypublisher-20181122.xml',
    'data/ground-truth-training-bypublisher-20181122.xml',
)

# by-article training split
byarticle_path, byarticle_truth = (
    'data/articles-training-byarticle-20181122.xml',
    'data/ground-truth-training-byarticle-20181122.xml',
)

In [4]:
def etparse(df, path_articles, path_truth, bias_val = True):
    """Load articles and their ground-truth labels from two XML files into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place and also returned.
    path_articles : str
        XML file whose root has one child element per article. The ``title``
        attribute and the concatenated inner text of each child are read.
    path_truth : str
        XML file whose root has one child element per article carrying a
        ``hyperpartisan`` attribute (and a ``bias`` attribute when
        ``bias_val`` is true).
    bias_val : bool, default True
        Whether to also read the ``bias`` attribute into a ``bias`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with columns ``title``, ``text``, ``label`` and, when
        ``bias_val`` is true, ``bias``.

    Raises
    ------
    AssertionError
        If the two files do not contain the same number of entries.
    KeyError
        If a ground-truth entry lacks ``hyperpartisan`` (or ``bias`` when
        it is requested).
    """
    articles = list(ET.parse(path_articles).getroot())
    truths = list(ET.parse(path_truth).getroot())

    # Pair rows by their `id` attribute when every entry in both files has a
    # unique id and the two id sets agree; this protects against the two files
    # listing the same articles in a different order. In every other case
    # (ids missing, duplicated or mismatching) fall back to pairing by position.
    article_ids = [node.attrib.get('id') for node in articles]
    truth_by_id = {node.attrib.get('id'): node for node in truths}
    ids_usable = (
        None not in truth_by_id
        and len(truth_by_id) == len(truths)
        and len(set(article_ids)) == len(articles)
        and set(article_ids) == set(truth_by_id)
    )
    if ids_usable:
        truths = [truth_by_id[article_id] for article_id in article_ids]

    # Build every column before touching df so a failure leaves df unchanged.
    # An article without a title attribute gets an empty title instead of
    # aborting the whole load with a KeyError.
    titles = [node.attrib.get('title', '') for node in articles]
    texts = [''.join(node.itertext()) for node in articles]
    label = [node.attrib['hyperpartisan'] for node in truths]
    bias = [node.attrib['bias'] for node in truths] if bias_val else []

    assert len(texts) == len(titles) == len(label), (
        "article file and ground-truth file contain a different number of entries"
    )

    df['title'] = titles
    df['text'] = texts
    df['label'] = label
    if bias_val:
        df['bias'] = bias

    return df

In [5]:
# Fill the validation frame from the by-publisher validation files.
test_df = etparse(test_df, path_articles=test_path, path_truth=test_path_truth)

In [5]:
# Fill the training frame from the by-publisher training files.
train_df = etparse(train_df, path_articles=train_path, path_truth=train_path_truth)

In [23]:
# Fill the by-article frame; the bias attribute is not read for this split.
byarticle_df = etparse(
    byarticle_df,
    path_articles=byarticle_path,
    path_truth=byarticle_truth,
    bias_val=False,
)

# cleaning

Initial cleaning: remove special characters, certain recurring error patterns, and leftover HTML.

In [6]:
def clean_text(text):
    """Strip recurring extraction artefacts from one article text.

    The steps run in a fixed order: literal substring replacements first,
    then regex repairs, then removal of any leftover escaped angle brackets.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text. Line breaks are removed, so the result is one line.
    """
    # (old, new) pairs applied in this exact order; later steps depend on it.
    literal_replacements = (
        ('&#160;', ' '),
        ("\n", ''),
        ("\r", ''),
        ("  ", ' '),   # single pass: runs of 3+ spaces are only shortened
        (" ? ", " "),
        ('.......... ', ''),
        ('&amp;', ''),
        ('[…] ', ''),
        ('[...] ', ''),
        (' |', ''),
        ('Continue Reading Below ', ''),
    )
    for old, new in literal_replacements:
        text = text.replace(old, new)

    # A '?' (or '/') squeezed between two word characters becomes an
    # apostrophe: Here?s -> Here's, group?s -> group's.
    # NOTE(review): the class [/?] also matches '/', so "and/or" turns into
    # "and'or" -- confirm whether only '?' was meant.
    text = re.sub(r"(?<=\w)[/?](?=\w)", "'", text)
    # Drop a stray "None " that is directly followed by a capital letter
    # (blah blah. None In the end) -> leftovers from the XML file.
    text = re.sub(r"\bNone\b[ ](?=[A-Z])", "", text)
    # Unwrap a phrase enclosed in a pair of '?' (or '/'): ?gun control? -> gun control
    text = re.sub(r"[/?][^ ][^/?]*[^ ][/?]", lambda x: x.group()[1:-1], text)
    # Remove escaped img/html tags one tag at a time. The quantifier must be
    # non-greedy: a greedy ".*" would delete everything from the first "&lt;"
    # to the last "&gt;", wiping out the article text between two tags.
    text = re.sub(r"&lt;.*?&gt;", "", text)
    # Replace every Twitter user name (1-15 word characters after '@') with @TWITTER.
    text = re.sub(r'(^|[^@\w])@(\w{1,15})\b', '\\1@TWITTER', text)

    # Any escaped bracket that was not part of a complete tag is dropped.
    text = text.replace('&gt;', '')
    text = text.replace('&lt;', '')

    return text

In [7]:
# First cleaning pass over every validation article.
test_df['text'] = test_df['text'].map(clean_text)

In [7]:
# First cleaning pass over every training article.
train_df['text'] = train_df['text'].map(clean_text)

In [26]:
# First cleaning pass over every by-article text.
byarticle_df['text'] = byarticle_df['text'].map(clean_text)

# further cleaning - remove certain symbols/names

More complicated cleaning: phrases discovered in texts that could be publisher-specific, more special characters.

In [8]:
from nltk.tokenize import sent_tokenize

In [9]:
def further_clean(text):
    """Drop boilerplate sentences from an article and strip dateline prefixes.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    then either kept (possibly trimmed) or discarded, and the kept sentences
    are joined with single spaces.

    A sentence is discarded when it
      * contains one of the boilerplate markers in ``remove`` (case-sensitive),
      * starts with '©',
      * is a short 'photo by' / 'read more' / 'featured image' line,
      * is fully wrapped in [...] and is short or contains no '.',
      * is fully wrapped in (...) and contains no '.', '!' or '?',
      * starts with, or contains, an e-mail-like token (the containment rule
        is skipped when the sentence contains '@TWITTER').

    Parameters
    ----------
    text : str
        Article text.

    Returns
    -------
    str
        The remaining sentences joined by a single space.
    """
    sents = sent_tokenize(text)

    # NOTE(review): these markers are matched case-sensitively against the raw
    # sentence, so e.g. "Click here" is not caught -- confirm that is intended.
    remove = [' ©', '© ', 'Opens a New Window', 'click here', 'follow us', 'https:', '.html', 'read full article',
             'a href', 'originally published', 'image source: ']

    email = re.compile(r"[^@ ]+@[^@]+")
    dateline_mark = ') —'

    new_text = []

    for item in sents:
        if not item:
            # Nothing to inspect; also avoids indexing into an empty string.
            continue

        lowered = item.lower()
        add = not any(term in item for term in remove)

        if '©' == item[0]:
            add = False
        if 'photo by' in lowered and len(item) < 50:
            add = False
        if 'read more' in lowered and len(item) < 50:
            add = False
        if 'featured image' in lowered and len(item) < 100:
            add = False

        if item[0] == '[' and item[-1] == ']':
            if len(item) < 50 or '.' not in item:
                add = False
            if add:
                # A long bracketed sentence is kept without its brackets; the
                # checks below then see the unwrapped sentence.
                item = item[1:-1]

        if item[0] == '(' and item[-1] == ')' and '.' not in item and '!' not in item and '?' not in item:
            add = False
        if email.match(item):
            add = False
        if email.search(item) and '@TWITTER' not in item:
            add = False

        # Strip a dateline such as "WASHINGTON (AP) — " from the sentence start.
        # Cut right after the detected ") — " marker itself; searching for the
        # first '—' instead would cut too early when another dash precedes it.
        mark_at = item[:50].find(dateline_mark)
        if mark_at != -1:
            item = item[mark_at + len(dateline_mark) + 1:]

        if add:
            new_text.append(item)

    return ' '.join(new_text)

In [10]:
# Second, sentence-level cleaning pass over the validation articles.
test_df['text'] = test_df['text'].map(further_clean)

In [10]:
# Second, sentence-level cleaning pass over the training articles.
train_df['text'] = train_df['text'].map(further_clean)

In [31]:
# Second, sentence-level cleaning pass over the by-article texts.
byarticle_df['text'] = byarticle_df['text'].map(further_clean)

# Handle NaN

Some articles came without titles or even texts (presumably by mistake). Any text value that is not a string is replaced with the empty string rather than left as NaN, to prevent problems in later processing.

In [11]:
def check_str(text):
    """Return ``text`` if it is a string; otherwise print it and return ''.

    ``isinstance`` is used instead of an exact type comparison so that str
    subclasses (for example ``numpy.str_``) are kept as valid text instead of
    being blanked.

    Parameters
    ----------
    text : object
        A cell value from a text column.

    Returns
    -------
    str
        ``text`` unchanged when it is a string, else the empty string.
    """
    if isinstance(text, str):
        return text
    # Show the offending value so unexpected non-string cells are visible.
    print(text)
    return ''

In [12]:
# Replace any non-string validation text with the empty string.
test_df['text'] = test_df['text'].map(check_str)

In [12]:
# Replace any non-string training text with the empty string.
train_df['text'] = train_df['text'].map(check_str)

In [38]:
# Replace any non-string by-article text with the empty string.
byarticle_df['text'] = byarticle_df['text'].map(check_str)

# save

Save the pandas DataFrames as CSV files for later use.

In [13]:
# Persist the cleaned validation split; the row index is not written.
test_df.to_csv(path_or_buf='data/test_df.csv', index=False)

In [13]:
# Persist the cleaned training split; the row index is not written.
train_df.to_csv(path_or_buf='data/train_df.csv', index=False)

In [40]:
# Persist the cleaned by-article split; the row index is not written.
byarticle_df.to_csv(path_or_buf='data/byarticle_df.csv', index=False)

In [14]:
# Preview only the first rows rather than rendering the whole training frame.
train_df.head(10)

Unnamed: 0,title,text,label,bias
0,After DeVos Announced Plans To Reexamine Title...,When explaining her decision to reevaluate Tit...,true,right
1,University To Award Trayvon Martin With Posthu...,A Florida university will honor Trayvon Martin...,true,right
2,Texas State University suspends Greek life aft...,Texas State University has suspended all Greek...,false,right-center
3,Jewish Organization's Huge Day Of Unity On Tue...,Against the backdrop of an increasingly polari...,true,right
4,"BREAKING: Trump Reaches Agreement To Keep 1,00...",President-elect Donald Trump has reached an ag...,true,right
5,Winning numbers drawn in ‘2 By 2’ game,"LINCOLN, Neb. (AP) _ The winning numbers in Tu...",false,least
6,Health insurance stocks slump in face of poten...,The Centers for Medicare and Medicaid Services...,false,least
7,ANALYSIS: Does gay debate mirror church disput...,One group of Christians confidently proclaims ...,false,left-center
8,How schools in Brazil are teaching kids to eat...,"On a hilly slope in São Paulo City, a group of...",false,left-center
9,Massive venomous snake found under New Orleans...,Wildlife removal experts discovered a large ve...,false,right-center
