In [86]:
import pandas as pd
import numpy as np
import csv

In [49]:
# Article records; later cells read its "Body", "Media Name" and "Article ID" columns.
articledf = pd.read_json(path_or_buf="../novetta_files/GNI88-json.json")

# Labelled quote rows; later cells read "Article ID", "Media Name", "Headline" and "Source Name".
quotedf = pd.read_csv(filepath_or_buffer="../novetta_files/quote_data/GNI88.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


### Clean full texts

##### Regex patterns to remove

In [50]:
# Every pattern is a raw string so that regex escapes (\n, \r, \d) are handed to
# the regex engine verbatim instead of being parsed as Python string escapes
# ("\d" in a normal string literal is an invalid escape and triggers a warning).

#Patterns at start of article
# "Media:" line, "Author:" line plus up to 10 lines in total, "Date:" line, then a blank line
allarticle_header_regex = r"Media: .*\nAuthor: (?:.*\n){1,10}Date: .*\n\n"
# Same header shape, but with a "Byline:" field and CRLF line endings
allarticle_header_regex_byline = r"Media: .*\r\nByline: (?:.*\r\n){1,10}Date: .*\r\n\r\n"
# One line of text, 1-20 newlines, then a "Follow Us" line
politico_share_regex = r'.*\n{1,20}Follow Us\n'
# First line, a "By..." line, then a "dd/dd/dddd dd:dd AM|PM EDT" timestamp.
# "^" anchors to the very start of the text unless re.MULTILINE is supplied.
politico_date_regex = r'^.*\nBy.*\n\d\d/\d\d/\d\d\d\d \d\d:\d\d (?:AM|PM) EDT'

#Patterns at end of article
dow_regex = r'License this article from Dow Jones Reprint Service'

#Patterns in article
#search for line with only all caps and punctuation
# (the match includes the newline on each side of that line)
fox_bold_regex = r"\n[A-Z ',.-]+\n"

##### Find regex patterns and remove them

In [54]:
# Patterns that are stripped from every article regardless of outlet; they are
# OR-ed into one alternation so the Body column is scanned only once.
remove_patterns = [allarticle_header_regex, allarticle_header_regex_byline, dow_regex, politico_date_regex]
combined_pattern = "|".join(remove_patterns)
articledf['Body'] = articledf['Body'].replace(to_replace=combined_pattern, value="", regex=True)

# Outlet-specific clean-up, applied in this order:
#   Fox News -> remove ALL CAPS lines
#   Politico -> remove irrelevant lines at start of "playbook" articles
# The substitution is computed for every row, but np.where keeps the result
# only for rows of the matching outlet and leaves all other bodies untouched.
for outlet_name, outlet_pattern in (('Fox News', fox_bold_regex), ('Politico', politico_share_regex)):
    scrubbed_body = articledf['Body'].replace(to_replace=outlet_pattern, value="", regex=True)
    articledf['Body'] = np.where(articledf['Media Name'] == outlet_name, scrubbed_body, articledf['Body'])

### Map full text to labelled data

In [81]:
# Look-up table from article id to its (cleaned) Body text
id_article_dict = articledf.set_index("Article ID")["Body"].to_dict()

# Add a fulltext column to the labelled df by looking up each row's Article ID;
# ids that are not keys of the look-up table come back as NaN
quotedf['fulltext'] = quotedf['Article ID'].map(id_article_dict)

### Remove irrelevant articles

In [82]:
#Remove all Spanish articles
spanish_outlets = ['El Nuevo Herald', 'El Diario La Prensa', 'Univision', 'AP Spanish Worldstream']
is_spanish = quotedf["Media Name"].isin(spanish_outlets)

#Filter out Bloomberg show transcripts
# regex=False: 'Full Show' is a literal substring, not a pattern.
# na=False: rows with a missing Headline are treated as "not a show transcript"
# explicitly, instead of relying on how NaN is coerced inside the & operation.
is_bloomberg_show = (quotedf["Media Name"] == 'Bloomberg') & quotedf["Headline"].str.contains(
    'Full Show', regex=False, na=False
)

# "~" is the boolean-mask negation operator (unary "-" is arithmetic negation).
# .copy() makes the filtered frame independent of the original so that adding
# columns to it afterwards does not raise SettingWithCopyWarning.
quotedf = quotedf.loc[~(is_spanish | is_bloomberg_show)].copy()

### Clean Source Name column

In [83]:
# Build the cleaned name in one chain; missing Source Name values stay missing.
quotedf["source_name_cleaned"] = (
    quotedf["Source Name"]
    # Drop "OLD" labels from name strings: keep only the text before the first " OLD".
    # n=1 + .str[0] avoids building a throwaway expanded DataFrame and also
    # works when the frame has no rows (expand=True would have no column 0).
    .str.split(" OLD", n=1).str[0]
    # Drop anything in parentheses: keep only the text before the first " (...)"
    .str.split(r" \(.*\)", n=1).str[0]
    # Strip trailing commas and whitespace in any interleaving (e.g. "Name , ,"),
    # which a fixed rstrip/strip/rstrip sequence cannot fully clean up
    .str.replace(r"[\s,]+$", "", regex=True)
    # Strip leading whitespace
    .str.lstrip()
)

### Export to csv

In [87]:
# Write the cleaned quote table to the working directory: the DataFrame index
# is omitted and every non-numeric field is wrapped in quotes.
quotedf.to_csv(
    path_or_buf="GNI88_cleaned_data.csv",
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)