In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Mount Google Drive in the Colab runtime; the spreadsheets loaded below are read from this mount point
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Load the two hand-marked article spreadsheets (Oct-Nov and Dec-Jan) and stack them into one frame.
# DataFrame.append was removed in pandas 2.0, so the sheets are combined with pd.concat instead.
# ignore_index=True gives the combined frame a unique 0..n-1 index rather than repeating each
# sheet's own row labels.
df = pd.concat(
    [
        pd.read_excel("/content/drive/MyDrive/ReThink Media Project/Marked full text articles_oct_nov.xlsx"),
        pd.read_excel("/content/drive/MyDrive/ReThink Media Project/Marked full text articles_dec_jan.xlsx"),
    ],
    ignore_index=True,
)

#### Remove irrelevant articles

In [None]:
#Remove 1396 hand-marked irrelevant articles (including all Spanish articles)
# A row is kept only when its "Irrelevant" marker column is empty.
df = df[df.Irrelevant.isna()]

#Filter out Bloomberg show transcripts
# `~` is the boolean NOT for a pandas mask (unary minus is arithmetic negation).
# na=False treats a missing subject as "not a show transcript" so the mask is purely boolean,
# and regex=False matches 'Full Show' as literal text.
# .copy() makes the result an independent frame, so later column assignments do not write to a view.
df = df[~((df.publisher == 'Bloomberg') & df.subject.str.contains('Full Show', regex=False, na=False))].copy()

#### Remove irrelevant text from articles

In [None]:
# All patterns are raw strings so that regex escapes such as \d and \n reach the regex engine
# unchanged; in a normal string literal "\d" is an invalid escape sequence and triggers a warning.

#Patterns at start of article
# "Media: ..." line, an "Author: ..." field spanning 1-10 lines, a "Date: ..." line, then a blank line
allarticle_header_regex = r"Media: .*\nAuthor: (?:.*\n){1,10}Date: .*\n\n"
# One line, 1-20 newline characters, then a "Follow Us" line.
# NOTE(review): the {1,20} quantifier applies to the newline only, not to whole lines - confirm
# this is intended (compare the (?:.*\n){1,10} form used in allarticle_header_regex).
politico_share_regex = r'.*\n{1,20}Follow Us\n'
# First line, a "By..." line, then a timestamp of the form MM/DD/YYYY HH:MM AM|PM EDT
politico_date_regex = r'^.*\nBy.*\n\d\d/\d\d/\d\d\d\d \d\d:\d\d (?:AM|PM) EDT'

#Patterns at end of article
dow_regex = 'License this article from Dow Jones Reprint Service'

#Patterns in article
#search for line with only all caps and punctuation
# The match includes the newline on both sides of the line, so both are removed with it.
fox_bold_regex = r"\n[A-Z ',.-]+\n"

In [None]:
def regex_trim(rx_list, column, df=None, replace_value=""):
    '''Replace every match of any of the given regex patterns in one column of a DataFrame.

    The patterns are joined with an OR (|) separator into a single regex, which is applied
    to ``df[column]`` with ``Series.replace(..., regex=True)``.

    Parameters
    ----------
    rx_list : iterable of str
        Regex patterns to search for. If empty, the frame is returned untouched
        (an empty joined pattern would otherwise match at every position of every string).
    column : str
        Name of the column to clean.
    df : pandas.DataFrame, optional
        Frame to clean. When omitted, the global ``df`` is looked up at call time, so the
        function always sees the current frame rather than whatever ``df`` was (or was not)
        defined when the function itself was created.
    replace_value : str, default ""
        Text substituted for each match (the default removes the match).

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; ``column`` is overwritten in place.

    Raises
    ------
    ValueError
        If ``df`` is omitted and no global ``df`` exists.
    '''
    if df is None:
        df = globals().get("df")
        if df is None:
            raise ValueError("regex_trim: no DataFrame was passed and no global `df` is defined")

    patterns = list(rx_list)
    if not patterns:
        return df

    df[column] = df[column].replace(to_replace="|".join(patterns), value=replace_value, regex=True)
    return df


In [None]:
# Patterns stripped from every article body, regardless of publisher
remove_patterns = [allarticle_header_regex, 
                   dow_regex, 
                   politico_date_regex]

# Pass the current frame explicitly so the result does not depend on which `df`
# the function's default argument happened to capture.
df = regex_trim(remove_patterns, "body", df=df)

#Remove ALL CAPS lines in Fox news articles
# The cleaned text is computed for every row; np.where keeps it only where the publisher
# is Fox News and leaves every other body as it was.
df["body"] = np.where(df.publisher=='Fox News', 
                           df.body.replace(to_replace=fox_bold_regex, value="", regex=True),
                           df.body)

#Remove irrelevant lines at start of Politico "playbook" articles
# Same approach: only rows whose publisher is Politico receive the cleaned text.
df["body"] = np.where(df.publisher=='Politico', 
                           df.body.replace(to_replace=politico_share_regex, value="", regex=True),
                           df.body)


#### Drop duplicates

In [None]:
#Drop articles that have the same headline and date
df = df.drop_duplicates(["subject","artdate"])

#Drop articles that have the same headline if they were manually marked as a Duplicate in the spreadsheet
df = df[-(((df.duplicated(subset=["subject"])==True) & (df.Duplicate == "Duplicate")))]

In [None]:
# Preview the first five cleaned rows
df.head(n=5)

Unnamed: 0,id,Irrelevant,publisher,subject,Duplicate,author,body,artdate,Month,Year,"Notes: Green under Body (Good), Yellow under Subject (Duplicate), Red under Publisher (Spanish), Red under Subject (Some type of summary article of headlines/recent news)"
0,19837759,,Fox News,Judge rejects Trump campaign lawsuit attemptin...,,Paul Best,U.S. District Judge Dana Christensen rejected ...,2020-10-01,10,2020,
1,19837762,,Fox News,Michigan mail-in voting: what to know,,Morgan Phillips,"As the coronavirus pandemic rages on, a number...",2020-10-01,10,2020,
2,19845892,,CNN,Fact check: Almost every single one of Trump's...,,"By Daniel Dale and Marshall Cohen, CNN",President Donald Trumplied about a wide variet...,2020-10-01,10,2020,
3,19845895,,Politico,Trump’s Proud Boy moment sparks Black outrage ...,,Matt Dixon,Florida Playbook\nGary Fineout and Matt Dixon'...,2020-10-01,10,2020,
4,19846567,,HuffPost,Facebook Bans Ads That Seek To ‘Delegitimize’ ...,,Sarah Ruiz-Grossman,Facebook announced a new policy on Wednesday b...,2020-10-01,10,2020,


In [None]:
# Save the cleaned articles to Drive; index=False keeps the row index out of the CSV
df.to_csv(
    '/content/drive/MyDrive/ReThink Media Project/full_text_data_cleaning_result.csv',
    index=False,
)

In [None]:
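#Code for checking the results of the regex clean-up: lists rows whose `body_test` column still contains an all-caps line (requires a `body_test` column, which the cells above do not create)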
#df[df.body_test.str.contains("\n[A-Z ',.-]+\n", regex=True)]

Unnamed: 0,id,Irrelevant,publisher,subject,Duplicate,author,body,artdate,Month,Year,"Notes: Green under Body (Good), Yellow under Subject (Duplicate), Red under Publisher (Spanish), Red under Subject (Some type of summary article of headlines/recent news)",body_test
31,19884512,,The Arizona Republic,Debate has little substance for voters; Lack o...,,By Yvonne Wingett Sanchez and Ronald J. Hansen...,Media: The Arizona Republic\nAuthor: By Yvonne...,2020-10-01,10,2020,,Media: The Arizona Republic\nAuthor: By Yvonne...
122,19978780,,Los Angeles Times,ELECTION 2020; Frustrations at the ballot box;...,,Matt Stiles,Media: Los Angeles Times\nAuthor: Matt Stiles\...,2020-10-02,10,2020,,Media: Los Angeles Times\nAuthor: Matt Stiles\...
125,19979151,,The New York Times,"Get Your Culture, Inside and Out",,,Media: The New York Times\nAuthor: \nDate: 02 ...,2020-10-02,10,2020,,Media: The New York Times\nAuthor: \nDate: 02 ...
147,19987524,,Chicago Tribune,A graduated-rate income tax will be better for...,,Eric Zorn,Media: Chicago Tribune\nAuthor: Eric Zorn\nDat...,2020-10-02,10,2020,,Media: Chicago Tribune\nAuthor: Eric Zorn\nDat...
153,19989462,,The Hill,"The Hill's Morning Report - Trump, first lady ...",,Alexis Simendinger,Media: thehill\nAuthor: Alexis Simendinger\nDa...,2020-10-02,10,2020,,Media: thehill\nAuthor: Alexis Simendinger\nDa...
...,...,...,...,...,...,...,...,...,...,...,...,...
5543,50050493,,The Hill,The Hill's Morning Report - Trump finally conc...,,Alexis Simendinger,Media: thehill\nAuthor: Alexis Simendinger\nDa...,2021-01-08,1,2021,,Media: thehill\nAuthor: Alexis Simendinger\nDa...
5581,50050536,,The Hill,The Hill's Morning Report - Presented by Faceb...,,Alexis Simendinger,Media: thehill\nAuthor: Alexis Simendinger\nDa...,2021-01-06,1,2021,,Media: thehill\nAuthor: Alexis Simendinger\nDa...
5582,50050812,,The Hill,The Hill's Morning Report - Presented by Maste...,,Alexis Simendinger,Media: thehill\nAuthor: Alexis Simendinger\nDa...,2020-12-18,12,2020,,Media: thehill\nAuthor: Alexis Simendinger\nDa...
5583,50050938,,The Hill,The Hill's Morning Report - Presented by Faceb...,,Alexis Simendinger,Media: thehill\nAuthor: Alexis Simendinger\nDa...,2020-12-10,12,2020,,Media: thehill\nAuthor: Alexis Simendinger\nDa...
