In [None]:
# -*- coding: utf-8 -*-
"""rethink_media_cleaning.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cs7vmSMoleaziY5APBCM0lcKsW9VQsJQ
"""

import numpy as np
import pandas as pd
import re


# Mount Google Drive at /content/drive so the cells below can read and write files under it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the article table from the mounted Drive; later cells read its "author" and "publisher" columns.
df = pd.read_csv("/content/drive/MyDrive/ReThink Media Project/full_text_data_cleaning_result.csv")

#### Regex patterns

In [None]:
# Every pattern that contains a backslash is written as a raw string so that sequences such as
# \d, \. and \| reach the regex engine unchanged instead of being parsed as (invalid) Python
# string escapes. The pattern text itself is unchanged.
# NOTE(review): several patterns carry an inline (?i) flag that is not at the very start of the
# pattern. Python 3.11+ rejects that placement when such a pattern is compiled as-is; earlier
# versions apply the flag to the whole expression. Confirm against the runtime in use.

#looks for phone numbers: 10-digit form with an optional leading separator, "(123) 456-7890" form,
#or 7-digit form with an optional trailing separator
phonenum_regex = r'((?: |, |; |\. |\| )?\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}(?: |, |; |\. |\| )?)'

#looks for email address and optional leading/trailing spaces/punctuation
email_regex = r"((?: |, |; |\. |\| )?[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+(?: |, |; |\. |\| )?)"

#looks for title words (case insensitive) and optional leading/trailing spaces/punctuation
title_regex = r'((?: |, |; |\. |\| | - )?(?i)(?:Staff Writers?|Editor\-in\-Chief|Managing Editor|Political Editor|Editor\-at\-large|Columnist|Correspondent|Opinion contributors?|special.*|Capital Bureau)(?: |, |; |\. |\| )?)'

#capture " -", or "|" together with anything after it
symbol_regex = r' -|\|.*$'

#capture firstname.lastinitial pattern at end of AJC bylines, "; .. is . ." pattern with bios
specialpatterns_regex = r"(?: \w{4,}\.\w$)|(?i); .*(?:\.$| is.*)"

#capture non-name entries including anything after 'from,' and anything containing 'editorial', 'readers', or 'editors'
non_name_regex = ".*(?:staff$|staff ).*|Letters to the Editor|from.*|(?i).*editorial.*|(?i).*editors.*|No by-line,|(?i).*readers.*"

#look for news outlets, case insensitive, including optional leading 'the'/connectors/punctuation
#For CNN captures anything that comes after
outlet_regex = '(?i)(?:, |; | and | for | ?The )?(?i)(?:CNN.*$|Associated Press|New York Times|Washington Times|USA Today|AJC|Green Bay Press-Gazette|Daily Beast|Nation|Houston Chronicle|Sarasota Herald-Tribune|Augusta Chronicle|Arizona Republic|Texas Tribune)'

#capture non-comma connectors ('and', ';and', ';', '\n'); the \n here is the regex escape for a newline
connector_regex = r'((?i)(?: ;and | and |; *|\n *))'

#capture double comma patterns
dbl_comma_regex = ', *,+ *'


#### Prefixes and suffixes to remove

In [None]:
# Sample bylines for trying the cleaning helpers by hand.
test_strings = [
    'Mark',
    'By Mark',
    'No by-line',
    'Opinion by Mark',
    'Analysis by Mark',
]

#for test in df.head()['author']:
#  print(author_cleaning(test))

# Leading phrases to strip from a byline. Entries are lowercase because the
# prefix check lowercases the text before comparing.
prefixes = [
    'letter to the editor by ',
    'by ',
    'opinion by ',
    'analysis by ',
    'compiled by ',
]

# Trailing outlet / role fragments to strip from a byline (compared case-sensitively).
# Leading spaces and punctuation inside each entry are significant - some entries
# start with two spaces on purpose.
# NOTE(review): 'Appleton Post-Crescent' and 'The Roanoke Times' have no leading
# separator, unlike the other entries - confirm that is intended.
suffixes = [
    ';Editor',
    ' Florida Times-Union',
    ' Jacksonville Florida Times-Union',
    ' Milwaukee Journal Sentinel',
    ' Capitol Media Services',
    ' -- Times Staff Writer',
    'Appleton Post-Crescent',
    '; Richmond Times-Dispatch',
    ' SUN STAFF WRITER',
    '; The (Charlottesville) Daily Progress',
    ', The Lexington Herald-Leader',
    ' News Service Of Florida',
    ', Palm Beach Post',
    '; Editor',
    '; WPR NEWS',
    ' Richmond Times-Dispatch',
    ' -- Times/Herald Tallahassee Bureau',
    ', RealClearWire',
    '  -- Times Political Editor',
    '; Austin Bureau',
    ' Tribune News Service',
    ' Guest Columnist',
    '; LA CROSSE TRIBUNE',
    ', Omaha World-Herald',
    ' USA TODAY NETWORK',
    ' InsideSources.com',
    ' Yuma Sun Editor',
    ', Capitol Beat News Service',
    ' South Florida Sun Sentinel',
    ' Orlando Sentinel',
    '; Murphy teaches writing at Virginia Tech',
    ' Washington Bureau',
    'The Roanoke Times',
    '; Contributing Writer',
    '  -- Times/Herald',
    ' Capitol Beat News Service',
    ' -- PolitiFact',
    '; Now News Group',
    ' Tribune Content Agency',
    '; WISCONSIN STATE JOURNAL',
    '; Washington Bureau Chief',
    ' The Heritage Foundation',
    ', Associated Press; The New York Times contributed.',
    ', Los Angeles Times',
    ' Atlanta Journal-Constitution',
    ' of Capital News Service',
]

def remove_prefix(text, prefix_list=None):
    '''Strip one leading byline prefix (such as "By ") from text, ignoring case.

    text: the author string to clean. Non-string values are returned unchanged.
    prefix_list: prefixes to look for; when omitted, the notebook-level `prefixes`
        list is used. Entries must be lowercase because they are compared against
        the lowercased text.

    Returns text without its longest matching prefix, or text unchanged when no
    prefix matches. Only one prefix is removed per call.
    '''
    if not isinstance(text, str):
        return text
    candidates = prefixes if prefix_list is None else prefix_list
    lowered = text.lower()
    matches = [prefix for prefix in candidates if lowered.startswith(prefix)]
    if not matches:
        return text
    # Longest match wins so the outcome does not depend on the order of the list.
    return text[len(max(matches, key=len)):]

def remove_suffix(text, suffix_list=None):
    '''Strip one trailing outlet/role fragment from text (case-sensitive).

    text: the author string to clean. Non-string values are returned unchanged.
    suffix_list: suffixes to look for; when omitted, the notebook-level `suffixes`
        list is used.

    Returns text without its longest matching suffix, or text unchanged when no
    suffix matches. Only one suffix is removed per call.
    '''
    if not isinstance(text, str):
        return text
    candidates = suffixes if suffix_list is None else suffix_list
    matches = [suffix for suffix in candidates if text.endswith(suffix)]
    if not matches:
        return text
    # Longest match wins: otherwise a short entry such as ' Florida Times-Union' would be
    # chosen over ' Jacksonville Florida Times-Union' and leave part of the outlet behind.
    longest = max(matches, key=len)
    # Slice by explicit length so an empty suffix leaves the text intact (text[:-0] would be '').
    return text[:len(text) - len(longest)]

# Size of the suffix list (not displayed, since it is not the last expression of the cell).
len(suffixes)

# First cleaning pass on the raw byline: missing values become the text "none",
# then one prefix and one suffix are stripped and the result is title-cased.
df["cleaned_author"] = (
    df["author"]
    .replace(np.nan, "none")
    .apply(remove_prefix)
    .apply(remove_suffix)
    .str.title()
)

# One row per distinct first-pass value, handy for eyeballing what is left to clean.
partially_cleaned = pd.DataFrame(df["cleaned_author"].unique())
#partially_cleaned

#df.to_csv(path_or_buf='/content/drive/MyDrive/cleaned_author_names.csv', index = False)



In [None]:
def regex_trim(rx_list, column, df=None, replace_value=""):
    '''Replace every match of any pattern in rx_list inside df[column].

    The patterns are joined with an OR (|) separator into a single regex. If the
    token (?i) occurs anywhere in the joined pattern, every occurrence is removed
    and one (?i) is placed at the very start, so the whole combined expression is
    matched case-insensitively. Python 3.11+ raises an error for a global inline
    flag that is not at the start of the expression, which is what plain joining
    would produce. Only the exact token (?i) is handled.

    rx_list: list of regex pattern strings. An empty list leaves df untouched.
    column: name of the column to clean.
    df: DataFrame to modify IN PLACE. When omitted, the notebook-level `df` is
        looked up at call time.
    replace_value: text that replaces each match (default: empty string).

    Returns the same DataFrame object that was modified (not a copy).
    '''
    if df is None:
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError("regex_trim: no `df` argument given and no notebook-level `df` exists") from None
    if not rx_list:
        # "|".join([]) is the empty pattern, which matches at every position.
        return df
    combined = "|".join(rx_list)
    if "(?i)" in combined:
        combined = "(?i)" + combined.replace("(?i)", "")
    df[column] = df[column].replace(to_replace=combined, value=replace_value, regex=True)
    return df

In [None]:
# Patterns whose matches are deleted outright. The order is kept as-is because the
# patterns are OR-ed together and earlier alternatives are tried first.
rx_patterns = [
    phonenum_regex,
    email_regex,
    title_regex,
    symbol_regex,
    specialpatterns_regex,
    outlet_regex,
    non_name_regex,
]

# No df argument is passed here, so regex_trim falls back to its default frame.
new_df = regex_trim(rx_patterns, "cleaned_author")

#find non-comma connectors and convert to comma
new_df = regex_trim(
    [connector_regex],
    "cleaned_author",
    df=new_df,
    replace_value=", ",
)

#after comma conversion, check for multiple commas together and convert to single comma
new_df = regex_trim(
    [dbl_comma_regex],
    "cleaned_author",
    df=new_df,
    replace_value=", ",
)

#strip trailing commas, and leading and trailing whitespace, then check for trailing commas again
new_df['cleaned_author'] = (
    new_df['cleaned_author']
    .str.rstrip(",")
    .str.strip()
    .str.rstrip(",")
)

#Print results
#new_df.cleaned_author.str.title().value_counts().head(30)

In [None]:
# (rows, columns) of the frame after the regex cleaning steps.
new_df.shape

(11355, 12)

In [None]:
# Publishers listed as Spanish outlets; rows from these publishers are dropped.
spanish_outlets = [
    'Univision Noticias',
    'El Diario La Prensa',
    'El Nuevo Herald',
    'AP Spanish',
]

#for outlet in spanish_outlets:
# Keep only rows whose publisher is NOT in the list, then show the new (rows, columns).
new_df = new_df.loc[~new_df['publisher'].isin(spanish_outlets)]
new_df.shape

(11352, 12)

In [None]:
# Export 1: each distinct cleaned name with how often it occurs, for manual checking.
# value_counts() keeps the names in the index, so the index is turned into a regular
# "cleaned_author" column before writing; writing the Series directly with index=False
# would drop the names and leave only the counts in the file.
(
    new_df.cleaned_author.str.title()
    .value_counts()
    .rename_axis("cleaned_author")
    .reset_index(name="count")
    .to_csv("/content/drive/MyDrive/ReThink Media Project/Journo Name List to Check.csv", index = False)
)
# Export 2: the full article table including the cleaned_author column.
new_df.to_csv(path_or_buf='/content/drive/MyDrive/ReThink Media Project/Full text w cleaned journo names.csv', index = False)

In [None]:
# Preview the first rows, including the cleaned_author column.
new_df.head()

Unnamed: 0,id,Irrelevant,publisher,subject,Duplicate,author,body,artdate,Month,Year,"Notes: Green under Body (Good), Yellow under Subject (Duplicate), Red under Publisher (Spanish), Red under Subject (Some type of summary article of headlines/recent news)",cleaned_author
0,19837759,,Fox News,Judge rejects Trump campaign lawsuit attemptin...,,Paul Best,U.S. District Judge Dana Christensen rejected ...,2020-10-01,10,2020,,Paul Best
1,19837762,,Fox News,Michigan mail-in voting: what to know,,Morgan Phillips,"As the coronavirus pandemic rages on, a number...",2020-10-01,10,2020,,Morgan Phillips
2,19845892,,CNN,Fact check: Almost every single one of Trump's...,,"By Daniel Dale and Marshall Cohen, CNN",President Donald Trumplied about a wide variet...,2020-10-01,10,2020,,"Daniel Dale, Marshall Cohen"
3,19845895,,Politico,Trump’s Proud Boy moment sparks Black outrage ...,,Matt Dixon,Florida Playbook\nGary Fineout and Matt Dixon'...,2020-10-01,10,2020,,Matt Dixon
4,19846567,,HuffPost,Facebook Bans Ads That Seek To ‘Delegitimize’ ...,,Sarah Ruiz-Grossman,Facebook announced a new policy on Wednesday b...,2020-10-01,10,2020,,Sarah Ruiz-Grossman


#### Code for checking results

In [None]:
#Check cleaned names
#new_df.cleaned_author[(new_df.cleaned_author.fillna("").str.lower().str.contains(" none "))].value_counts()

In [None]:
#Check original names (put a lowercase search term inside contains() before uncommenting)
#df.author.str.title()[(df.author.fillna("").str.lower().str.contains("<search term>"))].value_counts().head(60)

In [None]:
# Number of rows per publisher in new_df (computed after the Spanish-outlet rows were dropped).
new_df['publisher'].value_counts()

Associated Press Newswires    1140
CNN                            807
The Hill                       768
The New York Times             641
Fox News                       543
                              ... 
St.Paul Pioneer Press            1
Daily Herald                     1
Austin American-Statesman        1
NBC News                         1
The Philadelphia Inquirer        1
Name: publisher, Length: 120, dtype: int64

Unnamed: 0,id,Irrelevant,publisher,subject,Duplicate,author,body,artdate,Month,Year,"Notes: Green under Body (Good), Yellow under Subject (Duplicate), Red under Publisher (Spanish), Red under Subject (Some type of summary article of headlines/recent news)",cleaned_author
0,19837759,,Fox News,Judge rejects Trump campaign lawsuit attemptin...,,Paul Best,U.S. District Judge Dana Christensen rejected ...,2020-10-01,10,2020,,Paul Best
1,19837762,,Fox News,Michigan mail-in voting: what to know,,Morgan Phillips,"As the coronavirus pandemic rages on, a number...",2020-10-01,10,2020,,Morgan Phillips
2,19845892,,CNN,Fact check: Almost every single one of Trump's...,,"By Daniel Dale and Marshall Cohen, CNN",President Donald Trumplied about a wide variet...,2020-10-01,10,2020,,"Daniel Dale, Marshall Cohen"
3,19845895,,Politico,Trump’s Proud Boy moment sparks Black outrage ...,,Matt Dixon,Florida Playbook\nGary Fineout and Matt Dixon'...,2020-10-01,10,2020,,Matt Dixon
4,19846567,,HuffPost,Facebook Bans Ads That Seek To ‘Delegitimize’ ...,,Sarah Ruiz-Grossman,Facebook announced a new policy on Wednesday b...,2020-10-01,10,2020,,Sarah Ruiz-Grossman
...,...,...,...,...,...,...,...,...,...,...,...,...
11350,50053442,,The Root,New York Bar Association Considers Revoking Ru...,,,Former New York City mayor and current Trump s...,2021-01-11,1,2021,,
11351,50053548,,The Root,Trump Supporters Are Blaming Antifa for the Ca...,,,I’ll say it again: THE ONLY REASON TRUMP SUPPO...,2021-01-08,1,2021,,
11352,50053549,,The Root,"For Black People, Wednesday Was Just Another D...",,Michael Harriot,There must be a certain stomach-churning const...,2021-01-07,1,2021,,Michael Harriot
11353,50053550,,The Root,"After Siege of the Capitol by Pro-Trump Mob, C...",,Ishena Robinson,After a whirlwind day that saw a mob of Trump ...,2021-01-07,1,2021,,Ishena Robinson
