In [2]:
import numpy as np
import pandas as pd

# Dictionary Extraction

### Loading Full Text Data

In [3]:
# Read the full-text export, which already carries a cleaned_author column
full_text_df = pd.read_csv(
    filepath_or_buffer="../preliminary_data/Full text w cleaned journo names.csv"
)

In [4]:
# Preview the first five rows of the full-text data
full_text_df.head(n=5)

Unnamed: 0,id,publisher,subject,Duplicate,author,body,artdate,Month,Year,cleaned_author
0,19837759,Fox News,Judge rejects Trump campaign lawsuit attemptin...,,Paul Best,U.S. District Judge Dana Christensen rejected ...,10/1/2020,10.0,2020.0,Paul Best
1,19837762,Fox News,Michigan mail-in voting: what to know,,Morgan Phillips,"As the coronavirus pandemic rages on, a number...",10/1/2020,10.0,2020.0,Morgan Phillips
2,19845892,CNN,Fact check: Almost every single one of Trump's...,,"By Daniel Dale and Marshall Cohen, CNN",President Donald Trumplied about a wide variet...,10/1/2020,10.0,2020.0,"Daniel Dale, Marshall Cohen"
3,19845895,Politico,Trump’s Proud Boy moment sparks Black outrage ...,,Matt Dixon,Florida Playbook\nGary Fineout and Matt Dixon'...,10/1/2020,10.0,2020.0,Matt Dixon
4,19846567,HuffPost,Facebook Bans Ads That Seek To ‘Delegitimize’ ...,,Sarah Ruiz-Grossman,Facebook announced a new policy on Wednesday b...,10/1/2020,10.0,2020.0,Sarah Ruiz-Grossman


### Cleaning Full Text Data

In [5]:
# Dropping instances where id value is not a number.
# Some rows carry free text in the id column (NOTE(review): these look like
# fragments of article bodies from malformed CSV rows -- confirm). Matching each
# bad value by its exact string misses any new one and lets the later integer
# cast fail, so instead keep only the rows whose id parses as a number.
# Rows whose id is missing are excluded by the same check.
id_is_numeric = pd.to_numeric(full_text_df["id"], errors="coerce").notna()
full_text_df = full_text_df[id_is_numeric]

In [6]:
# Discard every row whose id is missing (NaN)
full_text_df = full_text_df.loc[~full_text_df["id"].isna()]

In [7]:
# Converting id values to integers for compatibility with the labelled_vr data,
# whose Article ID column is int64.
# .assign() returns a new frame instead of writing a column into the filtered
# frame produced by the earlier row drops (which triggers pandas'
# SettingWithCopyWarning), and "int64" pins the width rather than relying on
# the platform-dependent default "int".
full_text_df = full_text_df.assign(id=full_text_df["id"].astype("int64"))

In [8]:
# Build an {article id: article body} lookup from the full-text frame
id_fulltext_dict = full_text_df.set_index("id")["body"].to_dict()

In [9]:
# Build an {article id: cleaned author name(s)} lookup from the full-text frame
id_cleanedauthor_dict = full_text_df.set_index("id")["cleaned_author"].to_dict()

# Dictionary Mapping to Labeled Data

### Loading Labelled Data

In [10]:
# Read the quote-level labelled data (one row per labelled quote)
labelled_df = pd.read_csv(
    filepath_or_buffer="../preliminary_data/Labelled_VR_data_Oct2020_Jan2021.csv"
)

In [11]:
# Preview the first five rows of the labelled data
labelled_df.head(n=5)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,QText,Source Name,Source Type,Source Gender,Constituent Group,Media Name,Journalist Name,Article Issues,Custom Group
0,1/31/21,29484873,Democrats are faced with a choice. Protect the...,57524527,Opinion,Positive,Anti-Voter Policies,Vote Suppression,Positive,"""In addition to the millions and millions of v...",Fred Wertheimer,Non-Profit/NGO,Male,Other,Washington Post.com,By E.J. Dionne Jr.,"VR: Pro-Voter Policies, H.R. 1, VR: Anti-Voter...",Commentary
1,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,57524785,News,Neutral,Anti-Voter Policies,Fraud,Positive,Despite the fact that Republicans know and und...,Debra Bazemore,State/Local Official,Female,,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia
2,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,57524791,News,Neutral,Civic Participation,Partisan Turnout,Neutral,That's the base that is loyal to Donald Trump....,Debra Bazemore,State/Local Official,Female,,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia
3,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,57544697,News,Neutral,Election Administration,Budget,Positive,The party rarely invests in places where there...,Donna Brazile,Partisans/Fmr. Politicians,Female,Other,The New York Times,By Shane Goldmacher,,South Carolina
4,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,57544702,News,Neutral,Election Law & Policy,Electoral System,Neutral,the president and D.N.C. enjoyed “an historica...,Jennifer O'Malley Dillon,Partisans/Fmr. Politicians,Female,,The New York Times,By Shane Goldmacher,,South Carolina


In [12]:
# Sanity check: count the rows per Article ID. The result lists 4240 distinct
# ids, so 4240 rows should remain once duplicate Article IDs are dropped
labelled_df.loc[:, "Article ID"].value_counts()

21830535    69
21879932    46
22978971    44
27738629    44
21902953    43
            ..
21703471     1
21688367     1
21650612     1
21687860     1
19748196     1
Name: Article ID, Length: 4240, dtype: int64

### Dropping Article ID Duplicates from Labelled Data

In [13]:
# Collapse to one row per article, retaining the last row seen for each Article ID
labelled_df = labelled_df.drop_duplicates(subset=["Article ID"], keep="last")
labelled_df

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,QText,Source Name,Source Type,Source Gender,Constituent Group,Media Name,Journalist Name,Article Issues,Custom Group
8,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,57544827,News,Neutral,Election Administration,Budget,Positive,Democrats had the opportunity to spend money i...,Donna Brazile,Partisans/Fmr. Politicians,Female,Other,The New York Times,By Shane Goldmacher,,South Carolina
11,1/31/21,29458845,"A call for another Great Migration, this time ...",57544985,News,Neutral,Civic Participation,Voter Registration,Positive,Georgia's expanding African American populatio...,Charles Blow,Media/Journalist,Male,,The Washington Post,Carlos Lozada,,Georgia
14,1/31/21,29474652,Trump Raised $255.4 Million in 8 Weeks as He S...,57522610,Opinion,Neutral,Civic Participation,Overall Turnout,Neutral,Mr. Trump’s strongest fund-raising came in the...,Shane Goldmacher,Media/Journalist,Male,,The New York Times,By Shane Goldmacher and Rachel Shorey,"VR: Anti-Voter Policies, VR: Civic Participati...",Commentary
17,1/31/21,29484873,Democrats are faced with a choice. Protect the...,57545348,Opinion,Positive,Anti-Voter Policies,Voter ID,Positive,The 106 bills the center identified in 28 stat...,Shane Goldmacher,Media/Journalist,Male,,Washington Post.com,By E.J. Dionne Jr.,"VR: Pro-Voter Policies, H.R. 1, VR: Anti-Voter...",Commentary
24,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,57524790,News,Neutral,Anti-Voter Policies,Voter ID,Positive,laws such as prohibiting online voter registra...,Thomas Buser-Clancy,Constituent Group,Male,ACLU,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18986,9/30/20,19775392,Florida might not know on election night wheth...,56064344,News,Neutral,Election Law & Policy,Electoral System,Neutral,"""I think everybody would like to have an answe...",Rick Scott,US Senate & Staff,Male,,The Miami Herald,By David Smiley,,Florida (FL)
18987,9/30/20,19758647,ELECTION 2020; Fact-checking: A stream of fals...,56063497,News,Neutral,Election Protection,Foreign Intervention,Negative,"the ballots could be forged by anyone, includi...",Donald Trump OLD,Federal Official,Male,Other,Los Angeles Times,CHRIS MEGERIAN,,
18988,9/30/20,19741823,Trump Refuses to Condemn White Supremacists: ‘...,56064770,News,Neutral,Anti-Voter Policies,Poll Watchers,Negative,said he wanted them to go to the polls and “wa...,Donald Trump OLD,Federal Official,Male,Other,The Daily Beast,Will Sommer,,
18989,9/30/20,19745384,"Vote-by-mail is not full of fraud, despite Tru...",56069385,Opinion,Neutral,Pro-Voter Policies,Provisional Ballots,Neutral,"“A solicited ballot is okay,”",Donald Trump OLD,Federal Official,Male,Other,VOX,Jen Kirby,VR: Pro-Voter Policies,Wisconsin


### Mapping Dictionaries to the Labelled Data to Create New Columns

In [14]:
# Mapping the id-to-fulltext dictionary to create a Full Text column in the labelled df.
# Article IDs that are not keys of the dictionary get NaN.
# .assign() builds a new frame rather than setting a column on the de-duplicated
# frame, which avoids pandas' SettingWithCopyWarning. The column name contains a
# space, so it is passed through dict unpacking.
labelled_df = labelled_df.assign(
    **{"Full Text": labelled_df["Article ID"].map(id_fulltext_dict)}
)

In [15]:
# Mapping the id-to-cleanedauthor dictionary to create a Cleaned Author column in the labelled df.
# Article IDs that are not keys of the dictionary get NaN.
# .assign() builds a new frame rather than setting a column on a frame derived
# by row filtering, which avoids pandas' SettingWithCopyWarning. The column name
# contains a space, so it is passed through dict unpacking.
labelled_df = labelled_df.assign(
    **{"Cleaned Author": labelled_df["Article ID"].map(id_cleanedauthor_dict)}
)

In [16]:
# Show a single row to confirm the Full Text and Cleaned Author columns exist
labelled_df.head(n=1)

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,QText,Source Name,Source Type,Source Gender,Constituent Group,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,Cleaned Author
8,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,57544827,News,Neutral,Election Administration,Budget,Positive,Democrats had the opportunity to spend money i...,Donna Brazile,Partisans/Fmr. Politicians,Female,Other,The New York Times,By Shane Goldmacher,,South Carolina,The Democratic National Committee has a roughl...,Shane Goldmacher


In [17]:
# Record the row count before cleaning so it can be compared afterwards
num_records_pre_clean = labelled_df.shape[0]
num_records_pre_clean

4240

### Cleaning Updated Labelled df

In [18]:
# Drop articles that don't have a corresponding full text (where full text value is NaN)
labelled_df = labelled_df[labelled_df["Full Text"].notna()]

# Drop any remaining Spanish-language articles, identified by outlet name.
# NOTE(review): this filter only inspects Media Name, so Spanish-language
# articles published by outlets that are not in this list are kept.
spanish_outlets = ['El Nuevo Herald', 'El Diario La Prensa', 'Univision', 'AP Spanish Worldstream']

# `~` is the boolean-negation operator for a pandas mask; unary minus is an
# arithmetic operator and should not be used to invert booleans.
labelled_df = labelled_df[~labelled_df["Media Name"].isin(spanish_outlets)]

labelled_df

Unnamed: 0,artdate,Article ID,Headline,Quote ID,Article Status,Article Position,Messages,Submessages,Quote Position,QText,Source Name,Source Type,Source Gender,Constituent Group,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,Cleaned Author
8,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,57544827,News,Neutral,Election Administration,Budget,Positive,Democrats had the opportunity to spend money i...,Donna Brazile,Partisans/Fmr. Politicians,Female,Other,The New York Times,By Shane Goldmacher,,South Carolina,The Democratic National Committee has a roughl...,Shane Goldmacher
11,1/31/21,29458845,"A call for another Great Migration, this time ...",57544985,News,Neutral,Civic Participation,Voter Registration,Positive,Georgia's expanding African American populatio...,Charles Blow,Media/Journalist,Male,,The Washington Post,Carlos Lozada,,Georgia,A Black Power Manifesto By Charles M. Blow. Ha...,Carlos Lozada
14,1/31/21,29474652,Trump Raised $255.4 Million in 8 Weeks as He S...,57522610,Opinion,Neutral,Civic Participation,Overall Turnout,Neutral,Mr. Trump’s strongest fund-raising came in the...,Shane Goldmacher,Media/Journalist,Male,,The New York Times,By Shane Goldmacher and Rachel Shorey,"VR: Anti-Voter Policies, VR: Civic Participati...",Commentary,The former president’s fund-raising slowed sig...,"Shane Goldmacher, Rachel Shorey"
17,1/31/21,29484873,Democrats are faced with a choice. Protect the...,57545348,Opinion,Positive,Anti-Voter Policies,Voter ID,Positive,The 106 bills the center identified in 28 stat...,Shane Goldmacher,Media/Journalist,Male,,Washington Post.com,By E.J. Dionne Jr.,"VR: Pro-Voter Policies, H.R. 1, VR: Anti-Voter...",Commentary,The Democrats can use their House and Senate m...,E.J. Dionne Jr.
24,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,57524790,News,Neutral,Anti-Voter Policies,Voter ID,Positive,laws such as prohibiting online voter registra...,Thomas Buser-Clancy,Constituent Group,Male,ACLU,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia,"AUSTIN, Texas (AP) — Republican lawmakers in s...","Anthony Izaguirre, Acacia Coronado"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18968,10/1/20,19978767,"County, AG take vote spar to state's top court...",57720233,News,Neutral,Pro-Voter Policies,Vote by Mail,Neutral,The deadline for voters to apply for a mail ba...,Susan Hays,Private Attorney,Female,,Houston Chronicle,Zach Despart; Staff writer,,Texas,Harris County Clerk Christopher Hollins' plan ...,
18969,10/1/20,19943666,Dawn Porter on Documenting Rep. John Lewis: 'T...,57720248,News,Neutral,,,Neutral,"Clearly, as we are nearing closer and closer t...",,,,,The Root,"No by-line,",VR: Anti-Voter Policies,Georgia,It’s been a few months since the world lost on...,
18971,10/1/20,19884512,Debate has little substance for voters; Lack o...,57720268,News,Neutral,Civic Participation,Partisan Turnout,Neutral,I think they were looking for a reason to vote...,Catherine Alonzo,Partisans/Fmr. Politicians,Female,,The Arizona Republic,By Yvonne Wingett Sanchez and Ronald J. Hansen...,,Arizona,If Arizona voters tuned in to the first presid...,"Yvonne Wingett Sanchez, Ronald J. Hansen"
18972,10/1/20,19933963,Elecciones 2020: Lo que necesitas saber para v...,57720288,News,Neutral,,,Neutral,Los residentes del Condado Pima están haciendo...,,,,,The Arizona Daily Star,Veronica M. Cruz,VR: Civic Participation,Arizona,La elección general es el 3 de noviembre\n\nPo...,Veronica M. Cruz


In [19]:
# Record the row count after the cleaning filters have been applied
num_records_post_clean = labelled_df.shape[0]
num_records_post_clean

3547

In [20]:
# The difference between the pre- and post-cleaning row counts is the number of
# articles removed by the cleaning filters.
num_records_wout_fulltext = num_records_pre_clean - num_records_post_clean
print(f"Number of Articles Dropped Due to Lack of Full text (or Spanish): {num_records_wout_fulltext}")

Number of Articles Dropped Due to Lack of Full text (or Spanish): 693


In [21]:
# Remove the quote-level columns; each remaining row represents a whole article
labelled_df = labelled_df.drop(
    columns=[
        'Quote ID',
        'Quote Position',
        'QText',
        'Messages',
        'Submessages',
        'Source Name',
        'Source Type',
        'Source Gender',
        'Constituent Group',
    ]
)

### Add hand-annotated edits: opinion article type, and edits to News/Opinion labels (i.e. quality control edits where the original label was incorrect)

In [26]:
# Load the two hand-annotation workbooks: opinion-type annotations and
# quality-control edits to the News/Opinion label.
op_type_df = pd.read_excel("../preliminary_data/Labelled_VR_data_opinion_type_annotation.xlsx")
label_qc_df = pd.read_excel("../preliminary_data/Labelled_VR_data_labelQC_annotation.xlsx")

# Create dict of label_qc edits column, keyed by Article ID
id_qcedits_dict = pd.Series(label_qc_df.edits.values, index=label_qc_df["Article ID"]).to_dict()

# Create dicts of the opinion type edits column and the article type column, keyed by Article ID
id_op_type_edits_dict = pd.Series(op_type_df.Edits.values, index=op_type_df["Article ID"]).to_dict()
id_op_type_dict = pd.Series(op_type_df["opinion_type"].values, index=op_type_df["Article ID"]).to_dict()

# Map both edits columns onto the main df. Articles absent from an annotation
# file get NaN. .assign() returns a new frame, so no column is written into a
# previously filtered frame.
labelled_df = labelled_df.assign(
    QCedits=labelled_df["Article ID"].map(id_qcedits_dict),
    op_type_edits=labelled_df["Article ID"].map(id_op_type_edits_dict),
)

# Drop articles identified as irrelevant in either hand annotation.
# NaN never equals "Irrelevant", so un-annotated articles are kept.
# `~` is the boolean negation for a pandas mask (unary minus is arithmetic).
is_irrelevant = (labelled_df["op_type_edits"] == "Irrelevant") | (labelled_df["QCedits"] == "Irrelevant")
# .copy() detaches the filtered rows so the column assignments below do not
# trigger SettingWithCopyWarning.
labelled_df = labelled_df[~is_irrelevant].copy()

# Create article type column
labelled_df["opinion_type"] = labelled_df["Article ID"].map(id_op_type_dict)

# Replace the News/Opinion label, in priority order: QC edit, then the edit from
# the opinion-type file, then the original Article Status label.
labelled_df["Article Status"] = (
    labelled_df["QCedits"]
    .fillna(labelled_df["op_type_edits"])
    .fillna(labelled_df["Article Status"])
)

# The helper edit columns are no longer needed once the label has been resolved
labelled_df = labelled_df.drop(columns=["op_type_edits", "QCedits"])

In [28]:
# Write the cleaned labelled data to an Excel workbook, without the index.
# Excel is used instead of CSV because the CSV export came out with an empty row at row 248.
labelled_df.to_excel(
    excel_writer="../cleaned_data/Labelled_VR_data_Oct2020_Jan2021_wfulltext.xlsx",
    index=False,
)

# Creating a File of all Unlabelled Articles 

In [63]:
# Create list of Article IDs for the articles still present in the labelled df
labelled_articleid_list = labelled_df["Article ID"].to_list()

# Filter the full text file to only keep articles whose id is not in that list.
# `~` is the boolean negation for a pandas mask (unary minus is arithmetic).
# NOTE(review): labelled_df has already had rows removed (missing full text,
# Spanish outlets, "Irrelevant" annotations), so the ids of those removed
# articles are not excluded here -- confirm that is intended.
full_text_df = full_text_df[~full_text_df["id"].isin(labelled_articleid_list)]

In [64]:
# Restrict the unlabelled articles to the columns needed downstream
unlabelled_df = full_text_df.loc[
    :,
    ['id', 'publisher', 'subject', 'author', 'body', 'artdate', 'cleaned_author'],
]

In [65]:
# Write the unlabelled articles to CSV.
# NOTE(review): the DataFrame index is written as an unnamed first column
# because index=False is not passed (the Excel export of the labelled data does
# pass it) -- confirm whether downstream readers rely on that column.
unlabelled_df.to_csv(
    path_or_buf="../cleaned_data/Unlabelled_Articles_VR_data_Oct2020_Jan2021.csv"
)