In [507]:
import pandas as pd
import numpy as np

In [508]:
# Load the labelled article spreadsheet into the working DataFrame `df`
labelled_data_path = '../../Data/cleaned_data/Labelled_VR_data_Oct2020_Jan2021_wfulltext.xlsx'
df = pd.read_excel(labelled_data_path)

### Rule-based classification using regex

In [509]:
# Term lists for the rule-based news/opinion pass. Every term is a regular
# expression that is searched in lower-cased text, so terms are written in
# lower case. Raw strings keep regex escapes such as \: and \( from being
# parsed as (invalid) Python string escape sequences.

#These Journalist Name terms are only found in opinion articles
name_only_opinion = [r"opinion",
                     r"letters? to the editor",
                     r"letters\:",
                     r"editorial board",
                     r"readers",
                     r"columnist"]

#These Headline terms are only found in opinion articles
headline_only_opinion = [r"letters? to the editor",
                         r"letters\:",
                         r"columnist"]

#These Full Text terms are only found in opinion articles
fulltext_only_opinion = [r"letters? to the editor"]


#These Journalist Name terms are only found in news articles
name_only_news = [r"contributed",
                  r"bureau",
                  r"compiled by"]

#These Full Text terms are only found in news articles
fulltext_only_news = [r"\(ap\)",
                      r"contributed to this report"]

#Format text fields for searching (missing values become "" so every match result is a real boolean)
df["fulltext_lower"] = df["Full Text"].fillna("").str.lower()
df["journo_name_lower"] = df["Journalist Name"].fillna("").str.lower()
df["headline_lower"] = df["Headline"].fillna("").str.lower()


#Create column for articles matching news-only regex patterns
#or associated press in name field where headline does not contain 'editorial roundup'
news_fulltext_hit = df["fulltext_lower"].str.contains("|".join(fulltext_only_news), regex=True)
news_name_hit = df["journo_name_lower"].str.contains("|".join(name_only_news), regex=True)
# ~ is the element-wise boolean NOT for a pandas mask
ap_byline_hit = (df["journo_name_lower"].str.contains("associated press")
                 & ~df["headline_lower"].str.contains("editorial roundup"))
df["news_rule"] = np.where(news_fulltext_hit | news_name_hit | ap_byline_hit,
                           "Present", "Not present")

#Create column for articles matching opinion-only regex patterns
opinion_fulltext_hit = df["fulltext_lower"].str.contains("|".join(fulltext_only_opinion), regex=True)
opinion_name_hit = df["journo_name_lower"].str.contains("|".join(name_only_opinion), regex=True)
opinion_headline_hit = df["headline_lower"].str.contains("|".join(headline_only_opinion), regex=True)
df["opinion_rule"] = np.where(opinion_fulltext_hit | opinion_name_hit | opinion_headline_hit,
                              "Present", "Not present")

#Code to check results of rule-based classification
#df.groupby(["opinion_rule","news_rule","news_opinion"]).size()

### Possible terms for features

In [510]:
# Candidate terms for 0/1 features, grouped by the field they were observed in.

# Terms that show up more often in opinion articles
name_lean_opinion = ["editor"]
headline_lean_opinion = ["opinion", "editor", "editorial", "column"]
fulltext_lean_opinion = ["editorial board", "columnist"]

# Terms that show up more often in news articles
name_lean_news = ["staff writer", "staff", "news"]
headline_lean_news = ["news"]
fulltext_lean_news = [
    "associated press",
    "reuters",
    "staff writer",
    "staff writers",
    "staff",
    "bureau",
    "contributed",
    "compiled by",
    "correspondent",
]

In [511]:
def create_feature_from_terms(term_list, column, df=None):
    '''Add one 0/1 feature column per term, flagging rows whose text contains that term.

    Parameters
    ----------
    term_list : list of str
        Lower-case terms to look for. Each term is handed to ``str.contains``
        and is therefore interpreted as a regular expression.
    column : str
        Name of the text column of ``df`` to search. Missing values are treated
        as empty strings and the text is lower-cased before matching.
    df : DataFrame, optional
        Frame to search and to add the feature columns to (modified in place).
        When omitted, the module-level ``df`` is looked up at call time, so a
        frame that was reloaded after this function was defined is still used.

    Returns
    -------
    DataFrame
        The same ``df`` object, with a new ``"<term>_<column>_feature"`` column
        for every term (1 = term present, 0 = absent).
    '''
    if df is None:
        # Resolve the default when called rather than when defined
        df = globals()["df"]

    # The searchable text does not depend on the term, so build it once
    searchable_text = df[column].fillna("").str.lower()

    for item in term_list:
        df[item + "_" + column + "_" + "feature"] = np.where(searchable_text.str.contains(item), 1, 0)

    return df


In [512]:
#Create dummy-coded (0/1) feature columns from terms lists
# NOTE(review): the name_lean_* lists are matched against "Media Name" here, while the
# rule-based name_only_* lists are matched against "Journalist Name" - confirm this is intended.
feature_sources = [
    (name_lean_opinion, "Media Name"),
    (headline_lean_opinion, "Headline"),
    (fulltext_lean_opinion, "Full Text"),
    (name_lean_news, "Media Name"),
    (headline_lean_news, "Headline"),
    (fulltext_lean_news, "Full Text"),
]
for lean_terms, source_column in feature_sources:
    df = create_feature_from_terms(lean_terms, source_column)


In [513]:
#Additional features

#Dateline feature
#News articles sometimes start with datelines, which are ALL CAPS
#This checks if the article starts with 3 to 20 capital letters (A-Z) followed by a space
#Missing text is filled with "" first: str.contains would otherwise return NaN for those rows,
#and np.where treats NaN as truthy, which would flag articles with no text as having a dateline
df["upper_start_feature"] = np.where(df["Full Text"].fillna("").str.contains('^[A-Z]{3,20} ', regex=True), 1, 0)

#MediaName feature
#The Hill, Associated Press and Reuters are all mainly news articles
#(case-sensitive match on the media name, unlike the lower-cased term features)
df["media_lean_news_feature"] = np.where(df["Media Name"].fillna("").str.contains("Reuters|Associated Press|thehill|The Hill", regex=True), 1, 0)

### Features to Normalize

In [514]:
#Headline length feature
#On average, news headlines are slightly longer than opinion headlines
#df.groupby(["news_opinion"])["headline_length"].mean()
headline_text = df["Headline"]
df["headline_length_feature"] = headline_text.str.len()

#Author count feature
#News articles tend to have slightly more authors
#df.groupby(["news_opinion","author_count"]).size()
#Authors are counted as the number of ", " separators plus one; a missing author stays NaN
author_separator_count = df["Cleaned Author"].str.count(", ")
df["author_count_feature"] = author_separator_count + 1



In [487]:
#Code to check how predictive each of these terms are of news vs. opinion

#For every term: flag whether it occurs in the lower-cased full text (stored in a
#column named after the term), then count articles per flag and news_opinion label
# NOTE(review): the terms come from name_lean_opinion but are searched in "Full Text" - confirm the intended column
fulltext_for_search = df["Full Text"].fillna("").str.lower()
for item in name_lean_opinion:
    term_found = fulltext_for_search.str.contains(item)
    df[item] = np.where(term_found, "Present", "Not present")
    resultdf = df.groupby([item, "news_opinion"]).size()
    print(resultdf)
    


editor       newsop_edited
Not present  News             2211
             Opinion           396
Present      News               92
             Opinion            69
dtype: int64


### Notes about other feature ideas I explored

In [348]:
#Last sentence contains a quote - difficult to parse because a lot of stories have other
#unrelated text at the end; not a very strong trend (~50 news articles vs ~5 opinion articles)

#"tariq phrases" (e.g. "your article") - doesn't seem to pull in anything

#Run tf-idf to identify the top unique words in each set of articles? - tried this,
#didn't really yield anything useful



In [505]:
#df[df["Headline"].str.contains("Editorial", regex=True)].head(60)

### Other feature explorations

#### Identifying articles with a quote in the last paragraph

In [351]:
# Capture the final line of the article when it is at least 60 characters long and consists only of
# letters, digits, spaces and the characters , . ; - " # ( ) % ; other articles get NaN.
# Raw strings keep the regex escapes from being parsed as Python string escapes, and
# expand=False makes extract return a Series for the single capture group.
df["lastpara"] = df["Full Text"].str.extract(r'\n([A-Za-z0-9\,\.\;\-"# \(\)%]{60,}$)', expand=False)
# Flag final paragraphs containing 20-100 characters enclosed in double quotes
df["endquote"] = np.where(df["lastpara"].fillna("").str.contains(r'".{20,100}"', regex=True), "End quote", "No end quote")

In [352]:
# Tally how many articles carry each end-quote flag
df["endquote"].value_counts()

No end quote    3491
End quote         53
Name: endquote, dtype: int64

In [353]:
# Count articles for every combination of end-quote flag and news/opinion label
endquote_label_keys = ["endquote", "news_opinion"]
df.groupby(endquote_label_keys).size()

endquote      newsop_edited
End quote     News               48
              Opinion             5
No end quote  News             2900
              Opinion           591
dtype: int64

In [354]:
# Show the rows whose final paragraph contains 30-100 characters enclosed in double quotes
long_quote_mask = df["lastpara"].fillna("").str.contains('".{30,100}"', regex=True)
df[long_quote_mask]

Unnamed: 0,artdate,Article ID,Headline,Article Status,Article Position,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,...,staff,editor-at-large,news,correspondent,headline_length,author_count,upper_start,nobyline,lastpara,endquote
453,1/5/21,27129180,"THE NATION; As protests loom, mayor of D.C. ca...",News,Neutral,Los Angeles Times,"No by-line,",,Washington,"Bracing for possible violence, the nation's ca...",...,Not present,Not present,Not present,Not present,155,,False,No by line,"""We will not allow white supremacist violence ...",End quote
475,1/5/21,27190114,Fischer will vote to reject Trump challenge of...,News,Neutral,journalstar,Don Walton,,Nebraska,Sen. Deb Fischer said Tuesday she will vote to...,...,Not present,Not present,Not present,Not present,62,1.0,False,Other,Fischer said she swore an oath to support and ...,End quote
477,1/5/21,27187421,"Sen. Tim Scott 'grateful' for Trump, but oppos...",News,Neutral,Fox News,Ronn Blitzer,,,"Sen. Tim Scott, R-S.C., thanked President Trum...",...,Not present,Not present,Not present,Not present,89,1.0,False,Other,"Scott also said he was ""grateful for all of th...",End quote
688,12/23/20,26205961,Pennsylvania Republican slapped with felony ch...,News,Neutral,Fox News,Danielle Wallace,,Delaware,A Pennsylvania man is facing several felony co...,...,Not present,Not present,Not present,Not present,91,1.0,False,Other,"His defense attorney, Samuel Stretton, told th...",End quote
800,12/16/20,25574253,Republican leaders accept Biden's win; After a...,News,Neutral,South Florida Sun-Sentinel,By Will Weissert Associated Press,,,WASHINGTON - More than a month after the elect...,...,Not present,Not present,Not present,Not present,120,1.0,True,Other,"""I need two senators from this state who want ...",End quote
803,12/16/20,25564225,THE PRESIDENT-ELECT IN GEORGIA: Biden tells Ge...,News,Neutral,The Atlanta Journal - Constitution,Greg Bluestein,,Georgia,President-elect Joe Biden appealed to Georgian...,...,Not present,Not present,Not present,Not present,138,1.0,False,Other,"""Maybe your senators were just confused. Maybe...",End quote
806,12/16/20,25582174,Trump tells McConnell it's too soon to 'give u...,News,Neutral,thehill,Dominick Mastrangelo -,,,President Trump told Senate Majority Leader Mi...,...,Not present,Not present,Not present,Not present,85,1.0,False,Other,"""The only date in the Constitution is Jan. 20....",End quote
988,12/7/20,24758366,Trump adviser Kudlow praises Yellen Treasury n...,News,Neutral,Politico,"No by-line,",,Washington,White House chief economic adviser Larry Kudlo...,...,Not present,Not present,Not present,Not present,64,,False,No by line,Kudlow said he believes that the bipartisan le...,End quote
1014,12/6/20,24641584,THE NATION; Trump hammers Georgia over Biden w...,News,Neutral,Los Angeles Times,"No by-line,",,Georgia,President Trump pressed his grievances over lo...,...,Not present,Not present,Not present,Not present,159,,False,No by line,"""I want to see what happens between now and Ja...",End quote
1017,12/6/20,24659805,New director steps into whirlwind election,News,Neutral,The Sun,MARA KNAUB SUN STAFF WRITER,,Arizona,"Tiffany Anderson ""stepped into the middle of a...",...,Present,Not present,Not present,Not present,42,1.0,False,Other,"""If you want your vote to count, make sure you...",End quote


In [355]:
# List the final paragraphs of opinion articles that were flagged as ending with a quote
is_opinion_endquote = df["endquote"].eq("End quote") & df["news_opinion"].eq("Opinion")
df.loc[is_opinion_endquote, "lastpara"].value_counts()

Leslie Marshall joined Fox News Channel as a contributor in 2009; providing analysis on both political and social issues from a liberal point of view. A nationally syndicated talk host, whose program, "The Leslie Marshall Show" can be heard on radio, stream, "Tune In," "The Progressive Voices Radio Network," and "The Armed Forces Radio Network."               2
Ben Shapiro, 36, is a graduate of UCLA and Harvard Law School, host of "The Ben Shapiro Show" and editor-in-chief of DailyWire.com. He is the author of the New York Times bestsellers "How to Destroy America in Three Easy Steps," "The Right Side of History" and "Bullies."                                                                                           1
Reed Galen and Steve Schmidt are co-founders of The Lincoln Project. Rick Wilson is a co-founder and author of "Everything Trump Touches Dies." Stuart Stevens is a senior adviser to The Lincoln Project and author of "It Was All A Lie." This column also reflects the views 

#### TF-IDF word comparisons 

In [356]:
def remove_digits(comment):
    """Return ``comment`` with every digit character removed.

    Characters are dropped when ``str.isdigit()`` is true for them; everything
    else (including whitespace left behind by removed numbers) is kept in order.

    Non-string input, such as the NaN pandas uses for a missing cell or None,
    yields an empty string instead of raising ``TypeError``.
    """
    if not isinstance(comment, str):
        return ''
    return ''.join(ch for ch in comment if not ch.isdigit())

# Strip digits from every article body so numbers do not become vocabulary terms
df['body_without_digits'] = df['Full Text'].map(remove_digits)

In [357]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a sparse document-term matrix of raw token counts from the digit-free article text
count_documents = df['body_without_digits']
countvec = CountVectorizer()
sparse_dtm = countvec.fit_transform(count_documents)

In [358]:
# Display the sparse matrix repr for the count document-term matrix
sparse_dtm

<3544x40082 sparse matrix of type '<class 'numpy.longlong'>'
	with 1186487 stored elements in Compressed Sparse Row format>

In [359]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on releases that predate it
if hasattr(countvec, "get_feature_names_out"):
    count_terms = countvec.get_feature_names_out()
else:
    count_terms = countvec.get_feature_names()

# One row per article (same index as df), one column per vocabulary term.
# NOTE: toarray() materialises the whole matrix densely, which is memory-heavy for a large vocabulary.
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=count_terms, index=df.index)
dtm.head()

Unnamed: 0,__,___,____,______,________,__________,___________,____________,_____________,______________,...,ïfi,último,𝗟𝗼𝗿𝗱𝗲,𝗧𝗵𝗶𝘀,𝗯𝘆,𝗰𝗹𝗮𝗶𝗺,𝗱𝗶𝘀𝗽𝘂𝘁𝗲𝗱,𝗶𝘀,𝗼𝗳𝗳𝗶𝗰𝗶𝗮𝗹,𝘀𝗼𝘂𝗿𝗰𝗲𝘀
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [360]:
# Total occurrences of each term across all articles, showing the ten most frequent
term_totals = dtm.sum()
term_totals.sort_values(ascending=False).head(10)

the      152759
to        82051
of        66717
and       61078
in        59195
that      38309
for       24852
on        23517
is        21824
trump     20595
dtype: int64

In [361]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same digit-free article text as the count matrix, weighted by tf-idf instead of raw counts
tfidf_documents = df['body_without_digits']
tfidfvec = TfidfVectorizer()
sparse_tfidf = tfidfvec.fit_transform(tfidf_documents)
sparse_tfidf

<3544x40082 sparse matrix of type '<class 'numpy.float64'>'
	with 1186487 stored elements in Compressed Sparse Row format>

In [362]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on releases that predate it
if hasattr(tfidfvec, "get_feature_names_out"):
    tfidf_terms = tfidfvec.get_feature_names_out()
else:
    tfidf_terms = tfidfvec.get_feature_names()

# One row per article (same index as df), one column per vocabulary term.
# NOTE: toarray() materialises the whole matrix densely, which is memory-heavy for a large vocabulary.
tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=tfidf_terms, index=df.index)
tfidf.head()

Unnamed: 0,__,___,____,______,________,__________,___________,____________,_____________,______________,...,ïfi,último,𝗟𝗼𝗿𝗱𝗲,𝗧𝗵𝗶𝘀,𝗯𝘆,𝗰𝗹𝗮𝗶𝗺,𝗱𝗶𝘀𝗽𝘂𝘁𝗲𝗱,𝗶𝘀,𝗼𝗳𝗳𝗶𝗰𝗶𝗮𝗹,𝘀𝗼𝘂𝗿𝗰𝗲𝘀
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.091635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [363]:
# Highest tf-idf weight each term reaches in any single article, showing the top twenty
peak_tfidf = tfidf.max()
peak_tfidf.sort_values(ascending=False).head(20)

jacquet        0.839411
nugent         0.800731
hmong          0.787309
pm             0.785803
silva          0.765315
cavedo         0.764103
aapi           0.757974
nehls          0.746435
oct            0.745869
mcauliffe      0.744649
dunleavy       0.744487
scaringi       0.740156
waller         0.733463
blalock        0.719133
faulkner       0.717448
lenick         0.715341
swain          0.708278
mr             0.707036
scaramucci     0.700058
scarborough    0.696291
dtype: float64

In [365]:
# Attach the news/opinion label (aligned on the shared index) so the weights can be split by class
article_labels = df['news_opinion']
tfidf['newsop_'] = article_labels
tfidf.head()

Unnamed: 0,__,___,____,______,________,__________,___________,____________,_____________,______________,...,último,𝗟𝗼𝗿𝗱𝗲,𝗧𝗵𝗶𝘀,𝗯𝘆,𝗰𝗹𝗮𝗶𝗺,𝗱𝗶𝘀𝗽𝘂𝘁𝗲𝗱,𝗶𝘀,𝗼𝗳𝗳𝗶𝗰𝗶𝗮𝗹,𝘀𝗼𝘂𝗿𝗰𝗲𝘀,newsop_
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,News
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,News
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,News
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Opinion
4,0.0,0.091635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,News


In [368]:
# Split the tf-idf frame by label
label_values = tfidf['newsop_']
news = tfidf.loc[label_values.eq('News')]
opinion = tfidf.loc[label_values.eq('Opinion')]

# Highest tf-idf weight per term within news articles only (the label column is non-numeric and skipped)
news_peak_tfidf = news.max(numeric_only=True)
news_peak_tfidf.sort_values(ascending=False).head(60)

jacquet        0.839411
nugent         0.800731
hmong          0.787309
pm             0.785803
silva          0.765315
cavedo         0.764103
aapi           0.757974
nehls          0.746435
oct            0.745869
mcauliffe      0.744649
dunleavy       0.744487
scaringi       0.740156
waller         0.733463
blalock        0.719133
lenick         0.715341
swain          0.708278
mr             0.707036
scaramucci     0.700058
scarborough    0.696291
lott           0.694615
wray           0.692358
mast           0.688324
dmv            0.685107
baker          0.684866
madigan        0.684761
harrison       0.684466
cryer          0.684203
alec           0.681506
hensley        0.680196
massey         0.672317
rubin          0.668691
kudlow         0.668022
welker         0.664713
peters         0.655611
seminole       0.653099
mason          0.652816
larose         0.652333
hotline        0.652246
jenner         0.651059
jeffries       0.647698
padilla        0.643582
cox            0