In [1]:
import pandas as pd
import numpy as np

In [2]:
#Load the labelled article spreadsheet; the cells below read from and add columns to `df`
#(the path is relative to the notebook's working directory)
df = pd.read_excel('Labelled_VR_data_Oct2020_Jan2021_wfulltext.xlsx')

### Rule-based classification using regex

In [3]:
#Regex fragments for the rule-based classifier; each list is later joined with "|"
#and matched against lower-cased text.
#Fragments containing a backslash are raw strings so the escape reaches the regex
#engine unchanged instead of relying on Python tolerating unknown string escapes.

#These Journalist Name terms are only found in opinion articles
name_only_opinion = [
    "opinion",
    "letters? to the editor",
    r"letters\:",
    "editorial board",
    "readers",
    "columnist",
]

#These Headline terms are only found in opinion articles
headline_only_opinion = [
    "letters? to the editor",
    r"letters\:",
    "columnist",
]

#These Full Text terms are only found in opinion articles
fulltext_only_opinion = ["letters? to the editor"]


#These Journalist Name terms are only found in news articles
name_only_news = [
    "contributed",
    "bureau",
    "compiled by",
]

#These Full Text terms are only found in news articles
fulltext_only_news = [
    r"\(ap\)",
    "contributed to this report",
]

#Format text fields for searching (missing values become "", text is lower-cased)
df["fulltext_lower"] = df["Full Text"].fillna("").str.lower()
df["journo_name_lower"] = df["Journalist Name"].fillna("").str.lower()
df["headline_lower"] = df["Headline"].fillna("").str.lower()


#Create column for articles matching news-only regex patterns
#or associated press in name field where headline does not equal 'editorial roundup'
news_fulltext_hit = df["fulltext_lower"].str.contains("|".join(fulltext_only_news), regex=True)
news_name_hit = df["journo_name_lower"].str.contains("|".join(name_only_news), regex=True)
#~ is the element-wise boolean NOT for a Series
ap_not_roundup = (df["journo_name_lower"].str.contains("associated press")
                  & ~df["headline_lower"].str.contains("editorial roundup"))
df["news_rule"] = np.where(news_fulltext_hit | news_name_hit | ap_not_roundup,
                           "Present", "Not present")

#Create column for articles matching opinion-only regex patterns
opinion_fulltext_hit = df["fulltext_lower"].str.contains("|".join(fulltext_only_opinion), regex=True)
opinion_name_hit = df["journo_name_lower"].str.contains("|".join(name_only_opinion), regex=True)
opinion_headline_hit = df["headline_lower"].str.contains("|".join(headline_only_opinion), regex=True)
df["opinion_rule"] = np.where(opinion_fulltext_hit | opinion_name_hit | opinion_headline_hit,
                              "Present", "Not present")

#Code to check results of rule-based classification
#df.groupby(["opinion_rule","news_rule","news_opinion"]).size()

### Possible terms for features

In [4]:
#Terms used as 0/1 features: unlike the "only" rule terms they occur in both
#classes, just more often in one of them.

#These terms appear more often in opinion articles
name_lean_opinion = ["editor"]

headline_lean_opinion = [
    "opinion",
    "editor",
    "editorial",
    "column",
]

fulltext_lean_opinion = [
    "editorial board",
    "columnist",
]


#These terms appear more often in news articles
name_lean_news = [
    "staff writer",
    "staff",
    "news",
]

headline_lean_news = ["news"]

fulltext_lean_news = [
    "associated press",
    "reuters",
    "staff writer",
    "staff writers",
    "staff",
    "bureau",
    "contributed",
    "compiled by",
    "correspondent",
]

In [5]:
def create_feature_from_terms(term_list, column, df=df):
    '''(list of str, str, DataFrame -> DataFrame) Add one 0/1 feature column to df per term.

    term_list: lower-case terms; each one is passed to Series.str.contains, so it is
        interpreted as a regular expression.
    column: name of the text column in df to search. Missing values are treated as
        empty strings and the text is lower-cased before matching.
    df: DataFrame that is modified in place. The default is the object the
        module-level df referred to when this function was defined.

    For every term a column named "<term>_<column>_feature" is written, holding 1
    where the term is found and 0 otherwise. The same df object is returned.
    '''
    #Lower-case the searched text once rather than once per term
    searchable = df[column].fillna("").str.lower()

    for item in term_list:
        df[item + "_" + column + "_" + "feature"] = np.where(searchable.str.contains(item), 1, 0)

    return df


In [6]:
#Create dummy-coded (0/1) feature columns from terms lists
#Each pair is (term list, column of df that is searched for those terms)
#NOTE(review): the name_lean_* lists are searched in "Media Name", whereas the
#rule-based name terms are matched against "Journalist Name" - confirm the column
feature_sources = [
    (name_lean_opinion, "Media Name"),
    (headline_lean_opinion, "Headline"),
    (fulltext_lean_opinion, "Full Text"),
    (name_lean_news, "Media Name"),
    (headline_lean_news, "Headline"),
    (fulltext_lean_news, "Full Text"),
]
for terms, source_column in feature_sources:
    df = create_feature_from_terms(terms, source_column)


In [7]:
#Additional features

#Dateline feature
#News articles sometimes start with datelines, which are ALL CAPS
#This checks if the article starts with 3 to 20 capital letters followed by a space
#na=False makes a missing Full Text count as "no dateline"; without it the match
#result is NaN, which np.where treats as true and would flag as 1
df["upper_start_feature"] = np.where(df["Full Text"].str.contains('^[A-Z]{3,20} ', regex=True, na=False), 1, 0)

#MediaName feature
#The Hill, Associated Press and Reuters are all mainly news articles
#(case-sensitive match on the original Media Name text)
df["media_lean_news_feature"] = np.where(df["Media Name"].fillna("").str.contains("Reuters|Associated Press|thehill|The Hill", regex=True), 1, 0)

### Features to Normalize

In [8]:
#Headline length feature (number of characters in the headline)
#On average, news headlines are slightly longer than opinion headlines
#df.groupby(["news_opinion"])["headline_length_feature"].mean()
headline_chars = df["Headline"].str.len()
df["headline_length_feature"] = headline_chars

#Author count feature
#News articles tend to have slightly more authors
#Counts the ", " separators in Cleaned Author and adds one; a missing author stays NaN
#df.groupby(["news_opinion","author_count_feature"]).size()
author_separators = df["Cleaned Author"].str.count(", ")
df["author_count_feature"] = author_separators + 1



In [9]:
from sklearn.preprocessing import MinMaxScaler

#Min-max scale headline length and author count, each column on its own
#Rows where a value is missing keep NaN in the scaled column
scaler = MinMaxScaler()
length_and_authors = pd.DataFrame({
    "headline_length": df["Headline"].str.len(),
    "author_count": df["Cleaned Author"].str.count(", ") + 1,
}).to_numpy(dtype=float)
scaled = scaler.fit_transform(length_and_authors)
df['minmax_length'] = scaled[:, 0]
df['minmax_author'] = scaled[:, 1]

In [10]:
#Divide each feature by its largest value so the result is at most 1
#Series.max() skips missing values; the builtin max() would return NaN whenever
#the first value of the column happens to be NaN
m1 = df["headline_length_feature"].max()
normalized1 = df["headline_length_feature"]/m1
df['normalized_length'] = normalized1
m2 = df["author_count_feature"].max()
normalized2 = df["author_count_feature"]/m2
df['normalized_author_count'] = normalized2
df

Unnamed: 0,artdate,Article ID,Headline,Article Status,Article Position,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,...,compiled by_Full Text_feature,correspondent_Full Text_feature,upper_start_feature,media_lean_news_feature,headline_length_feature,author_count_feature,minmax_length,minmax_author,normalized_length,normalized_author_count
0,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,News,Neutral,The New York Times,By Shane Goldmacher,,South Carolina,The Democratic National Committee has a roughl...,...,0,0,0,0,73,1.0,0.142232,0.00,0.156989,0.2
1,1/31/21,29458845,"A call for another Great Migration, this time ...",News,Neutral,The Washington Post,Carlos Lozada,,Georgia,A Black Power Manifesto By Charles M. Blow. Ha...,...,0,0,0,0,56,1.0,0.105033,0.00,0.120430,0.2
2,1/31/21,29474652,Trump Raised $255.4 Million in 8 Weeks as He S...,Opinion,Neutral,The New York Times,By Shane Goldmacher and Rachel Shorey,"VR: Anti-Voter Policies, VR: Civic Participati...",Commentary,The former president’s fund-raising slowed sig...,...,0,0,0,0,79,2.0,0.155361,0.25,0.169892,0.4
3,1/31/21,29484873,Democrats are faced with a choice. Protect the...,Opinion,Positive,Washington Post.com,By E.J. Dionne Jr.,"VR: Pro-Voter Policies, H.R. 1, VR: Anti-Voter...",Commentary,The Democrats can use their House and Senate m...,...,0,0,0,0,79,1.0,0.155361,0.00,0.169892,0.2
4,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,News,Neutral,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia,"AUSTIN, Texas (AP) — Republican lawmakers in s...",...,0,0,0,1,60,2.0,0.113786,0.25,0.129032,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,10/1/20,19978767,"County, AG take vote spar to state's top court...",News,Neutral,Houston Chronicle,Zach Despart; Staff writer,,Texas,Harris County Clerk Christopher Hollins' plan ...,...,0,0,0,0,109,,0.221007,,0.234409,
3531,10/1/20,19943666,Dawn Porter on Documenting Rep. John Lewis: 'T...,News,Neutral,The Root,"No by-line,",VR: Anti-Voter Policies,Georgia,It’s been a few months since the world lost on...,...,0,0,0,0,99,,0.199125,,0.212903,
3532,10/1/20,19884512,Debate has little substance for voters; Lack o...,News,Neutral,The Arizona Republic,By Yvonne Wingett Sanchez and Ronald J. Hansen...,,Arizona,If Arizona voters tuned in to the first presid...,...,0,0,0,0,87,2.0,0.172867,0.25,0.187097,0.4
3533,10/1/20,19933963,Elecciones 2020: Lo que necesitas saber para v...,News,Neutral,The Arizona Daily Star,Veronica M. Cruz,VR: Civic Participation,Arizona,La elección general es el 3 de noviembre\n\nPo...,...,0,0,0,0,78,1.0,0.153173,0.00,0.167742,0.2


In [11]:
#Code to check how predictive each of these terms are of news vs. opinion


#For every term, flag whether it occurs in the lower-cased Full Text and store the
#flag in a column named after the term (e.g. df["editor"])
for term in name_lean_opinion:
    lowered_text = df["Full Text"].fillna("").str.lower()
    term_found = lowered_text.str.contains(term)
    df[term] = np.where(term_found, "Present", "Not present")
    #Then count articles per (flag, news_opinion label) combination
    counts = df.groupby([term, "news_opinion"]).size()
    print(counts)
    


editor       news_opinion
Not present  News            2873
             Opinion          430
Present      News             113
             Opinion          119
dtype: int64


### Notes about other feature ideas I explored

In [12]:
#Last sentence contains a quote - difficult to parse because many stories have other
#unrelated text at the end; not a very strong trend (~50 news articles vs. ~5 opinion articles)

#tariq phrases ("your article", etc.) - don't seem to pull in anything

#Run tf-idf to identify the top unique words in each set of articles? - tried this,
#didn't really yield anything useful



In [13]:
#df[df["Headline"].str.contains("Editorial", regex=True)].head(60)

### Other feature explorations

#### Identifying articles with a quote in the last paragraph

In [14]:
#Last paragraph: the final run of 60+ characters that follows a newline and reaches the
#end of the text, limited to letters, digits, spaces and the punctuation in the class;
#articles that end any other way get NaN. Raw strings keep the regex escapes intact.
df["lastpara"] = df["Full Text"].str.extract(r'\n([A-Za-z0-9,.;\-"# ()%]{60,}$)', expand=False)
#Flag last paragraphs that contain a double-quoted span of 20 to 100 characters
df["endquote"] = np.where(df.lastpara.fillna("").str.contains(r'".{20,100}"', regex=True), "End quote", "No end quote")

In [15]:
#How many articles were flagged as ending with a quote
df["endquote"].value_counts()

No end quote    3482
End quote         53
Name: endquote, dtype: int64

In [16]:
#Cross-tabulate the end-quote flag against the news/opinion label
df.groupby(["endquote","news_opinion"]).size()

endquote      news_opinion
End quote     News              48
              Opinion            5
No end quote  News            2938
              Opinion          544
dtype: int64

In [17]:
#Show the rows whose last paragraph contains a double-quoted span of 30 to 100 characters
#NOTE(review): the endquote column is built with a 20-character minimum, this uses 30 - confirm which is intended
df[df.lastpara.fillna("").str.contains("\".{30,100}\"", regex=True)]

Unnamed: 0,artdate,Article ID,Headline,Article Status,Article Position,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,...,media_lean_news_feature,headline_length_feature,author_count_feature,minmax_length,minmax_author,normalized_length,normalized_author_count,editor,lastpara,endquote
452,1/5/21,27129180,"THE NATION; As protests loom, mayor of D.C. ca...",News,Neutral,Los Angeles Times,"No by-line,",,Washington,"Bracing for possible violence, the nation's ca...",...,0,155,,0.321663,,0.333333,,Not present,"""We will not allow white supremacist violence ...",End quote
474,1/5/21,27190114,Fischer will vote to reject Trump challenge of...,News,Neutral,journalstar,Don Walton,,Nebraska,Sen. Deb Fischer said Tuesday she will vote to...,...,0,62,1.0,0.118162,0.0,0.133333,0.2,Not present,Fischer said she swore an oath to support and ...,End quote
476,1/5/21,27187421,"Sen. Tim Scott 'grateful' for Trump, but oppos...",News,Neutral,Fox News,Ronn Blitzer,,,"Sen. Tim Scott, R-S.C., thanked President Trum...",...,0,89,1.0,0.177243,0.0,0.191398,0.2,Not present,"Scott also said he was ""grateful for all of th...",End quote
687,12/23/20,26205961,Pennsylvania Republican slapped with felony ch...,News,Neutral,Fox News,Danielle Wallace,,Delaware,A Pennsylvania man is facing several felony co...,...,0,91,1.0,0.181619,0.0,0.195699,0.2,Not present,"His defense attorney, Samuel Stretton, told th...",End quote
799,12/16/20,25574253,Republican leaders accept Biden's win; After a...,News,Neutral,South Florida Sun-Sentinel,By Will Weissert Associated Press,,,WASHINGTON - More than a month after the elect...,...,0,120,1.0,0.245077,0.0,0.258065,0.2,Not present,"""I need two senators from this state who want ...",End quote
802,12/16/20,25564225,THE PRESIDENT-ELECT IN GEORGIA: Biden tells Ge...,News,Neutral,The Atlanta Journal - Constitution,Greg Bluestein,,Georgia,President-elect Joe Biden appealed to Georgian...,...,0,138,1.0,0.284464,0.0,0.296774,0.2,Not present,"""Maybe your senators were just confused. Maybe...",End quote
805,12/16/20,25582174,Trump tells McConnell it's too soon to 'give u...,News,Neutral,thehill,Dominick Mastrangelo -,,,President Trump told Senate Majority Leader Mi...,...,1,85,1.0,0.16849,0.0,0.182796,0.2,Not present,"""The only date in the Constitution is Jan. 20....",End quote
987,12/7/20,24758366,Trump adviser Kudlow praises Yellen Treasury n...,News,Neutral,Politico,"No by-line,",,Washington,White House chief economic adviser Larry Kudlo...,...,0,64,,0.122538,,0.137634,,Not present,Kudlow said he believes that the bipartisan le...,End quote
1013,12/6/20,24641584,THE NATION; Trump hammers Georgia over Biden w...,News,Neutral,Los Angeles Times,"No by-line,",,Georgia,President Trump pressed his grievances over lo...,...,0,159,,0.330416,,0.341935,,Not present,"""I want to see what happens between now and Ja...",End quote
1016,12/6/20,24659805,New director steps into whirlwind election,News,Neutral,The Sun,MARA KNAUB SUN STAFF WRITER,,Arizona,"Tiffany Anderson ""stepped into the middle of a...",...,0,42,1.0,0.074398,0.0,0.090323,0.2,Not present,"""If you want your vote to count, make sure you...",End quote


In [18]:
#List the distinct last paragraphs of opinion articles flagged as ending with a quote
df.lastpara[(df.endquote == "End quote") & (df.news_opinion == "Opinion")].value_counts()

Leslie Marshall joined Fox News Channel as a contributor in 2009; providing analysis on both political and social issues from a liberal point of view. A nationally syndicated talk host, whose program, "The Leslie Marshall Show" can be heard on radio, stream, "Tune In," "The Progressive Voices Radio Network," and "The Armed Forces Radio Network."               2
Ben Shapiro, 36, is a graduate of UCLA and Harvard Law School, host of "The Ben Shapiro Show" and editor-in-chief of DailyWire.com. He is the author of the New York Times bestsellers "How to Destroy America in Three Easy Steps," "The Right Side of History" and "Bullies."                                                                                           1
Reed Galen and Steve Schmidt are co-founders of The Lincoln Project. Rick Wilson is a co-founder and author of "Everything Trump Touches Dies." Stuart Stevens is a senior adviser to The Lincoln Project and author of "It Was All A Lie." This column also reflects the views 

#### TF-IDF word comparisons 

In [22]:
def remove_digits(comment):
    '''(str -> str) Return comment with every character for which str.isdigit() is true removed.'''
    kept_chars = (char for char in comment if not char.isdigit())
    return ''.join(kept_chars)

#Strip digits from the article text before vectorising
#Missing text is replaced by "" first, because remove_digits iterates over its argument and would fail on NaN
df['body_without_digits'] = df['Full Text'].fillna("").apply(remove_digits)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

#Build a document-term count matrix from the digit-free article text, using the vectorizer's default settings
countvec = CountVectorizer()
#fit_transform learns the vocabulary and returns the counts as a sparse matrix (one row per article)
sparse_dtm = countvec.fit_transform(df['body_without_digits'])

In [24]:
sparse_dtm

<3535x39924 sparse matrix of type '<class 'numpy.int64'>'
	with 1183504 stored elements in Compressed Sparse Row format>

In [25]:
#Dense document-term frame: one row per article (same index as df), one column per vocabulary term
#get_feature_names() was removed from newer scikit-learn releases; prefer
#get_feature_names_out() and fall back to the old name on older installs
count_terms = (countvec.get_feature_names_out()
               if hasattr(countvec, "get_feature_names_out")
               else countvec.get_feature_names())
#NOTE: toarray() materialises the full articles x vocabulary matrix in memory
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=count_terms, index=df.index)
dtm.head()

Unnamed: 0,__,___,____,______,________,__________,___________,____________,_____________,______________,...,ïfi,último,𝗟𝗼𝗿𝗱𝗲,𝗧𝗵𝗶𝘀,𝗯𝘆,𝗰𝗹𝗮𝗶𝗺,𝗱𝗶𝘀𝗽𝘂𝘁𝗲𝗱,𝗶𝘀,𝗼𝗳𝗳𝗶𝗰𝗶𝗮𝗹,𝘀𝗼𝘂𝗿𝗰𝗲𝘀
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#Ten terms with the highest total count across all articles
term_totals = dtm.sum()
term_totals.sort_values(ascending=False).head(10)

the      152379
to        81735
of        66544
and       60878
in        59034
that      38093
for       24795
on        23453
is        21700
trump     20558
dtype: int64

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Same digit-free text as the count matrix, but weighted with TF-IDF (default vectorizer settings)
tfidfvec = TfidfVectorizer()
#fit_transform learns the vocabulary and returns the TF-IDF weights as a sparse matrix
sparse_tfidf = tfidfvec.fit_transform(df['body_without_digits'])
#Display the matrix summary (shape, dtype, number of stored elements)
sparse_tfidf

<3535x39924 sparse matrix of type '<class 'numpy.float64'>'
	with 1183504 stored elements in Compressed Sparse Row format>

In [None]:
#Dense TF-IDF frame: one row per article (same index as df), one column per vocabulary term
#get_feature_names() was removed from newer scikit-learn releases; prefer
#get_feature_names_out() and fall back to the old name on older installs
tfidf_terms = (tfidfvec.get_feature_names_out()
               if hasattr(tfidfvec, "get_feature_names_out")
               else tfidfvec.get_feature_names())
#NOTE: toarray() materialises the full articles x vocabulary matrix in memory
tfidf = pd.DataFrame(sparse_tfidf.toarray(), columns=tfidf_terms, index=df.index)
tfidf.head()

In [None]:
#Twenty terms with the highest TF-IDF weight reached in any single article
peak_weights = tfidf.max()
peak_weights.sort_values(ascending=False).head(20)

In [None]:
#Attach the news/opinion label to the TF-IDF frame (rows align on the shared index)
#The label column is named 'newsop_' rather than 'news_opinion' - presumably to avoid clashing with a vocabulary term; TODO confirm
tfidf['newsop_'] = df['news_opinion']
tfidf.head()

In [None]:
#Split the TF-IDF rows by label so the two groups can be inspected separately
is_news = tfidf['newsop_'] == 'News'
is_opinion = tfidf['newsop_'] == 'Opinion'
news = tfidf.loc[is_news]
opinion = tfidf.loc[is_opinion]

#Sixty terms with the highest TF-IDF weight reached in any news article
#(numeric_only=True leaves the text label column out of the max)
news_peak_weights = news.max(numeric_only=True)
news_peak_weights.sort_values(ascending=False).head(60)