In [45]:
import numpy as np
import pandas as pd
import regex as re
import collections
import spacy
nlp = spacy.load("en_core_web_sm")

# Data

In [46]:
# Loading cleaned, labelled article data as a Pandas dataframe
data = pd.read_excel("Labelled_VR_data_Oct2020_Jan2021_wfulltext.xlsx")

In [47]:
# Converting article status strings to article status indicators.
# 1 = Opinion, 0 = News
def indicator(art_status):
    if art_status == "News":
        return 0
    elif art_status == "Opinion":
        return 1
    else:
        print("Warning: art_status is neither news nor opinion")
        return -1
    
# Applying article indicator conversion function to the dataframe    
data["Article Status Int"] = data["Article Status"].apply(indicator)

In [48]:
# Spltting the article data by news and opinion label
news = data[data["Article Status"] == "News"]
opinion = data[data["Article Status"] == "Opinion"]

# Shiny's Features

In [49]:
#linguistic features —  SENTLENGTH and TOKENLENGTH
#hypothesis being that opinion texts, such as editorials, tend to feature longer sentences 
#art_str is the article as a String

def sent_len(art_str):
    """
    Returns the average sentence length measured in tokens (inverted).
    """
    dots = [p for p in range(len(art_str)) if art_str[p] == "."]
    sentences = [dots[i+1] - dots[i] for i in range(len(dots) - 1)]
    return (1/np.average(sentences)) if sentences else (1/len(art_str))
    
def token_len(art_str):
    """
    Returns the average token length measured in characters (inverted).
    """
    wordList = art_str.split()
    lengths = [len(w) for w in wordList]
    return 1/(np.average(lengths)), np.sum(lengths)

In [50]:
#linguistic features — NEGATION and NEGATIONSUFFIX 
#NEGATION — OPINION
#NEGATIONSUFFIX — NEWS
#hypothesis being that opinion texts tend to have more negations 
#n't is used commonly in citations or quotes

negations = ["no", "not", "none", "no one", "nobody", "neither", "nowhere", "nothing", "never"]
neg_suffix = "n\'t"

In [51]:
def negations_count(art_str):
    wordList = art_str.lower().split()
    total, suffix_total = 0, 0
    for w in wordList:
        if w in negations:
            total += 1
        if w[-3:] == neg_suffix:
            suffix_total += 1
    return total, suffix_total

In [52]:
#linguistic features — QUESTIONS, EXCLAMATIONS, COMMAS, and SEMICOLONS
#hypothesis being that opinion texts tend to use more exclamations and (rhetorical) questions
#exclamation marks, question marks, semicolons, and commas
#QUESTIONS, EXCLAMATIONS, COMMAS, SEMICOLONS — OPINION
#COMMAS — NEWS
 
def punctuation_count(art_str):
    """
    Determines the numbers of exclamation marks, question marks, semicolons, and commas,
    as compared to other punctuation symbols. 
    """
    count = [0] * 5
    for i in range(len(art_str)):
        if art_str[i] == "?":
            count[0] += 1
        elif art_str[i] == "!":
            count[1] += 1
        elif art_str[i] == ",":
            count[2] += 1
        elif art_str[i] == ";":
            count[3] += 1
        elif art_str[i] == ".":
            count[4] += 1
    return count

In [53]:
#linguistic features — CONNECTIVES (Temporal, Casual, Contrastive, Expansive)
#hypothesis being that there are more connectives in news (aftermath of study)
#casual, expansive, temporal, contrastive — NEWS 


casual = ['after', 'because', 'insofar as', 'by', 'in turn', 'for', 'once', 'as a result', 'hence', 'in the end', 
          'by then', 'but', 'subsequently', 'as', 'therefore', 'unless', 'thus', 'accordingly', 'so that', 'since', 
          'consequently', 'indeed', 'ultimately', 'then', 'even though', 'now that', 'finally”,”hence”,”if', 'although', 
          'so', 'thereby', 'otherwise', 'due to', 'and', 'when']

contrastive = ['nor', 'in fact', 'despite', 'equally', 'by comparison', 'contrast', 'by contrast', 'but', 'separately', 
               'whereas', 'rather', 'meanwhile', 'also', 'even so', 'and', 'though', 'if', 'unlike', 'however', 'or', 'then', 
               'nevertheless', 'yet', 'even though', 'conversely', 'nonetheless', 'on the contrary', 'in contrast', 'while', 
               'likewise', 'instead', 'although', 'on the other hand', 'still', 'similarly', 'otherwise', 'actually', 
               'alternatively', 'on the one hand', 'when']

temporal = ["before", "after", "next", "shortly", "afterwards", "eventually", "firstly", "secondly", "previously", "meanwhile",
            "finally", "while", "then", "earlier", "when", "initially", "soon", "suddenly", "until", "once", "recently", "already", "as"]

expansive = ["also", "and", "as well as", "besides", "in addition", "furthermore", "in fact", "moreover", "additionally",
             "too", "further", "or", "neither", "nor", "either"]

In [54]:
def connective_count(art_str):
    # casual, contrastive, temporal, expansive
    connectives = [0] * 4
    wordList = art_str.lower().split()
    for w in wordList:
        if w in casual:
            connectives[0] += 1
        elif w in contrastive:
            connectives[1] += 1
        elif w in temporal:
            connectives[2] += 1
        elif w in expansive:
            connectives[3] += 1
    return connectives

In [55]:
#linguistic features — PRONOUNS outside of quotes — OPINION
#study used first and second person only


first_person = ['I', 'we', 'our', 'ourselves', 'us', 'me', 'my', 'mine', 'myself']
second_person = ['you', 'yours', 'your', 'yourself', 'yourselves']
third_person = ['he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
                'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']

In [56]:
def pronouns_count(art_str):
    wordList = re.sub("[^\w]", " ",  art_str).split()
    pronoun_count = [0] * 4
    for w in wordList:
        """0: 1st person
           1: 2nd person
           2: 3rd person
           3: 1st and 2nd person"""
        w_lower = w.lower()
        if w_lower in first_person:
            pronoun_count[0] += 1
            pronoun_count[3] += 1
        elif w_lower in second_person:
            pronoun_count[1] += 1
            pronoun_count[3] += 1
        elif w_lower in third_person:
            pronoun_count[2] += 1
    return pronoun_count

In [57]:
#linguistic features — CITATIONS and CITATIONLENGTH — OPINION
#hypothesis being that higher frequencies of citations are more indicative of opinion (aftermath of study)

def find_citation(art_str):
    """
    Returns the citation length and frequency in the article. 
    """
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes])
    return [num_citations, avg_citation_len]

In [58]:
#linguistic features — MODALS — OPINION
#hypothesis being that modal verbs are expected to be a better indicator for opinion 
#according to study, is less potent but do correspond to the genre as expected

modals = ["can", "must", "may", "could", "might", "should", "would", "shall", "ought to"]

In [59]:
#linguistic features — VERBSofSAYING (VoS) — NEWS
#hypothesis that vos more common in news

vos = ['acknowledge','affirm','allege','announce','assert','claim','comment','contend','declare','disclose',
       'exclaim','explain','insist','mention','notify','observe','proclaim','propose','report','reveal','said','say','state',
       'stipulate','tell','write']

In [60]:
#linguistic features — FUTURE_WILL — NEWS
#frequency of the verb, "will" outside of quotes

future_will = "will"

In [61]:
#linguistic features — DIGITS — OPINION
#hypothesis being that digits are more frequent in opinion than news (aftermath of study)

def count_digits(art_str):
    """
    Returns the frequency of digits in a text. 
    """
    return len(re.findall("[\d]+",art_str))/len(art_str)

In [62]:
finite_verb_tags = ["VBD", "VBP", "VBZ", "MD", "BES", "HVS"]

In [63]:
#complexity and finite verbs

def get_finite_verbs(art_str):
    finite_verb_tags = ["VBD", "VBP", "VBZ", "MD", "BES", "HVS"]
    return [w for w in nlp(art_str) if w.tag_ in finite_verb_tags]

def calc_complexity(art_str):
    num_finite_verbs = len(get_finite_verbs(art_str))
    complexity = num_finite_verbs / token_len(art_str)[1]
    return complexity

In [64]:
#present and past tense frequency

def present_tense_freq(finite_verbs):
    present_verbs = [w for w in finite_verbs if w.tag_ == "VBZ" or w.tag_ == "VBP"]
    present_tense_frequency = len(present_verbs) / len(finite_verbs)
    return present_tense_frequency

def past_tense_freq(finite_verbs):
    past_verbs = [w for w in finite_verbs if w.tag_ == "VBD"]
    past_tense_frequency = len(past_verbs) / len(finite_verbs)
    return past_tense_frequency

In [65]:
#interjection frequency

def get_interjections(art_str):
    interjections = [w for w in nlp(art_str) if w.pos_ == "INTJ"]
    intj_freq = len(interjections) / token_len(art_str)[1]
    return intj_freq

In [66]:
# Subjectivity Dictionary
import json
mpqa_dict = json.load(open("mpqa_dict.json", "r"))

print(len(mpqa_dict))

6778


In [67]:
#linguistic features — SENTIMENT — OPINION
#hypothesis being that opinion texts employ a less neutral language
#calculated in study using MPQA Subjectivity Clues Lexicon 

def get_sentiment(art_str, mpqa_dict):
    subjectivity, adjectives = 0, 0
    words = art_str.lower().split()
    
    for w in words:
        if w in mpqa_dict:  
            if mpqa_dict[w]['pos'] == 'adj':
                adjectives += 1
            if mpqa_dict[w]['subj'] == 'weaksubj':
                subjectivity += 0.1
            if mpqa_dict[w]['subj'] == 'strongsubj':
                subjectivity += 1
          
    num_words = len(words)
    sentiment = subjectivity / num_words
    adj_ratio = adjectives / num_words
    
    return subjectivity, sentiment, adjectives, adj_ratio

In [68]:
quote_pattern = "'[^']*'"

# David's Features

In [69]:
# Cleaning string-based columns by filling na values with empty strings
data["Full Text"].fillna("");
data["Journalist Name"].fillna("");
data["Headline"].fillna("");

In [70]:
#Headline length feature
#On average, news headlines are slightly longer than opinion headlines
#df.groupby(["news_opinion"])["headline_length"].mean()
data["headline_length_feature"] = data["Headline"].str.len()

#Author count feature
#News articles tend to have slightly more authors
#df.groupby(["news_opinion","author_count"]).size()
data["author_count_feature"] = data["Cleaned Author"].str.count(", ") + 1

In [71]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
a = data["Headline"].str.len()
b = data["Cleaned Author"].str.count(", ") + 1
zipped = list(zip(a,b))
fitted = scaler.fit(zipped)
arr = scaler.transform(zipped)
t = zip(*arr)
new = list(t)
data['minmax_length'] = new[0]
data['minmax_author'] = new[1]

In [72]:
data['minmax_length'] = data['minmax_length'].fillna(0);
data['minmax_author'] = data['minmax_author'].fillna(0);

In [73]:
m1 = max(data["headline_length_feature"])
normalized1 = data["headline_length_feature"]/m1
data['normalized_length'] = normalized1
m2 = max(data["author_count_feature"])
normalized2 = data["author_count_feature"]/m2
data['normalized_author_count'] = normalized2

In [74]:
data['normalized_length'] = data['normalized_length'].fillna(0);
data['normalized_author_count'] = data['normalized_author_count'].fillna(0);

# Combined Feature Extraction Function

In [75]:
def get_all_features(record, df):
    
    # Features with no for loops
    art_str = record["Full Text"]
    minmax_length = record["minmax_length"]
    minmax_author = record["minmax_author"]
    normalized_length = record["normalized_length"]
    normalized_author_count = record["normalized_author_count"]
    
    counter = collections.Counter(art_str.lower().split())
    num_words = sum(list(counter.values()))
    
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes]) if quotes else 0
    sent_length = sent_len(art_str)
    token_length, num_tokens = token_len(art_str) 
    
    # Things we need to count
    num_modals, num_vos = 0, 0
    num_casual, num_temporal, num_contrastive, num_expansive = 0, 0, 0, 0
    first_p, second_p, third_p, first_second_p = 0, 0, 0, 0
    num_future = counter.get("will") or 0
    opinion_count, news_count = 0, 0
    num_negation, negation_suffix = 0, 0
    num_finite_verbs = 0
    digits = 0
    num_past_tense, num_pres_tense = 0, 0
    num_intjs = 0
    
    # Features with for loops
    for w in counter.keys():
        if w in vos:
            num_vos += counter.get(w)
        if w in first_person:
            first_p += counter.get(w)
            first_second_p += counter.get(w)
        if w in second_person:
            second_p += counter.get(w)
            first_second_p += counter.get(w)
        if w in third_person:
            third_p += counter.get(w)
        if w in modals:
            num_modals += counter.get(w)
        if w in casual:
            num_casual += counter.get(w)
        if w in temporal:
            num_temporal += counter.get(w)
        if w in contrastive:
            num_contrastive += counter.get(w)
        if w in expansive:
            num_expansive += counter.get(w)
        if any(char.isdigit() for char in w):
            digits += 1
        if w in opinion:
            opinion_count += counter.get(w)
        if w in news:
            news_count += counter.get(w)
        if "n't" in w:
            negation_suffix += counter.get(w)
        if w in negations:
            num_negation += counter.get(w)
    
    for w in nlp(art_str):
        if w.tag_ in finite_verb_tags:
            num_finite_verbs += 1
        if w.tag_ == "VBZ" or w.tag_ == "VBP":
            num_pres_tense += 1
        if w.tag_ == "VBD":
            num_past_tense += 1
    
    # Other 
    if num_tokens == 0:
        comlexity, interjection_freq = 0, 0
    else: 
        complexity = num_finite_verbs / num_tokens
        interjection_freq = num_intjs / num_tokens
    if num_finite_verbs == 0:
         past_freq, present_freq = 0, 0
    else:
        past_freq = num_past_tense / num_finite_verbs
        present_freq = num_pres_tense / num_finite_verbs
    
    subjectivity, sentiment, adjectives, adj_ratio = get_sentiment(art_str, mpqa_dict)
    questions, exclamations, semicolons, commas, periods = punctuation_count(art_str)
    
    # Compilation
    ling_features = [sent_length, token_length, first_p, second_p, third_p, first_second_p,
                    questions, exclamations, semicolons, commas, periods, 
                    num_casual, num_temporal, num_contrastive, num_expansive, 
                    digits, num_modals, num_vos, num_future, 
                    opinion_count, news_count,
                    num_negation, negation_suffix, num_citations, avg_citation_len, num_words,
                    subjectivity, sentiment, adjectives, adj_ratio,
                    minmax_length, minmax_author, normalized_length, normalized_author_count,
                    complexity, present_freq, past_freq, interjection_freq]
    return ling_features

# Feature Extraction Demonstration

In [76]:
# Preparing to create a dataframe where each record is a full text, and each column is a feature

# Collecting the label for each record in an array
y = []
count = 1
total = len(data)
for article_label in data["Article Status Int"]:
    y.append(article_label)
y = np.array(y)

# Collecting the features for each record in an array
X = []
for index, record in data.iterrows():
    article_features = get_all_features(record, data)
    X.append(article_features)
    
    if count == 101: # Stopper so that we only extract the features of the first 101 records. (For time's sake)
        break
    if count%100 == 0:
        print(str(count) + " of " + str(total))
    count += 1
    
X = np.array(X)

100 of 3535


In [77]:
# Creating a reference for the features and target variables we are considering
X_cols = ['sent_length', 'token_length', 'first_p', 'second_p', 'third_p', "first/second_p",
                    'questions', 'exclamations', 'semicolons', 'commas', 'periods', 
                    'num_casual', 'num_temporal', 'num_contrastive', 'num_expansive', 
                    'digits', 'num_modals', 'num_vos', 'num_future', 
                    'opinion_count', 'news_count',
                    'num_negation', 'negation_suffix', 'num_citations', 'avg_citation_len', 'num_words',
                    'subjectivity', 'sentiment', 'adjectives', 'adj_ratio',
                    'minmax_length', 'minmax_author', 'normalized_length', 'normalized_author_count', 
                    'complexity', 'present', 'past', 'interjections']
y_col = ['art_status']

In [78]:
# Creating a dataframe where each record is a full text, and each column is a feature
data_wfeats = pd.DataFrame(X, columns=X_cols).head(101) # We're only considering the first 101 records for this demo
data_wfeats = data_wfeats[X_cols]
data_wfeats['art_status'] = y[:101] # Accounting for us only considering the first 101 records for this demo
data_wfeats

Unnamed: 0,sent_length,token_length,first_p,second_p,third_p,first/second_p,questions,exclamations,semicolons,commas,...,adj_ratio,minmax_length,minmax_author,normalized_length,normalized_author_count,complexity,present,past,interjections,art_status
0,0.014900,0.201974,1.0,2.0,44.0,3.0,0.0,0.0,78.0,3.0,...,0.036988,0.142232,0.00,0.156989,0.2,0.018143,0.316176,0.602941,0.0,0
1,0.007716,0.190626,9.0,3.0,51.0,12.0,1.0,0.0,92.0,3.0,...,0.051511,0.105033,0.00,0.120430,0.2,0.017806,0.720588,0.125000,0.0,0
2,0.021549,0.200954,1.0,0.0,21.0,1.0,0.0,0.0,30.0,1.0,...,0.022152,0.155361,0.25,0.169892,0.4,0.015262,0.125000,0.770833,0.0,0
3,0.007221,0.187560,5.0,0.0,20.0,5.0,0.0,0.0,31.0,1.0,...,0.044643,0.155361,0.00,0.169892,0.2,0.013397,0.446429,0.178571,0.0,1
4,0.006043,0.187835,1.0,0.0,20.0,1.0,0.0,0.0,50.0,0.0,...,0.025397,0.113786,0.25,0.129032,0.4,0.016696,0.523810,0.321429,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.007687,0.189315,36.0,0.0,14.0,36.0,0.0,0.0,55.0,0.0,...,0.036209,0.056893,0.00,0.073118,0.2,0.014516,0.611111,0.180556,0.0,1
97,0.006857,0.199559,9.0,1.0,57.0,10.0,0.0,0.0,106.0,2.0,...,0.029633,0.089716,0.00,0.105376,0.2,0.021550,0.553488,0.283721,0.0,0
98,0.007088,0.206884,25.0,1.0,38.0,26.0,3.0,0.0,55.0,1.0,...,0.039405,0.087527,0.00,0.103226,0.2,0.020290,0.482143,0.267857,0.0,1
99,0.007906,0.191883,0.0,0.0,34.0,0.0,0.0,0.0,38.0,1.0,...,0.024551,0.122538,0.00,0.137634,0.0,0.015764,0.379310,0.494253,0.0,0
