In [114]:
import numpy as np
import pandas as pd
import collections
import re

# Loading and Preparing Data


In [164]:
# Loading cleaned, labelled article data as a Pandas dataframe
data = pd.read_csv("Labelled_VR_data_Oct2020_Jan2021_wfulltext.csv")

In [165]:
# Spltting the article data by news and opinion label
news = data[data["Article Status"] == "News"]
opinion = data[data["Article Status"] == "Opinion"]

In [166]:
# Creating an array of news fulltext strings and another of opinion fulltext strings
news_fulltext_arr = np.array(news["Full Text"])
opinion_fulltext_arr = np.array(opinion["Full Text"])

# Determining Highly Predictive, Rule-Based Phrases

GOAL: Brainstorm highly predictive phrases to use a heuristics that would precede a feature-based classification step in the news vs opinion model

In [118]:
def news_op_diff(string, opinion_arr, news_arr):
    """Given an array of opinion articles, an array of news articles, and a search string,
       Find how many opinion articles vs news articles the string appears in."""
    # Count how many opinion articles string appears in
    opinion_count = 0
    for article in opinion_arr:
        if string in article.lower():
            opinion_count+=1
    # Count how many news articles string appears in
    news_count = 0
    for article in news_arr:
        if string in article.lower():
            news_count += 1
    # Report this data
    print('"' + string + '"' + " appears in " + str(opinion_count) + "/" + str(len(opinion_arr)) + " opinion articles and " 
          + str(news_count) + "/" + str(len(news_arr)) + " news articles")

In [119]:
# Initial suggestions for good discriminatory phrases
initial_try_words = ["opinion", "analysis", "guest columnist", "letter to the editor", "letters to the editor",
                    "editorial", "editors", "readers", "columnist", "special", "you"] + ["editor", "staff writer",
                    "staff writers","bureau", "news service", "contributing writer", "news group","bureau chief",
                    "contributed", "compiled by", "staff","editor in chief", "editor-in-chief", "managing editor", 
                    "political editor", "editor-at-large", "correspondent", "yesterday", "last", "say", "some", 
                    "company", "official", "plan", "here", "mr"]

# My (Daniel's) suggestions for good discriminatory phrases
daniels_try_words = ["editor at large", "editor-at-large", "opinion by", "follow him on twitter",
                "opinion piece", "the ap is solely responsible for this content", "this guide will",
                "analysis by", "news by", "opinion section"]

### Domain Research on Newspaper Publications

We suspect the most likely source of a strong predictive phrase has to do with the qualifications of the author, so I looked at the differences between reporters, editors, and columnists: <br />

https://customerservice.globe.com/hc/en-us/articles/360020398232-What-is-the-difference-between-a-reporter-editor-and-columnist-<br />

**Reporters**: "gathers facts and information... reporter is supposed to provide objective observation" (seems more like news)<br />

**Editors**: "assign reporters, decide which news events to cover, edit (revise)reporters' stories, decide what stories get published" (seems more like news)<br />

**Columnists**: "gives opinions, usually his or her own. A columnist is expected to gather accurate information, just as a reporter does, and then comment on that information." (more like opinion)


With this domain knowledge in mind, better search strings came out of the woodwork:

In [120]:
# Suggestions for good discriminatory phrases after research on how newspaper publication works
informed_words = ["reporter", "reporting by", "is a reporter", "reporter at", "reporter for",
                  "editor", "editing by", "is an editor", "editor at", "editor for", "editorial",
                  "columnist", "is a columnist", "columnist at", "columnist for", "guest columnist",
                  "publisher", "publication", "this publication", 
                  "op-ed", "letters to the editor", "letter to the editor", "opposite editorial",
                  "guest essay", "my opinion", "opinion columnist", "op-ed columnist",
                  "opinion section", "this column", "opinion:", "produced by", "special thanks", 
                  "we'd like to hear"]

# Words that had the highest difference in frequency between news and opinion articles
good_words = ["reporting by", "is a reporter", "reporter at", "editing by", "is a columnist",
             "columnist at", "colunmnist for", "guest columnist", "letters to the editor",
             "op-ed columnist", "opinion section", "this column", "opinion:", "we'd like to hear"]

# Words that both had a high difference in frequency between neews and opinion articles
# and occured enough that small sample size was not a concern
best_words = ["reporting by","editing by"]

In [121]:
# Show results for predictive power of the discriminatory phrases we came up with
for try_word in best_words:
    news_op_diff(try_word, opinion_fulltext_arr, news_fulltext_arr)

"reporting by" appears in 1/606 opinion articles and 152/2945 news articles
"editing by" appears in 1/606 opinion articles and 147/2945 news articles


## Setting Up Shiny's Feature Extraction Functions

In [123]:
def sent_len(art_str):
    """
    Returns the average sentence length measured in tokens (inverted).
    """
    dots = [p for p in range(len(art_str)) if art_str[p] == "."]
    sentences = [dots[i+1] - dots[i] for i in range(len(dots) - 1)]
    return (1/np.average(sentences))

In [124]:
def token_len(art_str):
    """
    Returns the average token length measured in characters (inverted).
    """
    wordList = re.sub("[^\w]", " ",  art_str).split()
    return 1/(np.average([len(w) for w in wordList]))

In [125]:
negations = ["no", "not", "none", "no one", "nobody", "neither", "nowhere", "nothing", "never"]
neg_suffix = "n't"

def negations_count(art_str):
    wordList = re.sub("[^\w]", " ",  art_str).split()
    total = 0
    for w in wordList:
        if w.lower() in negations:
            total += 1
    return total

In [126]:
def punctuation_count(art_str):
    """
    Determines the numbers of exclamation marks, question marks, semicolons, and commas,
    as compared to other punctuation symbols. 
    """
    count = [0] * 5
    for i in range(len(art_str)):
        if art_str[i] == "?":
            count[0] += 1
        elif art_str[i] == "!":
            count[1] += 1
        elif art_str[i] == ",":
            count[2] += 1
        elif art_str[i] == ";":
            count[3] += 1
        elif art_str[i] == ".":
            count[4] += 1
    return count

In [127]:
casual = ['after', 'because', 'insofar as', 'by', 'in turn', 'for', 'once', 'as a result', 'hence', 'in the end', 
          'by then', 'but', 'subsequently', 'as', 'therefore', 'unless', 'thus', 'accordingly', 'so that', 'since', 
          'consequently', 'indeed', 'ultimately', 'then', 'even though', 'now that', 'finally”,”hence”,”if', 'although', 
          'so', 'thereby', 'otherwise', 'due to', 'and', 'when']

contrastive = ['nor', 'in fact', 'despite', 'equally', 'by comparison', 'contrast', 'by contrast', 'but', 'separately', 
               'whereas', 'rather', 'meanwhile', 'also', 'even so', 'though', 'unlike', 'however', 'or',  
               'nevertheless', 'yet', 'even though', 'conversely', 'nonetheless', 'on the contrary', 'in contrast', 'while', 
               'likewise', 'instead', 'although', 'on the other hand', 'still', 'similarly', 'otherwise', 'actually', 
               'alternatively', 'on the one hand']

temporal = ["before", "after", "next", "shortly", "afterwards", "eventually", "firstly", "secondly", "previously", "meanwhile",
            "finally", "while", "then", "earlier", "when", "initially", "soon", "suddenly", "until", "once", "recently", "already", "as"]

expansive = ["also", "and", "as well as", "besides", "in addition", "furthermore", "in fact", "moreover", "additionally",
             "too", "further", "or", "neither", "nor", "either"]

def connective_count(art_str):
    # casual, contrastive, temporal, expansive
    connectives = [0] * 4
    wordList = re.sub("[^\w]", " ",  art_str).split()
    for w in wordList:
        if w.lower() in casual:
            connectives[0] += 1
        elif w.lower() in contrastive:
            connectives[1] += 1
        elif w.lower() in temporal:
            connectives[2] += 1
        elif w.lower() in expansive:
            connectives[3] += 1
    return connectives

In [128]:
first_person = ['I', 'we', 'our', 'ourselves', 'us', 'me', 'my', 'mine', 'myself']

second_person = ['you', 'yours', 'your', 'yourself', 'yourselves']

third_person = ['he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
                'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']

def pronouns_count(art_str):
    wordList = re.sub("[^\w]", " ",  art_str).split()
    pronoun_count = [0] * 3
    for w in wordList:
        if w.lower() in first_person:
            pronoun_count[0] += 1
        elif w.lower() in second_person:
            pronoun_count[1] += 1
        elif w.lower() in third_person:
            pronoun_count[2] += 1
    return pronoun_count

In [129]:
def find_citation(art_str):
    """
    Returns the citation length and frequency in the article. 
    """
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes])
    return [num_citations, avg_citation_len]

In [130]:
modals = ["can", "must", "may", "could", "might", "should", "would", "shall", "ought to"]
vos = ["announce", "claim", "declare", "explain", "insist", "mention", "exclaim", "state", "say", "said"]
future_will = "will"

In [131]:
def count_digits(art_str):
    """
    Returns the frequency of digits in a text. 
    """
    return len(re.findall("[\d]",art_str))/len(art_str)

In [132]:
# Shiny's feature extraction function
def get_all_features(art_str):
    counter = collections.Counter(art_str.lower().split())
    num_words = sum([counter.get(w) for w in counter.keys()])
    
    questions, exclamations, semicolons, commas, periods = punctuation_count(art_str)
    first_p, second_p, third_p = 0, 0, 0
    num_modals, num_vos = 0, 0
    num_casual, num_temporal, num_contrastive, num_expansive = 0, 0, 0, 0
    digits = 0
    num_future = counter.get("will")
    opinion_count, news_count = 0, 0
    num_negation, negation_suffix = 0, 0
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes])
    sent_length = sent_len(art_str)
    token_length = token_len(art_str)
    
    for w in counter.keys():
        if w in vos:
            num_vos += counter.get(w)
        elif w in first_person:
            first_p += counter.get(w)
        elif w in second_person:
            second_p += counter.get(w)
        elif w in third_person:
            third_p += counter.get(w)
        elif w in modals:
            num_modals += counter.get(w)
        elif w in casual:
            num_casual += counter.get(w)
        elif w in temporal:
            num_temporal += counter.get(w)
        elif w in contrastive:
            num_contrastive += counter.get(w)
        elif w in expansive:
            num_expansive += counter.get(w)
        elif any(char.isdigit() for char in w):
            digits += 1
        elif w in opinion:
            opinion_count += counter.get(w)
        elif w in news:
            news_count += counter.get(w)
        elif "n't" in w:
            negation_suffix += counter.get(w)
        elif w in negations:
            num_negation += counter.get(w)
            
    ling_features = [sent_length, token_length, first_p, second_p, third_p, 
                    questions, exclamations, semicolons, commas, periods, 
                    num_casual, num_temporal, num_contrastive, num_expansive, 
                    digits, num_modals, num_vos, num_future, 
                    opinion_count, news_count,
                    num_negation, negation_suffix, num_citations, avg_citation_len, num_words]
    
    return ling_features

In [133]:
# Preparing to create a dataframe where each record is a full text, and each column is a feature

# Collecting the label for each record in an array
y = []
for article_label in data["Article Status"]:
    y.append(article_label)
y = np.array(y)

# Collecting the features for each record in an array
X = []
for article in data["Full Text"]:
    article_features = get_all_features(article)
    X.append(article_features)
X = np.array(X)
 

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [134]:
# Creating a reference for the features and target variables we are considering
X_cols = ['sent_length', 'token_length', 'first_p', 'second_p', 'third_p', 
                    'questions', 'exclamations', 'semicolons', 'commas', 'periods', 
                    'num_casual', 'num_temporal', 'num_contrastive', 'num_expansive', 
                    'digits', 'num_modals', 'num_vos', 'num_future', 
                    'opinion_count', 'news_count',
                    'num_negation', 'negation_suffix', 'num_citations', 'avg_citation_len', 'num_words']
y_col = ['art_status']

In [135]:
# Creating a dataframe where each record is a full text, and each column is a feature

data_wfeats = pd.DataFrame(X, columns=X_cols)
data_wfeats['art_status'] = y
data_wfeats.head()

Unnamed: 0,sent_length,token_length,first_p,second_p,third_p,questions,exclamations,semicolons,commas,periods,...,num_vos,num_future,opinion_count,news_count,num_negation,negation_suffix,num_citations,avg_citation_len,num_words,art_status
0,0.0149,0.21707,1,2,44,0,0,78,3,133,...,8,6,0,0,8,0,0,,1514,News
1,0.007716,0.200762,9,3,51,1,0,92,3,71,...,2,2,0,0,10,2,41,65.195122,1456,News
2,0.021549,0.224333,1,0,21,0,0,30,1,80,...,0,3,0,0,2,0,0,,632,Opinion
3,0.007221,0.196564,5,0,20,0,0,31,1,36,...,1,3,0,0,6,2,5,113.4,784,Opinion
4,0.006043,0.197957,1,0,20,0,0,50,0,36,...,11,2,0,0,8,0,0,,945,News


In [136]:
# Dropping records from this dataframe where feature values are missing
data_wfeats = data_wfeats.dropna()
data_wfeats.head()

#FIXME oh my that's quite a lot lost

Unnamed: 0,sent_length,token_length,first_p,second_p,third_p,questions,exclamations,semicolons,commas,periods,...,num_vos,num_future,opinion_count,news_count,num_negation,negation_suffix,num_citations,avg_citation_len,num_words,art_status
1,0.007716,0.200762,9,3,51,1,0,92,3,71,...,2,2,0,0,10,2,41,65.195122,1456,News
3,0.007221,0.196564,5,0,20,0,0,31,1,36,...,1,3,0,0,6,2,5,113.4,784,Opinion
5,0.009358,0.20407,15,4,9,0,0,27,1,38,...,2,9,0,0,9,1,1,18.0,667,Opinion
6,0.010215,0.196864,0,2,25,1,0,66,0,88,...,6,2,0,0,2,3,8,30.75,1318,News
7,0.014028,0.203455,1,0,0,0,0,5,0,8,...,0,1,0,0,1,0,2,28.0,103,Opinion


In [137]:
from sklearn.model_selection import train_test_split

# Train test split for articles in our dataset
train, test = train_test_split(data_wfeats, test_size=0.30, random_state=42)

In [154]:
# Defining the data matrix and label vector for train and test data sets
X_train = train[X_cols].to_numpy()
y_train = train[y_col].to_numpy().flatten()

X_test = test[X_cols].to_numpy()
y_test = test[y_col].to_numpy().flatten()

In [155]:
# Fitting a support vector classifier on X_train and y_train
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)

SVC(kernel='linear')

In [163]:
from sklearn.metrics import f1_score

# Evaluating SVM performance on test data
y_pred = svc.predict(X_test)
y_true = y_test

# Initial f1 score
f1_score(y_true, y_pred, average='macro')

0.7649685727283699