In [1]:
import pandas as pd
import numpy as np
import collections
import re

# spacy code below: 
# import spacy
# nlp = spacy.load("en_core_web_sm")

data = pd.read_csv("Labelled_VR_data_Oct2020_Jan2021_wfulltext.csv")

# Common Opinion and News Terms (Additional Features)

In [2]:
#possible opinion terms

opinion = ["opinion", 
           "analysis",  
           "guest columnist", 
           "letter to the editor", 
           "letters to the editor",
           "editorial", 
           "editor",
           "column",
           "editorial board",
           "editors", 
           "readers", 
           "columnist", 
           "special", 
           "you", 
          ]

#possible news terms

news = ["editor", 
        "staff writer",
        "staff writers",
        "bureau", 
        "news",
        "news service", 
        "contributing writer",   
        "news group",
        "bureau chief",
        "contributed", 
        "compiled by", 
        "staff",
        "editor in chief", 
        "editor-in-chief", 
        "managing editor", 
        "political editor", 
        "editor-at-large", 
        "correspondent", 
        "yesterday", 
        "last", 
        "say"
        "some", 
        "company", 
        "official", 
        "plan",
        "here",
        "mr",
        "associated press", 
        "reuters", 
        "report"
       ]


# Linguistic Features

In [3]:
#linguistic features —  SENTLENGTH and TOKENLENGTH
#hypothesis being that opinion texts, such as editorials, tend to feature longer sentences 
#art_str is the article as a String

def sent_len(art_str):
    """
    Returns the average sentence length measured in tokens (inverted).
    """
    dots = [p for p in range(len(art_str)) if art_str[p] == "."]
    sentences = [dots[i+1] - dots[i] for i in range(len(dots) - 1)]
    return (1/np.average(sentences)) if sentences else (1/len(art_str))
    
def token_len(art_str):
    """
    Returns the average token length measured in characters (inverted).
    """
    wordList = art_str.split()
    lengths = [len(w) for w in wordList]
    return 1/(np.average(lengths)), np.sum(lengths)


In [4]:
#linguistic features — NEGATION and NEGATIONSUFFIX 
#NEGATION — OPINION
#NEGATIONSUFFIX — NEWS
#hypothesis being that opinion texts tend to have more negations 
#n't is used commonly in citations or quotes

negations = ["no", "not", "none", "no one", "nobody", "neither", "nowhere", "nothing", "never"]
neg_suffix = "n\'t"


In [5]:
def negations_count(art_str):
    wordList = art_str.lower().split()
    total, suffix_total = 0, 0
    for w in wordList:
        if w in negations:
            total += 1
        if w[-3:] == neg_suffix:
            suffix_total += 1
    return total, suffix_total
    

In [6]:
#linguistic features — QUESTIONS, EXCLAMATIONS, COMMAS, and SEMICOLONS
#hypothesis being that opinion texts tend to use more exclamations and (rhetorical) questions
#exclamation marks, question marks, semicolons, and commas
#QUESTIONS, EXCLAMATIONS, COMMAS, SEMICOLONS — OPINION
#COMMAS — NEWS
 
def punctuation_count(art_str):
    """
    Determines the numbers of exclamation marks, question marks, semicolons, and commas,
    as compared to other punctuation symbols. 
    """
    count = [0] * 5
    for i in range(len(art_str)):
        if art_str[i] == "?":
            count[0] += 1
        elif art_str[i] == "!":
            count[1] += 1
        elif art_str[i] == ",":
            count[2] += 1
        elif art_str[i] == ";":
            count[3] += 1
        elif art_str[i] == ".":
            count[4] += 1
    return count


In [7]:
#linguistic features — CONNECTIVES (Temporal, Casual, Contrastive, Expansive)
#hypothesis being that there are more connectives in news (aftermath of study)
#casual, expansive, temporal, contrastive — NEWS 


casual = ['after', 'because', 'insofar as', 'by', 'in turn', 'for', 'once', 'as a result', 'hence', 'in the end', 
          'by then', 'but', 'subsequently', 'as', 'therefore', 'unless', 'thus', 'accordingly', 'so that', 'since', 
          'consequently', 'indeed', 'ultimately', 'then', 'even though', 'now that', 'finally”,”hence”,”if', 'although', 
          'so', 'thereby', 'otherwise', 'due to', 'and', 'when']

contrastive = ['nor', 'in fact', 'despite', 'equally', 'by comparison', 'contrast', 'by contrast', 'but', 'separately', 
               'whereas', 'rather', 'meanwhile', 'also', 'even so', 'and', 'though', 'if', 'unlike', 'however', 'or', 'then', 
               'nevertheless', 'yet', 'even though', 'conversely', 'nonetheless', 'on the contrary', 'in contrast', 'while', 
               'likewise', 'instead', 'although', 'on the other hand', 'still', 'similarly', 'otherwise', 'actually', 
               'alternatively', 'on the one hand', 'when']

temporal = ["before", "after", "next", "shortly", "afterwards", "eventually", "firstly", "secondly", "previously", "meanwhile",
            "finally", "while", "then", "earlier", "when", "initially", "soon", "suddenly", "until", "once", "recently", "already", "as"]

expansive = ["also", "and", "as well as", "besides", "in addition", "furthermore", "in fact", "moreover", "additionally",
             "too", "further", "or", "neither", "nor", "either"]


In [8]:
def connective_count(art_str):
    # casual, contrastive, temporal, expansive
    connectives = [0] * 4
    wordList = art_str.lower().split()
    for w in wordList:
        if w in casual:
            connectives[0] += 1
        elif w in contrastive:
            connectives[1] += 1
        elif w in temporal:
            connectives[2] += 1
        elif w in expansive:
            connectives[3] += 1
    return connectives


In [9]:
#linguistic features — PRONOUNS outside of quotes — OPINION
#study used first and second person only


first_person = ['I', 'we', 'our', 'ourselves', 'us', 'me', 'my', 'mine', 'myself']
second_person = ['you', 'yours', 'your', 'yourself', 'yourselves']
third_person = ['he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
                'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves']


In [10]:
def pronouns_count(art_str):
    wordList = art_str.lower().split()
    pronoun_count = [0] * 3
    for w in wordList:
        if w in first_person:
            pronoun_count[0] += 1
        elif w in second_person:
            pronoun_count[1] += 1
        elif w in third_person:
            pronoun_count[2] += 1
    return pronoun_count
    

In [11]:
#linguistic features — CITATIONS and CITATIONLENGTH — OPINION
#hypothesis being that higher frequencies of citations are more indicative of opinion (aftermath of study)

def find_citation(art_str):
    """
    Returns the citation length and frequency in the article. 
    """
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes])
    return [num_citations, avg_citation_len]


In [12]:
#linguistic features — MODALS — OPINION
#hypothesis being that modal verbs are expected to be a better indicator for opinion 
#according to study, is less potent but do correspond to the genre as expected

modals = ["can", "must", "may", "could", "might", "should", "would", "shall", "ought to"]


In [13]:
#linguistic features — VERBSofSAYING (VoS) — NEWS
#hypothesis that vos more common in news

vos = ['acknowledge','affirm','allege','announce','assert','claim','comment','contend','declare','disclose',
       'exclaim','explain','insist','mention','notify','observe','proclaim','propose','report','reveal','said','say','state',
       'stipulate','tell','write']


In [14]:
#linguistic features — FUTURE_WILL — NEWS
#frequency of the verb, "will" outside of quotes

future_will = "will"


In [15]:
#linguistic features — DIGITS — OPINION
#hypothesis being that digits are more frequent in opinion than news (aftermath of study)

def count_digits(art_str):
    """
    Returns the frequency of digits in a text. 
    """
    return len(re.findall("[\d]+",art_str))/len(art_str)


In [16]:
#complexity and finite verbs

def get_finite_verbs(art_str):
    finite_verb_tags = ["VBD", "VBP", "VBZ", "MD", "BES", "HVS"]
    return [w for w in nlp(art_str) if w.tag_ in finite_verb_tags]

def calc_complexity(art_str):
    num_finite_verbs = len(get_finite_verbs(art_str))
    complexity = num_finite_verbs / token_len(art_str)[1]
    return complexity


In [17]:
#present and past tense frequency

def present_tense_freq(finite_verbs):
    present_verbs = [w for w in finite_verbs if w.tag_ == "VBZ" or w.tag_ == "VBP"]
    present_tense_frequency = len(present_verbs) / len(finite_verbs)
    return present_tense_frequency

def past_tense_freq(finite_verbs):
    past_verbs = [w for w in finite_verbs if w.tag_ == "VBD"]
    past_tense_frequency = len(past_verbs) / len(finite_verbs)
    return past_tense_frequency


In [18]:
#interjection frequency

def get_interjections(art_str):
    interjections = [w for w in nlp(art_str) if w.pos_ == "INTJ"]
    intj_freq = len(interjections) / token_len(art_str)[1]
    return intj_freq


In [19]:
# Subjectivity Dictionary
import json
mpqa_dict = json.load(open("mpqa_dict.json", "r"))


In [20]:
#linguistic features — SENTIMENT — OPINION
#hypothesis being that opinion texts employ a less neutral language
#calculated in study using MPQA Subjectivity Clues Lexicon 

def get_sentiment(art_str, mpqa_dict):
    subjectivity, adjectives = 0, 0
    words = art_str.lower().split()
    
    for w in words:
        if w in mpqa_dict:  
            if mpqa_dict[w]['pos'] == 'adj':
                adjectives += 1
            if mpqa_dict[w]['subj'] == 'weaksubj':
                subjectivity += 0.1
            if mpqa_dict[w]['subj'] == 'strongsubj':
                subjectivity += 1
          
    num_words = len(words)
    sentiment = subjectivity / num_words
    adj_ratio = adjectives / num_words
    
    return subjectivity, sentiment, adjectives, adj_ratio


In [21]:
def get_all_features(art_str):
    counter = collections.Counter(art_str.lower().split())
    num_words = sum([counter.get(w) for w in counter.keys()])
    questions, exclamations, semicolons, commas, periods = punctuation_count(art_str)
    first_p, second_p, third_p = 0, 0, 0
    num_modals, num_vos = 0, 0
    num_casual, num_temporal, num_contrastive, num_expansive = 0, 0, 0, 0
    digits = 0
    num_future = counter.get("will") or 0
    opinion_count, news_count = 0, 0
    num_negation, negation_suffix = 0, 0
    quotes = re.findall(r'"(.*?)"', art_str)
    within_quotes = collections.Counter(" ".join(quotes).lower().split())
    num_citations = len(quotes)
    avg_citation_len = np.average([len(q) for q in quotes]) if quotes else 0
    sent_length = sent_len(art_str)
    token_length = token_len(art_str)[0]
    #finite_verbs = get_finite_verbs(art_str)
    #complexity = calc_complexity(art_str)
    #present_freq = present_tense_freq(finite_verbs)
    #past_freq = past_tense_freq(finite_verbs)
    #interjection_freq = get_interjections(art_str)
    
    for w in counter.keys():
        if w in vos:
            num_vos += counter.get(w)
        elif w in first_person:
            first_p += counter.get(w)
        elif w in second_person:
            second_p += counter.get(w)
        elif w in third_person:
            third_p += counter.get(w)
        elif w in modals:
            num_modals += counter.get(w)
        elif w in casual:
            num_casual += counter.get(w)
        elif w in temporal:
            num_temporal += counter.get(w)
        elif w in contrastive:
            num_contrastive += counter.get(w)
        elif w in expansive:
            num_expansive += counter.get(w)
        elif any(char.isdigit() for char in w):
            digits += 1
        elif w in opinion:
            opinion_count += counter.get(w)
        elif w in news:
            news_count += counter.get(w)
        elif "n't" in w:
            negation_suffix += counter.get(w)
        elif w in negations:
            num_negation += counter.get(w)
    
    first_second_p = first_p + second_p
            
    ling_features = [sent_length, token_length, first_p, second_p, third_p, first_second_p, 
                    questions, exclamations, semicolons, commas, periods, 
                    num_casual, num_temporal, num_contrastive, num_expansive, 
                    digits, num_modals, num_vos, num_future, 
                    opinion_count, news_count,
                    num_negation, negation_suffix, num_citations, avg_citation_len, num_words]
                    # complexity, present_freq, past_freq, interjection_freq]
        
    return ling_features


In [22]:
# David's Additional Features Code

#These Journalist Name terms are only found in opinion articles

df = data
name_only_opinion = ["opinion",
                    "letters? to the editor",
                    "letters\:",
                    "editorial board",
                    "readers", 
                    "columnist"]

#These Headline terms are only found in opinion articles
headline_only_opinion = ["letters? to the editor",
                        "letters\:", 
                        "columnist"]

#These Full Text terms are only found in opinion articles
fulltext_only_opinion = ["letters? to the editor"]


#These Journalist Name terms are only found in news articles
name_only_news = ["contributed",
            "bureau",  
            "compiled by"]

#These Full Text terms are only found in news articles
fulltext_only_news = ["\(ap\)",
                    "contributed to this report"]

#Format text fields for searching
df["fulltext_lower"] = df["Full Text"].fillna("").str.lower()
df["journo_name_lower"] = df["Journalist Name"].fillna("").str.lower()
df["headline_lower"] = df["Headline"].fillna("").str.lower()


#Create column for articles matching news-only regex patterns
#or associated press in name field where headline does not equal 'editorial roundup'
df["news_rule"] = np.where(((df["fulltext_lower"].str.contains("|".join(fulltext_only_news), regex=True))|
                           (df["journo_name_lower"].str.contains("|".join(name_only_news), regex=True))|
                           ((df["journo_name_lower"].str.contains("associated press")) & 
                            -(df["headline_lower"].str.contains("editorial roundup")))), 
                          "Present", "Not present")

#Create column for articles matching opinion-only regex patterns
df["opinion_rule"] = np.where(((df["fulltext_lower"].str.contains("|".join(fulltext_only_opinion), regex=True))|
                              (df["journo_name_lower"].str.contains("|".join(name_only_opinion), regex=True))|
                               (df["headline_lower"].str.contains("|".join(headline_only_opinion), regex=True))),
                              "Present", "Not present")

#Code to check results of rule-based classification
#df.groupby(["opinion_rule","news_rule","news_opinion"]).size()

#These terms appear more often in opinion articles
name_lean_opinion = ["editor"]

headline_lean_opinion = ["opinion", 
                          "editor",
                          "editorial",
                          "column"]

fulltext_lean_opinion = ["editorial board",
                         "columnist"]


#These terms appear more often in news articles
name_lean_news = ["staff writer",
            "staff", 
            "news"]

headline_lean_news = ["news"]

fulltext_lean_news = ["associated press",
            "reuters",
            "staff writer",
            "staff writers",
            "staff", 
            "bureau", 
            "contributed", 
            "compiled by", 
            "correspondent"]
def create_feature_from_terms(term_list, column, df=df):
    '''(List, series, df -> df) loop through (lower-case) list of terms, check for presence of that term in the specified column, 
    and create feature column where 1 denotes presence of the term'''
    
    for item in term_list:
        df[item + "_" + column + "_" + "feature"] = np.where(df[column].fillna("").str.lower().str.contains(item), 1, 0)
    
    return df
#Create dummy-coded (0/1) feature columns from terms lists
df = create_feature_from_terms(name_lean_opinion, "Media Name")
df = create_feature_from_terms(headline_lean_opinion, "Headline")
df = create_feature_from_terms(fulltext_lean_opinion, "Full Text")
df = create_feature_from_terms(name_lean_news, "Media Name")
df = create_feature_from_terms(headline_lean_news, "Headline")
df = create_feature_from_terms(fulltext_lean_news, "Full Text")
#Additional features

#Dateline feature
#News articles sometimes start with datelines, which are ALL CAPS
#This checks if the article starts with at least 3 all caps letters
df["upper_start_feature"] = np.where(df["Full Text"].str.contains('^[A-Z]{3,20} ', regex=True), 1, 0)

#MediaName feature
#The Hill, Associated Press and Reuters are all mainly news articles
df["media_lean_news_feature"] = np.where(df["Media Name"].fillna("").str.contains("Reuters|Associated Press|thehill|The Hill", regex=True), 1, 0)

#Headline length feature
#On average, news headlines are slightly longer than opinion headlines
#df.groupby(["news_opinion"])["headline_length"].mean()
df["headline_length_feature"] = df["Headline"].str.len()

#Author count feature
#News articles tend to have slightly more authors
#df.groupby(["news_opinion","author_count"]).size()
df["author_count_feature"] = df["Cleaned Author"].str.count(", ") + 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
a = df["Headline"].str.len()
b = df["Cleaned Author"].str.count(", ") + 1
zipped = list(zip(a,b))
fitted = scaler.fit(zipped)
arr = scaler.transform(zipped)
t = zip(*arr)
new = list(t)
df['minmax_length'] = new[0]
df['minmax_author'] = new[1]
m1 = max(df["headline_length_feature"])
normalized1 = df["headline_length_feature"]/m1
df['normalized_length'] = normalized1
m2 = max(df["author_count_feature"])
normalized2 = df["author_count_feature"]/m2
df['normalized_author_count'] = normalized2
df

Unnamed: 0,artdate,Article ID,Headline,Article Status,Article Position,Media Name,Journalist Name,Article Issues,Custom Group,Full Text,...,compiled by_Full Text_feature,correspondent_Full Text_feature,upper_start_feature,media_lean_news_feature,headline_length_feature,author_count_feature,minmax_length,minmax_author,normalized_length,normalized_author_count
0,1/31/21,29470814,Democratic Party Enters 2021 in Power — and Fl...,News,Neutral,The New York Times,By Shane Goldmacher,,South Carolina,The Democratic National Committee has a roughl...,...,0,0,0,0,73,1.0,0.142232,0.00,0.156989,0.2
1,1/31/21,29458845,"A call for another Great Migration, this time ...",News,Neutral,The Washington Post,Carlos Lozada,,Georgia,A Black Power Manifesto By Charles M. Blow. Ha...,...,0,0,0,0,56,1.0,0.105033,0.00,0.120430,0.2
2,1/31/21,29474652,Trump Raised $255.4 Million in 8 Weeks as He S...,News,Neutral,The New York Times,By Shane Goldmacher and Rachel Shorey,"VR: Anti-Voter Policies, VR: Civic Participati...",Commentary,The former president’s fund-raising slowed sig...,...,0,0,0,0,79,2.0,0.155361,0.25,0.169892,0.4
3,1/31/21,29484873,Democrats are faced with a choice. Protect the...,Opinion,Positive,Washington Post.com,By E.J. Dionne Jr.,"VR: Pro-Voter Policies, H.R. 1, VR: Anti-Voter...",Commentary,The Democrats can use their House and Senate m...,...,0,0,0,0,79,1.0,0.155361,0.00,0.169892,0.2
4,1/31/21,29487011,GOP lawmakers seek tougher voting rules after ...,News,Neutral,Associated Press Newswires,By ANTHONY IZAGUIRRE and ACACIA CORONADO,,Georgia,"AUSTIN, Texas (AP) — Republican lawmakers in s...",...,0,0,0,1,60,2.0,0.113786,0.25,0.129032,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,10/1/20,19978767,"County, AG take vote spar to state's top court...",News,Neutral,Houston Chronicle,Zach Despart; Staff writer,,Texas,Harris County Clerk Christopher Hollins' plan ...,...,0,0,0,0,109,,0.221007,,0.234409,
3531,10/1/20,19943666,Dawn Porter on Documenting Rep. John Lewis: 'T...,News,Neutral,The Root,"No by-line,",VR: Anti-Voter Policies,Georgia,It’s been a few months since the world lost on...,...,0,0,0,0,99,,0.199125,,0.212903,
3532,10/1/20,19884512,Debate has little substance for voters; Lack o...,News,Neutral,The Arizona Republic,By Yvonne Wingett Sanchez and Ronald J. Hansen...,,Arizona,If Arizona voters tuned in to the first presid...,...,0,0,0,0,87,2.0,0.172867,0.25,0.187097,0.4
3533,10/1/20,19933963,Elecciones 2020: Lo que necesitas saber para v...,News,Neutral,The Arizona Daily Star,Veronica M. Cruz,VR: Civic Participation,Arizona,La elección general es el 3 de noviembre\n\nPo...,...,0,0,0,0,78,1.0,0.153173,0.00,0.167742,0.2


In [23]:
new[0]

(0.14223194748358864,
 0.1050328227571116,
 0.15536105032822756,
 0.15536105032822756,
 0.11378555798687089,
 0.39168490153172864,
 0.350109409190372,
 0.019693654266958426,
 0.13785557986870894,
 0.07439824945295405,
 0.08096280087527352,
 0.10284463894967177,
 0.11378555798687089,
 0.12035010940919037,
 0.2297592997811816,
 0.14223194748358864,
 0.09190371991247265,
 0.2013129102844639,
 0.1334792122538293,
 0.0962800875273523,
 0.11378555798687089,
 0.12035010940919037,
 0.2013129102844639,
 0.11159737417943107,
 0.13785557986870894,
 0.1334792122538293,
 0.12035010940919037,
 0.14660831509846828,
 0.11378555798687089,
 0.1312910284463895,
 0.10940919037199125,
 0.087527352297593,
 0.10722100656455141,
 0.31291028446389496,
 0.0612691466083151,
 0.14223194748358864,
 0.2472647702407002,
 0.13566739606126915,
 0.09190371991247265,
 0.09190371991247265,
 0.14660831509846828,
 0.10722100656455141,
 0.11816192560175055,
 0.11159737417943107,
 0.08315098468271334,
 0.14442013129102843,
 

In [24]:
def get_other_features(record):
    """
    Returns other features besides linguistic features.
    """
    headline_length = record["headline_length_feature"]
    author_count = record["author_count_feature"]
    minmax_length = record["minmax_length"]
    minmax_author = record["minmax_author"]
    normalized_length = record["normalized_length"]
    normalized_author_count = record["normalized_author_count"]
    return [headline_length, author_count, minmax_length, minmax_author, normalized_length,
                    normalized_author_count]

# Setting Up Data

In [25]:
news = data[data["Article Status"] == "News"]
opinion = data[data["Article Status"] == "Opinion"]


In [26]:
y_vals = []
for art_label in data["Article Status"]:
    y_vals.append(art_label)
y_vals = np.array(y_vals)

x_vals = []
for index, record in data.iterrows():
    art_str = record["Full Text"]
    article_features = get_all_features(art_str) 
    x_vals.append(article_features)
x_vals = np.array(x_vals)


In [27]:
ling_features_names = ['sent_length', 'token_length', 'first_p', 'second_p', 'third_p', 'first_second_p', 
                    'questions', 'exclamations', 'semicolons', 'commas', 'periods', 
                    'num_casual', 'num_temporal', 'num_contrastive', 'num_expansive', 
                    'digits', 'num_modals', 'num_vos', 'num_future', 
                    'opinion_count', 'news_count',
                    'num_negation', 'negation_suffix', 'num_citations', 'avg_citation_len', 'num_words']

# features requiring spacy
# 'complexity', 'present_freq', 'past_freq', 'interjection_freq'

# other_feature_names = ['headline_length', 'author_count, minmax_length', 'minmax_author', 'normalized_length', 'normalized_author_count']

class_names = ['Opinion', 'News']

# all_feature_names = ling_features_names + other_feature_names

In [28]:
data_features = pd.DataFrame(x_vals, columns = ling_features_names)
data_features['Article Status'] = y_vals
data_features.head()

Unnamed: 0,sent_length,token_length,first_p,second_p,third_p,first_second_p,questions,exclamations,semicolons,commas,...,num_vos,num_future,opinion_count,news_count,num_negation,negation_suffix,num_citations,avg_citation_len,num_words,Article Status
0,0.0149,0.201974,1.0,2.0,44.0,3.0,0.0,0.0,78.0,3.0,...,9.0,6.0,0.0,0.0,8.0,0.0,0.0,0.0,1514.0,News
1,0.007716,0.190626,9.0,3.0,51.0,12.0,1.0,0.0,92.0,3.0,...,2.0,2.0,0.0,0.0,10.0,2.0,41.0,65.195122,1456.0,News
2,0.021549,0.200954,1.0,0.0,21.0,1.0,0.0,0.0,30.0,1.0,...,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,632.0,News
3,0.007221,0.18756,5.0,0.0,20.0,5.0,0.0,0.0,31.0,1.0,...,1.0,3.0,0.0,0.0,6.0,2.0,5.0,113.4,784.0,Opinion
4,0.006043,0.187835,1.0,0.0,20.0,1.0,0.0,0.0,50.0,0.0,...,13.0,2.0,0.0,0.0,8.0,0.0,0.0,0.0,945.0,News


## Naive Bayes Model

In [29]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [30]:
from sklearn import preprocessing
def encode_feature(array):
    """
    Encodes the features. 
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(array)
    return encoder.transform(array)


# News as 0 and Opinion as 1
targets = encode_feature(y_vals)

In [31]:
X = data_features.drop(['Article Status','num_words'], axis = 1)
X['minmax_length'] = new[0]
X

Unnamed: 0,sent_length,token_length,first_p,second_p,third_p,first_second_p,questions,exclamations,semicolons,commas,...,num_modals,num_vos,num_future,opinion_count,news_count,num_negation,negation_suffix,num_citations,avg_citation_len,minmax_length
0,0.014900,0.201974,1.0,2.0,44.0,3.0,0.0,0.0,78.0,3.0,...,5.0,9.0,6.0,0.0,0.0,8.0,0.0,0.0,0.000000,0.142232
1,0.007716,0.190626,9.0,3.0,51.0,12.0,1.0,0.0,92.0,3.0,...,18.0,2.0,2.0,0.0,0.0,10.0,2.0,41.0,65.195122,0.105033
2,0.021549,0.200954,1.0,0.0,21.0,1.0,0.0,0.0,30.0,1.0,...,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.000000,0.155361
3,0.007221,0.187560,5.0,0.0,20.0,5.0,0.0,0.0,31.0,1.0,...,17.0,1.0,3.0,0.0,0.0,6.0,2.0,5.0,113.400000,0.155361
4,0.006043,0.187835,1.0,0.0,20.0,1.0,0.0,0.0,50.0,0.0,...,9.0,13.0,2.0,0.0,0.0,8.0,0.0,0.0,0.000000,0.113786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3530,0.008160,0.197435,4.0,0.0,22.0,4.0,1.0,0.0,31.0,0.0,...,12.0,14.0,3.0,0.0,0.0,7.0,1.0,6.0,94.166667,0.221007
3531,0.007276,0.210563,8.0,4.0,25.0,12.0,0.0,1.0,33.0,0.0,...,11.0,5.0,2.0,0.0,0.0,5.0,0.0,0.0,0.000000,0.199125
3532,0.011149,0.197375,3.0,4.0,33.0,7.0,4.0,0.0,73.0,0.0,...,8.0,14.0,1.0,0.0,0.0,14.0,6.0,23.0,85.739130,0.172867
3533,0.009121,0.190674,0.0,0.0,0.0,0.0,0.0,0.0,28.0,1.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.000000,0.153173


In [33]:
len(targets) == len(X)

True

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size=0.2, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 707 points : 127


In [35]:
Accuracy = (X_test.shape[0] - (y_test != y_pred).sum())/X_test.shape[0]
Accuracy

0.8203677510608204

In [36]:
from sklearn.inspection import permutation_importance
imps = permutation_importance(gnb, X_test, y_test)
print(imps.importances_mean)

[ 2.26308345e-03  1.69731259e-03  9.90099010e-03  2.82885431e-04
 -8.48656294e-04  2.54596888e-03  1.64073550e-02  1.41442716e-03
 -1.13154173e-03  5.37482320e-03 -1.69731259e-03 -2.26308345e-03
  0.00000000e+00  1.41442716e-03  2.22044605e-17  4.80905233e-03
  1.21640736e-02  1.98019802e-02  6.78925035e-03  0.00000000e+00
  0.00000000e+00 -2.82885431e-03  4.52616690e-03  1.01838755e-02
  2.82885431e-03  4.80905233e-03]


In [37]:
# HELPFUL LINK
# https://stackoverflow.com/questions/62933365/how-to-get-the-feature-importance-in-gaussian-naive-bayes