# Political Leaning and Emotion Detectors

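Background (Pew findings on partisan news-media environments, via Nieman Lab): https://www.niemanlab.org/2020/01/republicans-and-democrats-live-in-nearly-inverse-news-media-environments-pew-finds/

Article data source (News API): https://newsapi.org/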



In [135]:
import requests

# Ask News API for its list of US sources. HTTPS is used so that the API key
# carried in the query string is not transmitted in clear text.
# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from the environment and rotating this one.
url = ('https://newsapi.org/v2/sources?'
       'country=us&'
       'apiKey=8e213af13b7645518701df89f5cdbecc')

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Numeric score attached to each outlet, keyed by the outlet's display name.
# NOTE(review): presumably taken from the Pew survey linked at the top of the
# notebook; the meaning of the sign and scale is not stated here -- confirm.
pew_trust_scores = {
    'ABC News': -0.93,
    'Breitbart News': 2.51,
    'Business Insider': 1.90,
    'CBS News': -1.04,
    'CNN': -3.51,
    'Fox News': 7.60,
    'The Hill': 1.90,
    'The Huffington Post': 1.09,
    'NBC News': -1.37,
    'Newsweek': 0.91,
    'Politico': -1.48,
    'MSNBC': -2.90,
    'Time': -0.21,
    'USA Today': 1.00,
    'Vice News': 1.73,
    'The Wall Street Journal': 1.02,
    'The Washington Post': -1.86,
}

# Outlet names, in the same order as the score table above (dicts keep
# insertion order, so this equals the hand-written list it replaces).
pew_sources = list(pew_trust_scores)

# Decoded body of the source-list response.
# NOTE: this name would shadow the standard-library `json` module if that
# module were imported into the same namespace.
json = response.json()

# News API ids of every listed source whose display name has a score entry.
source_strs = [
    source['id']
    for source in json['sources']
    if source['name'] in pew_trust_scores
]

In [136]:
from datetime import date, timedelta

def results_from_source(curr_source):
    """Fetch every available page of articles for one News API source.

    Pages of up to 100 articles are requested from the ``/v2/everything``
    endpoint, starting at page 1. Paging stops at the first response that
    has no ``'articles'`` entry or whose article list is empty, so an API
    that keeps answering with empty pages cannot loop forever.

    Args:
        curr_source: News API source id, e.g. ``'abc-news'``.

    Returns:
        A list with the article dicts of all fetched pages, in page order.
    """
    # NOTE(review): the previous version computed a date 29 days in the past
    # but never sent it; no date filter is applied to the query. Add a
    # `from=` parameter here if that restriction was intended.
    # NOTE(review): the API key is hard-coded; consider loading it from the
    # environment instead.
    results = []
    page_num = 1
    while True:
        # HTTPS so the key in the query string is not sent in clear text.
        url = ('https://newsapi.org/v2/everything?'
               'sources=' + curr_source + '&'
               'pageSize=100&'
               'page=' + str(page_num) + '&'
               'apiKey=8e213af13b7645518701df89f5cdbecc')
        # Decode the body once per page; the timeout avoids hanging forever.
        payload = requests.get(url, timeout=30).json()
        articles = payload.get('articles')
        if not articles:
            # Error payload (no 'articles') or an empty page: nothing more.
            break
        results += articles
        page_num += 1

    return results

# Gather the articles of every selected source into one flat list.
all_results = []
for source_name in source_strs:
    all_results.extend(results_from_source(source_name))

# Bare expression: shows the collected articles when run as a notebook cell.
all_results

[{'source': {'id': 'abc-news', 'name': 'ABC News'},
  'author': 'The Associated Press',
  'title': 'South Korea’s military: North fires unidentified projectile',
  'description': 'South Korea’s military says North Korea has fired an unidentified projectile',
  'url': 'https://abcnews.go.com/International/wireStory/south-koreas-military-north-fires-unidentified-projectile-69473149',
  'urlToImage': 'null',
  'publishedAt': '2020-03-08T23:05:09Z',
  'content': 'SEOUL, South Korea -- \r\nSouth Koreas military says North Korea has fired an unidentified projectile.\r\nSouth Koreas Joint Chiefs of Staff says it has detected the North Korean launch on Monday morning but gave no further details.\r\nThe launch came days after No… [+244 chars]'},
 {'source': {'id': 'abc-news', 'name': 'ABC News'},
  'author': 'The Associated Press',
  'title': 'South Korea’s military says North Korea has fired an unidentified projectile',
  'description': 'South Korea’s military says North Korea has fired an uni

In [148]:
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [138]:
# Shared tokenizer instance used by tokenize() below.
tokenizer = TweetTokenizer()


def tokenize(headline):
    """Normalise a headline and split it into tokens.

    The text is lower-cased, single-digit part markers such as ``(1/2)``
    are dropped, every character other than ASCII letters, digits, ``@``,
    ``#`` and space is removed, runs of spaces are collapsed, and the
    result is split with the module-level ``TweetTokenizer``.

    Args:
        headline: The headline text.

    Returns:
        A list of non-empty token strings.
    """
    headline = headline.lower()
    # Remove "(d/d)" part markers before punctuation is stripped; otherwise
    # the digits would survive as a stray token.
    headline = re.sub(r'\([0-9]/[0-9]\)', '', headline)
    # Strip punctuation and any other character outside the allowed set.
    headline = re.sub(r'[^A-Za-z0-9@# ]', '', headline)
    # Collapse every run of spaces to one. Replacing only the two-space
    # pattern would leave longer runs partially collapsed.
    headline = re.sub(r' {2,}', ' ', headline)
    # Filter out all empty tokens, not just the first one found.
    return [token for token in tokenizer.tokenize(headline) if token]
    
# One [tokens, score] pair per article that has a non-empty title; the score
# is looked up by the article's source display name.
tokenized = [
    [tokenize(article['title']), pew_trust_scores[article['source']['name']]]
    for article in all_results
    if article['title']
]

In [139]:
# NLTK's English stop-word list, extended with 'us'.
stop_words = set(stopwords.words('english')) | {'us'}

# Rebuild every headline as a space-joined string without stop words, keeping
# its score next to it as [text, score].
stop_words_removed = [
    [' '.join(word for word in tokens if word not in stop_words), score]
    for tokens, score in tokenized
]

In [140]:
# Two-column array: column 0 is the cleaned headline, column 1 its score.
# NOTE(review): each row mixes a string with a float, so NumPy appears to
# store both columns as strings (the predictions printed further down are
# quoted) -- confirm before treating the labels as numbers.
processed_headlines = np.asarray(stop_words_removed)
# Shuffle the rows in place; no seed is set, so the split differs per run.
np.random.shuffle(processed_headlines)

# First four fifths of the shuffled rows train, the remainder tests.
idx = 4 * len(processed_headlines) // 5

train_x, train_y = processed_headlines[:idx, 0], processed_headlines[:idx, 1]
test_x, test_y = processed_headlines[idx:, 0], processed_headlines[idx:, 1]

In [141]:
def get_unigram_data(x, y):
    """Return the headlines and labels unchanged, as two new lists.

    Args:
        x: Sequence of headline strings.
        y: Sequence of labels, the same length as ``x``.

    Returns:
        ``(unigram_x, unigram_y)``: list copies of ``x`` and ``y``.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in length.
    """
    assert len(x) == len(y)
    return list(x), list(y)

def get_bigram_data(x, y):
    """Rewrite each headline as a string of fused adjacent word pairs.

    Every headline is split on single spaces; each pair of neighbouring
    words is concatenated without a separator, and the pairs are joined
    with spaces. Headlines with fewer than two words are left out, together
    with their labels.

    Args:
        x: Sequence of headline strings.
        y: Sequence of labels, indexed in parallel with ``x``.

    Returns:
        ``(bigram_x, bigram_y)``: the bigram strings and matching labels.
    """
    bigram_x, bigram_y = [], []
    for position, headline in enumerate(x):
        tokens = headline.split(' ')
        if len(tokens) >= 2:
            pairs = [left + right for left, right in zip(tokens, tokens[1:])]
            bigram_x.append(' '.join(pairs))
            bigram_y.append(y[position])

    return bigram_x, bigram_y


def get_trigram_data(x, y):
    """Rewrite each headline as a string of fused three-word windows.

    Every headline is split on single spaces; each window of three
    consecutive words is concatenated without a separator, and the windows
    are joined with spaces. Headlines with fewer than three words are left
    out, together with their labels.

    Args:
        x: Sequence of headline strings.
        y: Sequence of labels, indexed in parallel with ``x``.

    Returns:
        ``(trigram_x, trigram_y)``: the trigram strings and matching labels.
    """
    trigram_x, trigram_y = [], []
    for position, headline in enumerate(x):
        tokens = headline.split(' ')
        if len(tokens) >= 3:
            windows = zip(tokens, tokens[1:], tokens[2:])
            trigram_x.append(' '.join(''.join(window) for window in windows))
            trigram_y.append(y[position])

    return trigram_x, trigram_y

In [144]:
def vectorize_counts(train_x, test_x):
    """Turn training and test texts into bag-of-words count matrices.

    The vocabulary is learned from ``train_x`` only and then applied to
    ``test_x``. Fitting on the concatenation of both sets would let
    held-out text shape the feature space (test-set leakage); with this
    version, tokens that occur only in ``test_x`` are ignored.

    Args:
        train_x: Iterable of training documents (strings).
        test_x: Iterable of test documents (strings).

    Returns:
        ``(x_train_counts, x_test_counts)``: count matrices with one row per
        document and the same columns (the training vocabulary) in both.
    """
    count_vect = CountVectorizer()
    x_train_counts = count_vect.fit_transform(train_x)
    x_test_counts = count_vect.transform(test_x)

    return x_train_counts, x_test_counts

In [155]:
# 5-fold cross-validation over the training split (train_x / train_y).
kf = KFold(n_splits=5)
# Meant to collect per-fold accuracies; nothing is appended in this cell
# because the append calls further down are commented out.
nb_accuracies = []
lr_accuracies = []

for train_index, test_index in kf.split(train_x):
    # Slice this fold's training and validation rows out of the training split.
    x_fold_train, x_fold_test = train_x[train_index], train_x[test_index]
    y_fold_train, y_fold_test = train_y[train_index], train_y[test_index]
    print("ALL",len(x_fold_train), len(y_fold_train), len(x_fold_test),  len(y_fold_test))
    
    # Build the three text representations. The bigram/trigram helpers skip
    # headlines that are too short, so those sets can be smaller than the
    # unigram set.
    unigram_train_x, unigram_train_y = get_unigram_data(x_fold_train, y_fold_train)
    unigram_test_x, unigram_test_y = get_unigram_data(x_fold_test, y_fold_test)

    bigram_train_x, bigram_train_y = get_bigram_data(x_fold_train, y_fold_train)
    bigram_test_x, bigram_test_y = get_bigram_data(x_fold_test, y_fold_test)

    trigram_train_x, trigram_train_y = get_trigram_data(x_fold_train, y_fold_train)
    trigram_test_x, trigram_test_y = get_trigram_data(x_fold_test, y_fold_test)
    
    # Combined sets are list concatenations of the SAMPLES (and their labels),
    # not of feature columns: a headline contributes one row per
    # representation it appears in.
    uni_bigram_train_x = unigram_train_x + bigram_train_x
    uni_bigram_train_y = unigram_train_y + bigram_train_y
    uni_bigram_test_x = unigram_test_x + bigram_test_x
    uni_bigram_test_y = unigram_test_y + bigram_test_y
    
    bi_trigram_train_x = bigram_train_x + trigram_train_x
    bi_trigram_train_y = bigram_train_y + trigram_train_y
    bi_trigram_test_x = bigram_test_x + trigram_test_x
    bi_trigram_test_y = bigram_test_y + trigram_test_y
    
    uni_trigram_train_x = unigram_train_x + trigram_train_x
    uni_trigram_train_y = unigram_train_y + trigram_train_y
    uni_trigram_test_x = unigram_test_x + trigram_test_x
    uni_trigram_test_y = unigram_test_y + trigram_test_y
    
    all_train_x = unigram_train_x + bigram_train_x + trigram_train_x
    all_train_y = unigram_train_y + bigram_train_y + trigram_train_y
    all_test_x = unigram_test_x + bigram_test_x + trigram_test_x
    all_test_y = unigram_test_y + bigram_test_y + trigram_test_y
    
    # Vectorise each set. From here on the *_train_x / *_test_x names hold
    # count matrices instead of lists of strings; this must happen after the
    # combined lists above have been built from the string versions.
    unigram_train_x, unigram_test_x = vectorize_counts(unigram_train_x, unigram_test_x)
    print("UNIGRAM", len(unigram_train_y), len(unigram_test_y))
    
    bigram_train_x, bigram_test_x = vectorize_counts(bigram_train_x, bigram_test_x)
    print("BIGRAM", len(bigram_train_y), len(bigram_test_y))
    
    trigram_train_x, trigram_test_x = vectorize_counts(trigram_train_x, trigram_test_x)
    print("TRIGRAM", len(trigram_train_y), len(trigram_test_y))
    
    uni_bigram_train_x, uni_bigram_test_x = vectorize_counts(uni_bigram_train_x, uni_bigram_test_x)
    print("UNIGRAM BIGRAM", len(uni_bigram_train_y), len(uni_bigram_test_y))
    
    uni_trigram_train_x, uni_trigram_test_x = vectorize_counts(uni_trigram_train_x, uni_trigram_test_x)
    print("UNIGRAM TRIGRAM", len(uni_trigram_train_y), len(uni_trigram_test_y))
    
    bi_trigram_train_x, bi_trigram_test_x = vectorize_counts(bi_trigram_train_x, bi_trigram_test_x)
    print("BIGRAM TRIGRAM", len(bi_trigram_train_y), len(bi_trigram_test_y))
    
    all_train_x, all_test_x = vectorize_counts(all_train_x, all_test_x)
    print("UNIGRAM BIGRAM TRIGRAM", len(all_train_y), len(all_test_y))
    
    # Multinomial naive Bayes, one model per feature set. No model is fitted
    # on the "all" set here. The *_predicted values are never scored and are
    # overwritten by the logistic-regression predictions below.
    naive_bayes_unigram = MultinomialNB().fit(unigram_train_x, unigram_train_y)
    unigram_predicted = naive_bayes_unigram.predict(unigram_test_x)
    
    naive_bayes_bigram = MultinomialNB().fit(bigram_train_x, bigram_train_y)
    bigram_predicted = naive_bayes_bigram.predict(bigram_test_x)

    naive_bayes_trigram = MultinomialNB().fit(trigram_train_x, trigram_train_y)
    trigram_predicted = naive_bayes_trigram.predict(trigram_test_x)

    naive_bayes_uni_bigram = MultinomialNB().fit(uni_bigram_train_x, uni_bigram_train_y)
    uni_bigram_predicted = naive_bayes_uni_bigram.predict(uni_bigram_test_x)

    naive_bayes_uni_trigram = MultinomialNB().fit(uni_trigram_train_x, uni_trigram_train_y)
    uni_trigram_predicted = naive_bayes_uni_trigram.predict(uni_trigram_test_x)

    naive_bayes_bi_trigram = MultinomialNB().fit(bi_trigram_train_x, bi_trigram_train_y)
    bi_trigram_predicted = naive_bayes_bi_trigram.predict(bi_trigram_test_x)

# NOTE(review): disabled -- the *_accuracy names below are not defined anywhere
# in this cell, so re-enabling this line as-is would raise NameError.
#     nb_accuracies.append([unigram_accuracy, bigram_accuracy, trigram_accuracy, uni_bigram_accuracy,\
#                           uni_trigram_accuracy, bi_trigram_accuracy])
    
    # Logistic regression with default settings, one model per feature set
    # (including "all"). Each score value is treated as a class label; the
    # printed predictions suggest the labels are strings -- TODO confirm.
    lr_unigram = LogisticRegression().fit(unigram_train_x, unigram_train_y)
    unigram_predicted = lr_unigram.predict(unigram_test_x)
    # Only the unigram predictions are printed, next to the true labels.
    print("PRED", unigram_predicted)
    print("ACTUAL", unigram_test_y)
    
    lr_bigram = LogisticRegression().fit(bigram_train_x, bigram_train_y)
    bigram_predicted = lr_bigram.predict(bigram_test_x)

    lr_trigram = LogisticRegression().fit(trigram_train_x, trigram_train_y)
    trigram_predicted = lr_trigram.predict(trigram_test_x)

    lr_uni_bigram = LogisticRegression().fit(uni_bigram_train_x, uni_bigram_train_y)
    uni_bigram_predicted = lr_uni_bigram.predict(uni_bigram_test_x)
    
    lr_uni_trigram = LogisticRegression().fit(uni_trigram_train_x, uni_trigram_train_y)
    uni_trigram_predicted = lr_uni_trigram.predict(uni_trigram_test_x)
    
    lr_bi_trigram = LogisticRegression().fit(bi_trigram_train_x, bi_trigram_train_y)
    bi_trigram_predicted = lr_bi_trigram.predict(bi_trigram_test_x)
    
    lr_all = LogisticRegression().fit(all_train_x, all_train_y)
    all_predicted = lr_all.predict(all_test_x)
    
# NOTE(review): disabled for the same reason as the naive-Bayes append above:
# none of the *_accuracy names is computed in this cell.
#     lr_accuracies.append([unigram_accuracy, bigram_accuracy, trigram_accuracy, uni_bigram_accuracy,\
#                           uni_trigram_accuracy, bi_trigram_accuracy, all_accuracy])
    

ALL 1022 1022 256 256
UNIGRAM 1022 256
BIGRAM 1019 255
TRIGRAM 1014 255
UNIGRAM BIGRAM 2041 511
UNIGRAM TRIGRAM 2036 511
BIGRAM TRIGRAM 2033 510
UNIGRAM BIGRAM TRIGRAM 3055 766




PRED ['1.9' '1.9' '-0.21' '1.73' '-1.37' '1.73' '-3.51' '-1.86' '2.51' '-3.51'
 '1.02' '7.6' '-1.48' '1.73' '-1.37' '1.9' '-0.93' '2.51' '-1.48' '1.9'
 '-1.37' '-1.48' '1.9' '1.9' '1.73' '2.51' '-1.37' '-1.04' '1.0' '-0.21'
 '1.9' '1.02' '1.9' '1.9' '1.9' '1.9' '1.9' '2.51' '7.6' '-1.04' '1.9'
 '-0.21' '1.9' '-3.51' '1.9' '1.73' '-2.9' '-0.21' '1.9' '-0.21' '1.9'
 '1.9' '1.9' '-1.37' '-1.37' '-3.51' '-1.86' '7.6' '-0.93' '-0.93' '-1.04'
 '1.9' '1.9' '1.9' '1.73' '-1.48' '1.73' '-0.93' '-1.37' '-1.04' '1.9'
 '1.9' '1.9' '-0.93' '-1.04' '1.0' '-3.51' '-1.86' '1.02' '1.9' '-1.04'
 '1.9' '1.9' '-1.86' '-1.04' '1.9' '1.02' '1.0' '-1.04' '-1.48' '-2.9'
 '1.9' '-2.9' '-0.93' '-1.04' '-2.9' '2.51' '0.91' '1.9' '1.9' '2.51'
 '1.9' '1.73' '1.9' '1.9' '2.51' '-1.37' '0.91' '1.02' '-1.37' '-1.37'
 '-0.93' '-1.48' '0.91' '0.91' '-1.48' '-1.04' '-0.93' '1.0' '-0.93'
 '0.91' '-1.48' '1.9' '1.9' '-3.51' '-1.86' '1.0' '-0.93' '-1.37' '-0.21'
 '-2.9' '-1.86' '1.9' '-0.21' '1.02' '-1.48' '1.73' '-1.04' '



PRED ['1.73' '7.6' '1.9' '1.9' '-1.04' '-1.48' '-3.51' '1.9' '-1.48' '-1.37'
 '1.9' '1.9' '1.9' '-1.04' '-1.48' '-1.86' '1.9' '-0.21' '-1.04' '1.0'
 '-3.51' '-1.48' '-1.86' '-0.93' '-0.93' '2.51' '1.9' '1.9' '1.73' '-0.21'
 '1.9' '-3.51' '-1.48' '7.6' '-0.93' '1.0' '-0.21' '-1.04' '1.73' '0.91'
 '1.02' '-2.9' '1.9' '-2.9' '-0.93' '1.02' '7.6' '1.9' '1.9' '-0.93' '1.9'
 '1.9' '2.51' '-3.51' '1.9' '7.6' '1.73' '-1.37' '2.51' '-1.86' '-1.48'
 '-3.51' '2.51' '1.73' '1.9' '1.9' '7.6' '2.51' '1.9' '1.9' '-1.48' '1.9'
 '1.0' '-1.48' '1.9' '-1.04' '-0.93' '-1.04' '-1.37' '1.73' '1.02' '-1.86'
 '1.9' '1.73' '1.9' '1.9' '0.91' '-1.48' '1.02' '1.02' '1.9' '-1.48'
 '0.91' '-1.86' '1.73' '-1.48' '1.9' '1.73' '1.9' '-3.51' '2.51' '1.9'
 '1.02' '1.9' '-3.51' '-0.93' '1.02' '-0.21' '1.9' '0.91' '1.9' '-1.37'
 '-2.9' '7.6' '-2.9' '1.9' '1.73' '1.73' '-1.37' '-0.93' '1.9' '-1.04'
 '-0.93' '1.9' '-1.48' '1.73' '0.91' '1.73' '1.9' '-1.48' '1.9' '-1.04'
 '1.0' '1.02' '-0.93' '-1.86' '0.91' '1.73' '-3.51' '



ALL 1023 1023 255 255
UNIGRAM 1023 255
BIGRAM 1019 255
TRIGRAM 1014 255
UNIGRAM BIGRAM 2042 510
UNIGRAM TRIGRAM 2037 510
BIGRAM TRIGRAM 2033 510
UNIGRAM BIGRAM TRIGRAM 3056 765




PRED ['-0.93' '-1.37' '1.9' '-1.37' '2.51' '-2.9' '-0.93' '1.9' '-1.37' '1.9'
 '1.9' '-3.51' '-1.37' '-3.51' '-0.93' '1.9' '-1.37' '1.9' '-3.51' '0.91'
 '-0.21' '-1.04' '-0.93' '1.9' '-1.37' '1.02' '1.73' '-3.51' '-1.04'
 '-1.37' '-1.48' '1.73' '1.9' '1.9' '-2.9' '-1.37' '-1.37' '-0.93' '1.9'
 '1.9' '-1.86' '1.73' '1.73' '1.9' '-3.51' '1.73' '1.73' '-3.51' '-1.48'
 '1.9' '-1.48' '2.51' '-1.37' '1.9' '7.6' '-3.51' '1.73' '-0.93' '-1.86'
 '1.02' '2.51' '-2.9' '-1.37' '-2.9' '-3.51' '-2.9' '1.9' '1.9' '-0.93'
 '-1.86' '1.0' '-0.21' '1.9' '-0.21' '1.73' '1.0' '1.0' '-1.37' '1.0'
 '-2.9' '-1.86' '-1.37' '-1.04' '2.51' '-2.9' '0.91' '1.73' '-1.86' '1.9'
 '2.51' '-1.37' '-0.93' '1.9' '1.9' '-1.86' '-1.37' '1.9' '-1.48' '-1.04'
 '-1.37' '1.9' '-1.04' '1.9' '1.0' '1.73' '1.9' '1.9' '-3.51' '1.9' '1.9'
 '1.9' '-1.04' '1.9' '-0.93' '-1.04' '-1.48' '1.9' '1.9' '-1.48' '0.91'
 '-1.48' '1.9' '-0.93' '-0.93' '2.51' '-0.21' '1.9' '-3.51' '-1.48' '1.02'
 '-1.04' '1.73' '1.9' '-1.37' '-2.9' '1.9' '-0.21



PRED ['1.9' '0.91' '-3.51' '1.9' '1.02' '-0.21' '-2.9' '1.9' '-2.9' '-3.51'
 '2.51' '-1.37' '1.02' '1.02' '1.0' '1.9' '1.73' '-2.9' '2.51' '1.9'
 '-1.04' '-1.37' '-1.48' '-0.93' '1.02' '-3.51' '1.9' '-3.51' '-1.37'
 '-3.51' '-2.9' '-1.37' '7.6' '-1.04' '1.9' '1.02' '-0.93' '1.02' '1.0'
 '2.51' '0.91' '-0.93' '1.02' '1.9' '-0.93' '1.9' '-0.93' '2.51' '1.73'
 '7.6' '-0.93' '1.9' '-3.51' '1.9' '-1.04' '1.9' '1.02' '-1.37' '-3.51'
 '1.02' '-0.93' '-0.21' '2.51' '-2.9' '-1.37' '1.73' '-0.93' '-0.21'
 '2.51' '-1.37' '-3.51' '-3.51' '0.91' '1.73' '-0.93' '1.9' '-0.21'
 '-1.86' '1.9' '1.9' '-1.48' '2.51' '-1.86' '7.6' '1.9' '1.73' '-3.51'
 '-1.37' '-2.9' '1.9' '2.51' '1.9' '1.0' '1.9' '-0.21' '-0.93' '1.9' '1.9'
 '1.02' '1.9' '1.9' '-1.37' '-0.21' '-1.04' '1.9' '1.9' '1.9' '-1.04'
 '1.9' '-2.9' '-0.93' '-1.48' '-1.04' '1.9' '1.0' '1.9' '1.9' '1.9' '1.9'
 '-1.04' '2.51' '1.9' '-3.51' '0.91' '1.02' '2.51' '-0.21' '1.02' '-1.86'
 '-3.51' '-0.93' '-1.48' '2.51' '0.91' '-1.48' '-0.93' '-2.9' '-3.51