# Political Leaning and Emotion Detectors

https://www.niemanlab.org/2020/01/republicans-and-democrats-live-in-nearly-inverse-news-media-environments-pew-finds/

https://newsapi.org/


We used a Pew Research study on partisan trust in news media sources to calculate each outlet's "trust score." The score is calculated by subtracting the percentage of Democrats who trust a news source from the percentage of Republicans who trust it, then multiplying by the sum of the percentages of Democrats and Republicans who reported for that source.

The values were then normalized, adjusted to account for the fact that Republican distrust of news media is higher than Democrats', and divided so that each score falls on a scale from -8, the most extreme left, to +8, the most extreme right.

We will get articles and headlines from the News API that we're using for our UI. Because we don't have the developer version, we're limited to 100 articles per news source, which significantly limits the accuracy of our regression. We could do more extensive analysis with the entire article as opposed to just headlines.

### Hitting the Sources API

We took the sources from the Pew Research study, hit the sources API, and recorded all of the matching API ids in a list to reference later.

In [1]:
import requests

# Ask the News API for every US source it knows about.
# NOTE(review): the API key is committed in plain text here; consider loading
# it from configuration instead.
url = (
    'http://newsapi.org/v2/sources'
    '?country=us'
    '&apiKey=8e213af13b7645518701df89f5cdbecc'
)
response = requests.get(url)

# Trust score per outlet from the Pew study: negative values lean left,
# positive values lean right (scale described in the text above).
pew_trust_scores = {
    'ABC News': -0.93,
    'Breitbart News': 2.51,
    'Business Insider': 1.90,
    'CBS News': -1.04,
    'CNN': -3.51,
    'Fox News': 7.60,
    'The Hill': 1.90,
    'The Huffington Post': 1.09,
    'NBC News': -1.37,
    'Newsweek': 0.91,
    'Politico': -1.48,
    'MSNBC': -2.90,
    'Time': -0.21,
    'USA Today': 1.00,
    'Vice News': 1.73,
    'The Wall Street Journal': 1.02,
    'The Washington Post': -1.86,
}

# Outlet names in the order they are scored above.
pew_sources = list(pew_trust_scores)

json = response.json()

# Keep the API id of every returned source whose display name is one of the
# scored Pew outlets.
source_strs = []
for source in json['sources']:
    if source['name'] not in pew_trust_scores:
        continue
    source_strs.append(source['id'])

### Getting Headlines

Because political context is really important, we are only using training data from a one-month time period and labeling each headline with its source's trust score.

In [2]:
from datetime import date, timedelta

def results_from_source(curr_source):
    """Fetch the articles published by one News API source in the last 29 days.

    Pages through the ``/v2/everything`` endpoint, 100 articles per page, and
    stops as soon as a response carries no ``articles`` key (e.g. an error
    payload) or an empty page.

    Args:
        curr_source: News API source id, such as an entry of ``source_strs``.

    Returns:
        A list of the article dicts collected from every page.
    """
    # NOTE(review): the API key is committed in plain text in the URL below;
    # consider loading it from configuration instead.
    oldest_date = (date.today() - timedelta(days=29)).isoformat()
    results = []
    page_num = 1
    while True:
        url = ('http://newsapi.org/v2/everything?'
               'sources=' + curr_source + '&'
               'from=' + oldest_date + '&'
               'pageSize=100&'
               'page=' + str(page_num) + '&'
               'apiKey=8e213af13b7645518701df89f5cdbecc')
        # Parse the body once per page instead of once per access.
        payload = requests.get(url).json()
        articles = payload.get('articles')
        # A missing key or an empty page both mean there is nothing more to
        # read; without the empty-page check the loop could spin forever.
        if not articles:
            break
        results += articles
        page_num += 1

    return results

# Pool the articles fetched for every matched source id into one flat list.
all_results = []
for source_name in source_strs:
    all_results.extend(results_from_source(source_name))

### Tokenizing and Feature Extraction

In [3]:
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [4]:
tokenizer = TweetTokenizer()

def tokenize(headline):
    """Normalize a headline string and split it into a list of tokens.

    The text is lower-cased, "(d/d)"-style markers are dropped, every
    character other than letters, digits and spaces is removed, each pair of
    spaces is replaced by a single space, and the result is tokenized with
    the module-level ``tokenizer``.
    """
    cleaned = headline.lower()
    substitutions = (
        (r'\([0-9]/[0-9]\)', ''),  # markers such as "(1/3)"
        (r'[^A-Za-z0-9 ]', ''),    # punctuation and other symbols
        (r'  ', ' '),              # a pair of spaces -> one space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.MULTILINE)

    tokens = tokenizer.tokenize(cleaned)
    # Drop one empty token if the tokenizer produced any.
    if '' in tokens:
        tokens.remove('')
    return tokens
    
# Pair each non-empty headline's tokens with the trust score of its outlet.
tokenized = []
for article in all_results:
    if not article['title']:
        continue  # skip articles with a missing or empty title
    tokens = tokenize(article['title'])
    tokenized.append([tokens, pew_trust_scores[article['source']['name']]])

### Removing Stop Words

Removing stop words that will add noise to our classifier using the NLTK stopwords package.

In [5]:
# NLTK's English stop words, plus 'us'.
stop_words = set(stopwords.words('english')) | {'us'}

# Rebuild each headline as a space-joined string of its non-stop-word tokens,
# keeping the trust score alongside it.
stop_words_removed = []
for headline in tokenized:
    filtered = list(filter(lambda word: word not in stop_words, headline[0]))
    stop_words_removed.append([' '.join(filtered), headline[1]])

### Splitting the Data Set into Training and Testing

We will randomly shuffle all the data and then split the train/test data 80/20.

In [64]:
# Two-column array: column 0 is the cleaned headline, column 1 its trust score.
# NOTE(review): mixing str and float rows presumably yields a string-typed
# array; the n-gram helpers cast the labels back with astype(float).
processed_headlines = np.asarray(stop_words_removed)
np.random.shuffle(processed_headlines)  # in-place shuffle of the rows

# First 80% of the shuffled rows train, the remainder tests.
idx = (len(processed_headlines) * 4) // 5
train_rows, test_rows = processed_headlines[:idx], processed_headlines[idx:]

train_x, train_y = train_rows[:, 0], train_rows[:, 1]
test_x, test_y = test_rows[:, 0], test_rows[:, 1]

### Helper Functions to Extract N-Gram Features

In [47]:
def get_unigram_data(x, y):
    """Return headlines and labels as column vectors.

    Args:
        x: sequence of headline strings.
        y: sequence of labels convertible to float; must match ``x`` in length.

    Returns:
        Tuple ``(features, labels)``: ``features`` is an ``(n, 1)`` array of
        the headlines and ``labels`` an ``(n, 1)`` float array.
    """
    assert len(x) == len(y)
    headlines = list(x)
    labels = list(y)

    features = np.asarray(headlines).reshape(-1, 1)
    targets = np.asarray(labels).astype(float).reshape(-1, 1)
    return features, targets

def get_bigram_data(x, y):
    """Turn each headline into a string of bigram tokens.

    A bigram token is two adjacent words concatenated with no separator; the
    tokens of one headline are joined by single spaces. Headlines with fewer
    than two space-separated words are dropped together with their label.

    Args:
        x: sequence of headline strings.
        y: sequence of labels convertible to float, indexed like ``x``.

    Returns:
        Tuple ``(features, labels)`` of ``(m, 1)`` arrays, where ``m`` is the
        number of headlines kept; ``labels`` is float-typed.
    """
    bigram_x, bigram_y = [], []
    for idx, headline in enumerate(x):
        words = headline.split(' ')
        if len(words) >= 2:
            pairs = [left + right for left, right in zip(words, words[1:])]
            bigram_x.append(' '.join(pairs))
            bigram_y.append(y[idx])

    features = np.asarray(bigram_x).reshape(-1, 1)
    labels = np.asarray(bigram_y).astype(float).reshape(-1, 1)
    return features, labels


def get_trigram_data(x, y):
    """Turn each headline into a string of trigram tokens.

    A trigram token is three adjacent words concatenated with no separator;
    the tokens of one headline are joined by single spaces. Headlines with
    fewer than three space-separated words are dropped together with their
    label.

    Args:
        x: sequence of headline strings.
        y: sequence of labels convertible to float, indexed like ``x``.

    Returns:
        Tuple ``(features, labels)`` of ``(m, 1)`` arrays, where ``m`` is the
        number of headlines kept; ``labels`` is float-typed.
    """
    trigram_x, trigram_y = [], []
    for idx, headline in enumerate(x):
        words = headline.split(' ')
        if len(words) >= 3:
            triples = [
                first + second + third
                for first, second, third in zip(words, words[1:], words[2:])
            ]
            trigram_x.append(' '.join(triples))
            trigram_y.append(y[idx])

    features = np.asarray(trigram_x).reshape(-1, 1)
    labels = np.asarray(trigram_y).astype(float).reshape(-1, 1)
    return features, labels

### Helper Function to Convert Headlines into Sparse Vectors

We may integrate this later should we choose to do this instead of encoding our strings.

In [48]:
def vectorize_counts(train_x, test_x):
    """Convert train and test headlines into sparse token-count matrices.

    One ``CountVectorizer`` is fit on the train and test documents together,
    so both returned matrices share the same columns.

    Args:
        train_x: training headlines, as a list of strings or an array of
            strings of any shape (e.g. the ``(n, 1)`` arrays returned by the
            n-gram helpers).
        test_x: test headlines, in the same form as ``train_x``.

    Returns:
        Tuple ``(x_train_counts, x_test_counts)`` of sparse count matrices,
        one row per document.
    """
    # Flatten to plain lists first: `+` only concatenates Python lists, while
    # on NumPy arrays it is an element-wise operation.
    train_docs = np.asarray(train_x).ravel().tolist()
    test_docs = np.asarray(test_x).ravel().tolist()
    split_idx = len(train_docs)

    # NOTE(review): fitting on train + test lets test vocabulary shape the
    # feature space; kept as in the original — confirm this is intended.
    count_vect = CountVectorizer()
    counts = count_vect.fit_transform(train_docs + test_docs)

    x_train_counts = counts[:split_idx, :]
    x_test_counts = counts[split_idx:, :]

    return x_train_counts, x_test_counts

### Encode Data and Perform KFold Cross Validation

In order to predict the float output, we will encode the strings and floats and then decode them in our predictions.

Moving forward I will likely try to simplify this process by changing trust scores to scaled integers from 0 to 100, so that we don't have to encode/decode. Also, I want to experiment with using bigram and trigram features on the headlines to see if that improves MSE. 

In [69]:
# Label-encode the training data, then run 5-fold cross validation with an SVC
# on the unigram representation and print one MSE figure per fold.
from sklearn import preprocessing
# One encoder instance is reused for the headlines and for the trust scores.
le = preprocessing.LabelEncoder()

from sklearn.svm import SVC

# NOTE(review): each iteration re-fits the encoder on the suffix train_x[i:]
# and writes the codes back in place, so later rows are re-encoded on every
# pass. This looks unintended (a single fit_transform over train_x was
# probably meant) — confirm before relying on these features.
for i in range(len(train_x)):
    train_x[i:,] = le.fit_transform(train_x[i:,])
    
# This is the last fit of `le`, so the inverse_transform calls below decode
# against the trust-score classes.
# NOTE(review): train_y is rebound to the encoded values, so re-running this
# cell encodes already-encoded labels — confirm the cell is run only once.
train_y = le.fit_transform(train_y)

kf = KFold(n_splits=5)
# NOTE(review): these two lists are never appended to within this cell.
nb_accuracies = []
lr_accuracies = []

for train_index, test_index in kf.split(train_x):
    # Slice out this fold's train and validation rows.
    x_fold_train, x_fold_test = train_x[train_index], train_x[test_index]
    y_fold_train, y_fold_test = train_y[train_index], train_y[test_index]

    unigram_train_x, unigram_train_y = get_unigram_data(x_fold_train, y_fold_train)
    unigram_test_x, unigram_test_y = get_unigram_data(x_fold_test, y_fold_test)

    # NOTE(review): the bigram and trigram sets are built but not used by any
    # model in this cell.
    bigram_train_x, bigram_train_y = get_bigram_data(x_fold_train, y_fold_train)
    bigram_test_x, bigram_test_y = get_bigram_data(x_fold_test, y_fold_test)

    trigram_train_x, trigram_train_y = get_trigram_data(x_fold_train, y_fold_train)
    trigram_test_x, trigram_test_y = get_trigram_data(x_fold_test, y_fold_test)
    
    # get_unigram_data returns the labels as an (n, 1) column; presumably that
    # is what triggers the column_or_1d warnings in the cell output.
    svc_unigram = SVC(gamma='auto').fit(unigram_train_x, unigram_train_y)
    unigram_predicted = svc_unigram.predict(unigram_test_x).astype(int)
    # Map predicted and actual class indices back through the encoder.
    decoded_pred = le.inverse_transform(unigram_predicted)
    decoded_actual = le.inverse_transform(unigram_test_y.astype(int))
    
    # Mean squared difference between decoded predictions and decoded labels.
    print ("MSE: ", np.sum((decoded_pred - decoded_actual)**2)/float(len(unigram_test_y)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


MSE:  35.62890625
MSE:  43.859375
MSE:  43.9140625
MSE:  38.7734375
MSE:  39.3125


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
