## Diplomatic Event Rating Predictor
Uses a model generated by an adaption of the Scikit-Learn SVM classifier by Prof. Bengfort, trained on the newspaper articles published 7 days prior to the events.<br>
Pipeline for the model was: NLTK Preprocessor, TF-IDF Vectorizer, SGD Classifier<br>
See https://github.com/bbengfort/bbengfort.github.io/blob/master/_posts/2016-05-19-text-classification-nltk-sckit-learn.md and https://github.com/georgetown-analytics/New-Levant-Times/Analysis/NLTK_Rating_Classifier.ipynb
<br>Local files needed:<br>
1) RatingsandText_EventDate.csv, or csv in the format EventDate, FinalRating, Title, Text<br>
2) model pickle

In [16]:
import os
import string
import pickle
import pandas as pd
import numpy as np
from scipy import stats

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Define utilities

def timeit(func):
    """
    Simple timing decorator
    """
    def wrapper(*args, **kwargs):
        start  = time.time()
        result = func(*args, **kwargs)
        delta  = time.time() - start
        return result, delta
    return wrapper


def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg

### NLTK Preprocessor
A custom stopword list is added to nltk standard 'english' stopwords

In [18]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, lemmatization, and
    other normalization and filtering techniques.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which make load corpora, models, or do
        other time-intenstive NLTK data loading.
        """
        stoplist = sw.words('english')
        additionalsw = [u'one', u'first', u'two', u'second', u'three', u'third', u'four',
                        u'fourth', u'five', u'fifth', u'six', u'sixth', u'seven', u'seventh', 
                        u'eight', u'eighth', u'nine', u'ninth', u'ten', u'tenth', u'rsquo', 
                        u'rdquo', u'lsquo', u'ldquo', u'jan', u'january', u'feb', u'february',
                        u'mar', u'march', u'apr', u'april', u'may', u'jun', u'june', u'jul', u'july', 
                        u'aug', u'august', u'sept', u'september', u'oct', u'october', u'nov', 
                        u'november', u'dec', u'december']
        stoplist += additionalsw

        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(stopwords) if stopwords else set(stoplist)
        self.punct      = set(punct) if punct else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

# Use the given model to predict event ratings for articles

## Set filenames and default values

In [19]:
# Event ratings and meanings
labeldict = { -1: 'unknown', 0: '0-friendly', 1: '1-neutral', 2: '2-threatening', 3: '3-aggressive'}

# Set to one of the positive values above to use a default rating
teameventrating = -1 

#filename or path of article text data csv in the format EventDate, FinalRating (= TeamRating), Title, Text
articledata = 'RatingsandText_EventDate.csv'

# model pickle name
PATH = "48Kmodel.pickle"

## Get the article text and final team ratings (team ratings might be unknown)
Multiple articles will be associated with a single event date and team rating

In [20]:
datatable = pd.read_csv(articledata, encoding='utf-8')
datatable.sort_values(by='EventDate')

datatable

Unnamed: 0,EventDate,FinalRating,Title,Text
0,2011-01-31,2,Iran's foreign minister meets Syrian president,Text of report in English by Iranian news chan...
1,2011-01-31,2,Analysis: Obama's State of the Union and US fo...,He will likely mention foreign issues and is u...
2,2011-01-31,2,Israel alert to spillover of violence in Lebanon,NORTHERN ISRAELI BORDER -- Security officials ...
3,2011-01-31,2,Canada gauging whether new Lebanon government ...,cclark@globeandmail.com How much of a banned t...
4,2011-01-31,2,Congress mulls aid cut if Hizbullah controls g...,WASHINGTON - Continuing US aid to Lebanon will...
5,2011-01-31,2,Bahrain calls for Arab summit to address regio...,Bahrain called Thursday for an Arab summit to ...
6,2011-01-31,2,Obama's gentle shift of policy comes with 'gre...,After decades of American policy predicated on...
7,2011-01-31,2,Obama administration and Egypt,A rare foreign-policy principals meeting will ...
8,2011-01-31,2,"In time of war, a monument to peace","Just steps from the Lincoln Memorial, a new bu..."
9,2011-03-11,3,Another reshuffle points to Sarkozy's weak pos...,WHEN FRENCH president Nicolas Sarkozy hastily ...


## Create dataframe for prediction
Combine the article titles and text for rating prediction with our model

In [21]:
data = pd.DataFrame(index=None, columns=['Date','Words','TeamRating','PredictedRating'])

for index, row in datatable.iterrows():
    if teameventrating > 0:
        rating = int(teameventrating)
    else:
        rating = int(row['FinalRating'])
    data = data.append({'Date': row['EventDate'],
                        'Words': row['Title'] + ' ' + row['Text'],
                       'TeamRating': labeldict[rating]},
                                      ignore_index=True)

print "Articles to process:",len(data)
data.head(20)

Articles to process: 5425


Unnamed: 0,Date,Words,TeamRating,PredictedRating
0,2011-01-31,Iran's foreign minister meets Syrian president...,2-threatening,
1,2011-01-31,Analysis: Obama's State of the Union and US fo...,2-threatening,
2,2011-01-31,Israel alert to spillover of violence in Leban...,2-threatening,
3,2011-01-31,Canada gauging whether new Lebanon government ...,2-threatening,
4,2011-01-31,Congress mulls aid cut if Hizbullah controls g...,2-threatening,
5,2011-01-31,Bahrain calls for Arab summit to address regio...,2-threatening,
6,2011-01-31,Obama's gentle shift of policy comes with 'gre...,2-threatening,
7,2011-01-31,Obama administration and Egypt A rare foreign-...,2-threatening,
8,2011-01-31,"In time of war, a monument to peace Just steps...",2-threatening,
9,2011-03-11,Another reshuffle points to Sarkozy's weak pos...,3-aggressive,


## Use the model pickle to run a prediction for each article
When this is complete, we will calculate the average and mode of article ratings per event

In [22]:
with open(PATH, 'rb') as f:
    model = pickle.load(f)

f.close()

i=0
while i < len(data):
    texttorate = data.loc[i, 'Words']
    answer = model.predict([texttorate])
    data.loc[i, 'PredictedRating'] = answer[0]
    if i > 0 and not i % 500:
        print "Predictions completed for ", i, "records"
    i += 1

print "Completed predictions:", i

Predictions completed for  500 records
Predictions completed for  1000 records
Predictions completed for  1500 records
Predictions completed for  2000 records
Predictions completed for  2500 records
Predictions completed for  3000 records
Predictions completed for  3500 records
Predictions completed for  4000 records
Predictions completed for  4500 records
Predictions completed for  5000 records
Completed predictions: 5425


In [23]:
# View the rating prediction for each article
data

Unnamed: 0,Date,Words,TeamRating,PredictedRating
0,2011-01-31,Iran's foreign minister meets Syrian president...,2-threatening,1
1,2011-01-31,Analysis: Obama's State of the Union and US fo...,2-threatening,2
2,2011-01-31,Israel alert to spillover of violence in Leban...,2-threatening,2
3,2011-01-31,Canada gauging whether new Lebanon government ...,2-threatening,2
4,2011-01-31,Congress mulls aid cut if Hizbullah controls g...,2-threatening,2
5,2011-01-31,Bahrain calls for Arab summit to address regio...,2-threatening,2
6,2011-01-31,Obama's gentle shift of policy comes with 'gre...,2-threatening,2
7,2011-01-31,Obama administration and Egypt A rare foreign-...,2-threatening,2
8,2011-01-31,"In time of war, a monument to peace Just steps...",2-threatening,2
9,2011-03-11,Another reshuffle points to Sarkozy's weak pos...,3-aggressive,3


## We're done with rating predictions for all articles. 
Calculate the article rating average and mode by event date, for comparison to the original team event rating

In [8]:
data['PredictedRating'] = data['PredictedRating'].astype(int)

individualdates = data.groupby(['Date','TeamRating'])

df = individualdates['PredictedRating'].agg([np.mean, stats.mode]).rename(columns={'mean':'Average','mode': 'Mode'})

df['Average'] = df['Average'].round(0).astype(int)
df[['Mode', 'Count']] = df['Mode'].apply(pd.Series)
df = df.drop('Count', 1)
df = df.rename(columns={'Average': 'Predicted (Average)', 'Mode': 'Predicted (Mode)'})

print "Total events to compare: ", len(df)

Total events to compare:  148


## Compare the original Team Rating to the Predicted Rating (as average or mode)
The average and/or the mode of the ratings by article should be close to the original team rating for that date.

In [9]:
print df

                          Predicted (Average) Predicted (Mode)
Date       TeamRating                                         
2011-01-31 2-threatening                    2              [2]
2011-03-11 3-aggressive                     3              [3]
2011-03-26 0-friendly                       0              [0]
2011-04-07 1-neutral                        1              [1]
2011-04-16 1-neutral                        1              [1]
2011-05-09 2-threatening                    2              [2]
2011-05-16 2-threatening                    2              [2]
2011-05-26 2-threatening                    2              [2]
2011-06-02 2-threatening                    2              [2]
2011-06-13 3-aggressive                     3              [3]
2011-06-20 1-neutral                        1              [1]
2011-07-02 2-threatening                    2              [2]
2011-07-11 2-threatening                    2              [2]
2011-07-22 2-threatening                    2          

In [11]:
# write predictions to file (if needed)

with open('OurRatingsVsPredictions.csv', 'w') as f_new:
    df.to_csv(f_new, header=True, index=True, encoding='utf-8')
    f_new.close()