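# Model Predictions

Applies the saved subjectivity and sentiment models to State of the Union speeches, inaugural addresses, Supreme Court decisions, and Department of Justice statements, then exports each dataset with its predictions to CSV.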

In [1]:
import pandas as pd
import numpy as np
import pickle
from gensim.parsing.porter import PorterStemmer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

## Defining Functions

In [2]:
def preprocess(data, stem_data, remove_stopwords, stop_words=None):
    """Clean and tokenize a collection of raw texts.

    Every document is coerced to ``str``, lowercased, has each character that
    is not an ASCII letter replaced by a space, and is then split into tokens
    with ``word_tokenize``. Stop-word removal and stemming are optional and
    are applied in that order.

    Parameters
    ----------
    data : iterable
        Documents to process. Items are passed through ``str()``, so missing
        values such as ``NaN`` or ``None`` become the one-token documents
        ``['nan']`` / ``['none']`` rather than empty lists.
    stem_data : bool
        When true, each remaining token is stemmed with ``PorterStemmer``.
    remove_stopwords : bool
        When true, tokens found in ``stop_words`` are dropped.
    stop_words : iterable of str, optional
        Words to remove when ``remove_stopwords`` is true. Defaults to NLTK's
        English stop-word list, which requires the NLTK ``stopwords`` corpus
        to be installed (``nltk.download('stopwords')``). Ignored when
        ``remove_stopwords`` is false.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    if remove_stopwords:
        if stop_words is None:
            # Imported lazily so the corpus is only needed when filtering is
            # actually requested.
            from nltk.corpus import stopwords
            stop_words = stopwords.words('english')
        # A set makes the per-token membership test constant time.
        stop_words = frozenset(stop_words)

    # The stemmer is only built when it will be used.
    stemmer = PorterStemmer() if stem_data else None

    processed = []
    for file in data:
        # Lowercase, then blank out everything except letters.
        cleaned = re.sub('[^a-zA-Z]', ' ', str(file).lower())
        tokens = word_tokenize(cleaned)

        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]
        if stem_data:
            tokens = [stemmer.stem(token) for token in tokens]

        processed.append(tokens)
    return processed

In [3]:
def plot_graphs(history, string):
    """Plot a metric and its validation counterpart on one figure.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping containing the keys ``string`` and
        ``'val_' + string`` (presumably a Keras ``History`` object; confirm
        against the training notebook).
    string : str
        Name of the metric to plot; also used as the y-axis label.
    """
    validation_key = 'val_'+string
    # Training curve first, validation curve second, matching the legend order.
    for metric_key in (string, validation_key):
        plt.plot(history.history[metric_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()

## Prediction Data Imports

In [4]:
# Load the formatted State of the Union data and add a tokenized copy of its
# 'Text' column (no stemming, stop words kept).
sotu = pd.read_csv(
    '../data/prediction_data/formatted_pred_data/sotu_formatted.csv'
)
sotu['processed_no_stem'] = preprocess(
    sotu['Text'],
    stem_data=False,
    remove_stopwords=False,
)

In [5]:
# Load the formatted inaugural address data and add a tokenized copy of its
# 'text' column (no stemming, stop words kept).
inaug = pd.read_csv(
    '../data/prediction_data/formatted_pred_data/inaugural_formatted.csv'
)
inaug['processed_no_stem'] = preprocess(
    inaug['text'],
    stem_data=False,
    remove_stopwords=False,
)

In [6]:
# Load the formatted Supreme Court data and add a tokenized copy of its
# 'text' column (no stemming, stop words kept).
court = pd.read_csv(
    '../data/prediction_data/formatted_pred_data/court_formatted.csv'
)
court['processed_no_stem'] = preprocess(
    court['text'],
    stem_data=False,
    remove_stopwords=False,
)

In [7]:
# Load the formatted Department of Justice data and add a tokenized copy of
# its 'contents' column (no stemming, stop words kept).
doj = pd.read_csv(
    '../data/prediction_data/formatted_pred_data/doj_formatted.csv'
)
doj['processed_no_stem'] = preprocess(
    doj['contents'],
    stem_data=False,
    remove_stopwords=False,
)

## Subjectivity 

In [8]:
# Subjectivity model settings.
# Only the max_length / padding / truncation values are read by the padding
# cells in this notebook; the remaining names are not referenced in the
# visible cells (presumably kept to mirror the training setup; confirm).

max_length_subj_obj = 200
padding_type_subj_obj = trunc_type_subj_obj = 'post'

vocab_size_subj_obj = 5000
embedding_dimension_subj_obj = 8
oov_tok_subj_obj = '<OOV>'

In [9]:
# Restore the saved subjectivity model and the tokenizer pickled alongside it.
# NOTE: pickle.load can execute arbitrary code, so only load tokenizer files
# that come from a trusted source.

model_subj_obj = load_model(
    '../data/train_data/subjectivity/subj_obj_model.h5'
)

with open(
    '../data/train_data/subjectivity/tokenizer_subj_obj.pickle', 'rb'
) as handle:
    tokenizer_subj_obj = pickle.load(handle)



***State of the Union Speeches***

In [10]:
# Convert the tokenized State of the Union speeches to integer sequences with
# the subjectivity tokenizer, then pad or truncate each one to
# max_length_subj_obj.

sotu_sequences_subj_obj = tokenizer_subj_obj.texts_to_sequences(
    sotu['processed_no_stem']
)
sotu_padded_subj_obj = pad_sequences(
    sotu_sequences_subj_obj,
    maxlen=max_length_subj_obj,
    padding=padding_type_subj_obj,
    truncating=trunc_type_subj_obj,
)

In [11]:
# Run the subjectivity model on the padded State of the Union sequences and
# store its raw output as a new column on the sotu frame.

sotu_subj_obj_preds = model_subj_obj.predict(
    sotu_padded_subj_obj
)
sotu['is_subjective_preds'] = sotu_subj_obj_preds

***Department of Justice Statements***

In [12]:
# Convert the tokenized DOJ statements to integer sequences with the
# subjectivity tokenizer, then pad or truncate each one to
# max_length_subj_obj.

doj_sequences_subj_obj = tokenizer_subj_obj.texts_to_sequences(
    doj['processed_no_stem']
)
doj_padded_subj_obj = pad_sequences(
    doj_sequences_subj_obj,
    maxlen=max_length_subj_obj,
    padding=padding_type_subj_obj,
    truncating=trunc_type_subj_obj,
)

In [13]:
# Run the subjectivity model on the padded DOJ sequences and store its raw
# output as a new column on the doj frame.

doj_subj_obj_preds = model_subj_obj.predict(
    doj_padded_subj_obj
)
doj['is_subjective_preds'] = doj_subj_obj_preds

***Supreme Court Decisions***

In [14]:
# Convert the tokenized Supreme Court decisions to integer sequences with the
# subjectivity tokenizer, then pad or truncate each one to
# max_length_subj_obj.

court_sequences_subj_obj = tokenizer_subj_obj.texts_to_sequences(
    court['processed_no_stem']
)
court_padded_subj_obj = pad_sequences(
    court_sequences_subj_obj,
    maxlen=max_length_subj_obj,
    padding=padding_type_subj_obj,
    truncating=trunc_type_subj_obj,
)

In [15]:
# Run the subjectivity model on the padded Supreme Court sequences and store
# its raw output as a new column on the court frame.

court_subj_obj_preds = model_subj_obj.predict(
    court_padded_subj_obj
)
court['is_subjective_preds'] = court_subj_obj_preds

***Inaugural Addresses***

In [16]:
# Convert the tokenized inaugural addresses to integer sequences with the
# subjectivity tokenizer, then pad or truncate each one to
# max_length_subj_obj.

inaug_sequences_subj_obj = tokenizer_subj_obj.texts_to_sequences(
    inaug['processed_no_stem']
)
inaug_padded_subj_obj = pad_sequences(
    inaug_sequences_subj_obj,
    maxlen=max_length_subj_obj,
    padding=padding_type_subj_obj,
    truncating=trunc_type_subj_obj,
)

In [17]:
# Run the subjectivity model on the padded inaugural address sequences and
# store its raw output as a new column on the inaug frame.

inaug_subj_obj_preds = model_subj_obj.predict(
    inaug_padded_subj_obj
)
inaug['is_subjective_preds'] = inaug_subj_obj_preds

## Sentiment

In [18]:
# Sentiment model settings.
# Only the max_length / padding / truncation values are read by the padding
# cells in this notebook; the remaining names are not referenced in the
# visible cells (presumably kept to mirror the training setup; confirm).

max_length_sent = 1000
padding_type_sent = trunc_type_sent = 'post'

vocab_size_sent = 5000
embedding_dimension_sent = 10
oov_tok_sent = '<OOV>'

In [19]:
# Restore the saved sentiment model and the tokenizer pickled alongside it.
# NOTE: pickle.load can execute arbitrary code, so only load tokenizer files
# that come from a trusted source.

model_sent = load_model(
    '../data/train_data/sentiment/sent_model.h5'
)

with open(
    '../data/train_data/sentiment/tokenizer_sent.pickle', 'rb'
) as handle:
    tokenizer_sent = pickle.load(handle)



***State of the Union Addresses***

In [20]:
# Convert the tokenized State of the Union speeches to integer sequences with
# the sentiment tokenizer, then pad or truncate each one to max_length_sent.

sotu_sequences_sent = tokenizer_sent.texts_to_sequences(
    sotu['processed_no_stem']
)
sotu_padded_sent = pad_sequences(
    sotu_sequences_sent,
    maxlen=max_length_sent,
    padding=padding_type_sent,
    truncating=trunc_type_sent,
)

In [21]:
# Run the sentiment model on the padded State of the Union sequences and
# store its raw output as a new column on the sotu frame.

sotu_sent_preds = model_sent.predict(
    sotu_padded_sent
)
sotu['is_positive_preds'] = sotu_sent_preds

***Department of Justice Statements***

In [22]:
# Convert the tokenized DOJ statements to integer sequences with the
# sentiment tokenizer, then pad or truncate each one to max_length_sent.

doj_sequences_sent = tokenizer_sent.texts_to_sequences(
    doj['processed_no_stem']
)
doj_padded_sent = pad_sequences(
    doj_sequences_sent,
    maxlen=max_length_sent,
    padding=padding_type_sent,
    truncating=trunc_type_sent,
)

In [23]:
# Run the sentiment model on the padded DOJ sequences and store its raw
# output as a new column on the doj frame.

doj_sent_preds = model_sent.predict(
    doj_padded_sent
)
doj['is_positive_preds'] = doj_sent_preds

***Supreme Court Decisions***

In [24]:
# Convert the tokenized Supreme Court decisions to integer sequences with the
# sentiment tokenizer, then pad or truncate each one to max_length_sent.

court_sequences_sent = tokenizer_sent.texts_to_sequences(
    court['processed_no_stem']
)
court_padded_sent = pad_sequences(
    court_sequences_sent,
    maxlen=max_length_sent,
    padding=padding_type_sent,
    truncating=trunc_type_sent,
)

In [25]:
# Run the sentiment model on the padded Supreme Court sequences and store its
# raw output as a new column on the court frame.

court_sent_preds = model_sent.predict(
    court_padded_sent
)
court['is_positive_preds'] = court_sent_preds

***Inaugural Addresses***

In [26]:
# Convert the tokenized inaugural addresses to integer sequences with the
# sentiment tokenizer, then pad or truncate each one to max_length_sent.

inaug_sequences_sent = tokenizer_sent.texts_to_sequences(
    inaug['processed_no_stem']
)
inaug_padded_sent = pad_sequences(
    inaug_sequences_sent,
    maxlen=max_length_sent,
    padding=padding_type_sent,
    truncating=trunc_type_sent,
)

In [27]:
# Run the sentiment model on the padded inaugural address sequences and store
# its raw output as a new column on the inaug frame.

inaug_sent_preds = model_sent.predict(
    inaug_padded_sent
)
inaug['is_positive_preds'] = inaug_sent_preds

## Prediction Exports

In [28]:
# Write the State of the Union frame to CSV without the index.
sotu.to_csv(
    '../data/prediction_data/predictions/sotu.csv',
    index=False,
)

In [29]:
# Write the inaugural address frame to CSV without the index.
inaug.to_csv(
    '../data/prediction_data/predictions/inaug.csv',
    index=False,
)

In [30]:
# Write the Supreme Court frame to CSV without the index.
court.to_csv(
    '../data/prediction_data/predictions/court.csv',
    index=False,
)

In [31]:
# Write the Department of Justice frame to CSV without the index.
doj.to_csv(
    '../data/prediction_data/predictions/doj.csv',
    index=False,
)