In [4]:
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC

import features
import config
import data_loader
import os
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import numpy as np
import eli5
import nltk
from IPython.display import display
import numpy as np

seed_value = 42  # random seed of 42 for all experiments
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here presumably affects only child processes started later, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# Seed NumPy's legacy global random state.
np.random.seed(seed_value)

In [5]:
def train_tfidf(train):
    """Fit a unigram, word-level TF-IDF vectorizer on the training items.

    Every item of ``train`` is turned into a single string with
    ``features.get_words_str`` and the resulting strings are used to fit
    the vectorizer.

    Args:
        train: iterable of items accepted by ``features.get_words_str``
            (presumably the annotated file names returned by
            ``data_loader`` — TODO confirm).

    Returns:
        tuple: ``(vectorizer, X_train)`` — the fitted ``TfidfVectorizer``
        and the matrix produced by ``fit_transform`` on the training strings.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='word', encoding='utf-8')
    train_sentences = [features.get_words_str(x) for x in train]
    X_train = vectorizer.fit_transform(train_sentences)
    return vectorizer, X_train


def make_predictions(annotated_files, vectorizer, svm,
                     rootdir='../../dataset/minNarrative_txtfiles/'):
    """Score every non-annotated text file found under ``rootdir``.

    Walks ``rootdir`` recursively, skips files whose base name appears in
    ``annotated_files``, tokenizes the remaining files, vectorizes them with
    the already-fitted ``vectorizer`` and scores them with ``svm``.

    Args:
        annotated_files: container of file names to skip; membership is
            tested against the bare file name, not the full path.
        vectorizer: fitted vectorizer exposing ``transform``.
        svm: fitted classifier exposing ``predict_proba``.
        rootdir: directory to walk. Defaults to the corpus location that was
            previously hard-coded in this function.

    Returns:
        tuple: ``(preds_with_probs, filenames, X_test)`` — column 1 of
        ``predict_proba`` for each document (the second entry of
        ``svm.classes_``; presumably the narrative class — TODO confirm),
        the matching file paths in the same order, and the vectorized
        test matrix.
    """
    test_sentences = []
    filenames = []

    for subdir, _dirs, files in os.walk(rootdir):
        for name in files:
            if name in annotated_files:  # Only look at non-annotated files
                continue
            filepath = os.path.join(subdir, name)
            # Decode explicitly as UTF-8 (the encoding the vectorizer is configured
            # with) instead of relying on the platform's default encoding.
            with open(filepath, encoding='utf-8') as handle:
                text = handle.read().strip()
            filenames.append(filepath)
            test_sentences.append(' '.join(features.filter_punct(word_tokenize(text))))
    X_test = vectorizer.transform(test_sentences)

    print("Making predictions for {} documents".format(len(filenames)))
    preds_with_probs = svm.predict_proba(X_test)[:, 1]

    return preds_with_probs, filenames, X_test


def SVM_model(X_train, Y):
    """Train a probability-enabled SVC on the TF-IDF training matrix.

    The hyper-parameter grid holds a single combination (linear kernel,
    C=1). The function returns from inside the loop, so only the first
    combination produced by the grid is ever fitted.

    Args:
        X_train: training feature matrix passed to ``SVC.fit``.
        Y: training labels; must expose ``shape`` (its first dimension is
            reported as the document count).

    Returns:
        The fitted classifier (the value returned by ``SVC.fit``).
    """
    search_space = [{'C': [1], 'kernel': ['linear']}]
    classifier = SVC(probability=True)

    for params in ParameterGrid(search_space):
        print("Running for parameters:", params)
        # Apply this combination's hyperparameters before fitting.
        classifier.set_params(**params)

        n_documents = Y.shape[0]
        print("Training SVM TFIDF model with {} documents".format(n_documents))

        # Returns on the first iteration.
        return classifier.fit(X_train, Y)

    
def average_probabilities(preds_with_probs, filenames):
    """Print the mean predicted probability for each genre.

    The genre is the part of the file's base name before the first
    underscore. The ``FOX`` and ``CNN`` prefixes are merged into a single
    ``OPINION`` genre. Genres are reported in order of first appearance.

    Args:
        preds_with_probs: sequence of predicted probabilities, indexed in
            parallel with ``filenames``.
        filenames: sequence of file paths, one per prediction.

    Returns:
        None. One summary line per genre is written to stdout.
    """
    genre_probabilities = dict()
    for i, path in enumerate(filenames):
        # os.path.basename handles whatever separator os.path.join produced,
        # where splitting on '/' only works for POSIX-style paths.
        genre = os.path.basename(path).split('_')[0]
        if genre in ('FOX', 'CNN'):
            genre = 'OPINION'
        genre_probabilities.setdefault(genre, []).append(preds_with_probs[i])

    for genre, probs in genre_probabilities.items():
        avg = round(sum(probs) / len(probs), 3)
        print('Genre {}, {} documents, average predicted probability: {}'.format(genre, len(probs), avg))


In [6]:
# Load the annotated items and their labels (334 documents in the recorded run).
# NOTE(review): the meaning of threshold=2.5 is defined in data_loader — confirm there.
X, Y = data_loader.load_annotated_data(threshold=2.5)

# Fit the TF-IDF vectorizer on the annotated items only
vectorizer, X_train = train_tfidf(X)

# Train the SVM on the TF-IDF matrix of the annotated items
svm = SVM_model(X_train, Y)

# Score the rest of the data set (17k+ documents) with the fitted vectorizer and SVM;
# X is passed so that the annotated files are skipped.
preds_with_probs, filenames, X_test = make_predictions(X, vectorizer, svm)

Loading annotated data from: ../../dataset/MinNarrative_ReaderData_Final.csv
Running for parameters: {'C': 1, 'kernel': 'linear'}
Training SVM TFIDF model with 334 documents
Making predictions for 17372 documents


# Average predicted probability per genre

In [7]:
# Print one summary line per genre (FOX and CNN are reported together as OPINION)
average_probabilities(preds_with_probs, filenames)

Genre PHIL, 538 documents, average predicted probability: 0.216
Genre SCOTUS, 965 documents, average predicted probability: 0.132
Genre APHORISM, 467 documents, average predicted probability: 0.171
Genre OPINION, 1611 documents, average predicted probability: 0.647
Genre BIO, 973 documents, average predicted probability: 0.9
Genre FABLE, 263 documents, average predicted probability: 0.969
Genre SHORT, 488 documents, average predicted probability: 0.973
Genre REDDIT, 980 documents, average predicted probability: 0.98
Genre ROC, 977 documents, average predicted probability: 0.983
Genre BREVIEW, 864 documents, average predicted probability: 0.47
Genre NOVEL19C, 1025 documents, average predicted probability: 0.926
Genre FAIRY, 764 documents, average predicted probability: 0.943
Genre HIST, 1050 documents, average predicted probability: 0.773
Genre FLASH, 877 documents, average predicted probability: 0.952
Genre ACADEMIC-SCIENCE, 967 documents, average predicted probability: 0.086
Genre ACA

# Feature Analysis

In [8]:
# Feature names (vocabulary terms) of the fitted TF-IDF vectorizer, as a plain list;
# passed to eli5 in the cells below.
feature_names = list(vectorizer.get_feature_names_out())

In [9]:
# Get the index of the passages with the highest and lowest predicted degree of narrativity.
# np.argmax / np.argmin return the first occurrence when several passages tie.
highest_degree = np.argmax(preds_with_probs)
lowest_degree = np.argmin(preds_with_probs)
print(filenames[highest_degree])
print(filenames[lowest_degree])

../../dataset/minNarrative_txtfiles/opinions/fox-articles/FOX_janice-dean-coronavirus-in-laws-cuomo.txt
../../dataset/minNarrative_txtfiles/legal-contracts/LEGAL_5S_0_ASIANDRAGONGROUPINC_08_11_2005-EX-10-5-Reseller-Agreement.txt


In [10]:
%%javascript
// Override the output area's scroll check so it always answers "no".
// NOTE(review): IPython.OutputArea is presumably only available in the classic
// Notebook front-end — confirm before relying on this in JupyterLab.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [15]:
def show_weights(fname):
    """Display the eli5 explanation of the SVM's prediction for one file.

    Reads ``fname``, tokenizes it with ``word_tokenize``, re-joins the tokens
    with single spaces and renders ``eli5.explain_prediction`` for the result.
    Relies on the notebook-level ``svm``, ``vectorizer`` and ``feature_names``.

    NOTE(review): ``make_predictions`` additionally applies
    ``features.filter_punct`` before vectorizing, so the text explained here is
    not preprocessed identically — confirm whether that matters.

    Args:
        fname: path of the text file to explain.
    """
    # Decode explicitly as UTF-8 (the encoding the vectorizer is configured with)
    # and close the file before the explanation is computed.
    with open(fname, encoding='utf-8') as handle:
        text = handle.read().strip()
    text = ' '.join(word_tokenize(text))
    display(eli5.explain_prediction(svm, text, vec=vectorizer, feature_names=feature_names, targets=['POS']))

# Probability, path and eli5 explanation for the passage with the lowest predicted probability
print(preds_with_probs[lowest_degree])
print(filenames[lowest_degree])
show_weights(filenames[lowest_degree])

1.0951326855188498e-05
../../dataset/minNarrative_txtfiles/legal-contracts/LEGAL_5S_0_ASIANDRAGONGROUPINC_08_11_2005-EX-10-5-Reseller-Agreement.txt


Contribution?,Feature
0.064,<BIAS>
-2.105,Highlighted in text (sum)


In [17]:
# Probability, path and eli5 explanation for the passage with the highest predicted probability
print(preds_with_probs[highest_degree])
print(filenames[highest_degree])
show_weights(filenames[highest_degree])

0.9999999999999699
../../dataset/minNarrative_txtfiles/opinions/fox-articles/FOX_janice-dean-coronavirus-in-laws-cuomo.txt


Contribution?,Feature
2.741,Highlighted in text (sum)
0.064,<BIAS>


In [18]:
# Model-level view: the SVM's feature weights, labelled with the TF-IDF feature names
eli5.show_weights(svm, feature_names=feature_names)

Weight?,Feature
+1.639,was
+1.634,he
+1.419,had
+1.393,you
+1.308,his
+1.045,she
+1.020,my
+0.958,him
+0.898,said
+0.777,her
