In [None]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
from scipy.stats import norm
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from gensim.models import Word2Vec
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Load the adjudicated corpus: a tab-separated file with no header row,
# so the four column names are supplied explicitly.
data = pd.read_csv(
    'adjudicated.txt',
    sep='\t',
    header=None,
    names=['ID', 'Adjudicated', 'Label', 'Text'],
)
data.head()

Unnamed: 0,ID,Adjudicated,Label,Text
0,1,adjudicated,Adolescent,Summary: Helen Hunt Jackson is probably most f...
1,2,adjudicated,Adult,Summary: Dr. Woodson describes the internal mi...
2,3,adjudicated,Child,"Summary: In the summer, Don and Joyce stay on ..."
3,4,adjudicated,Young Adult,"Summary: ""But the Knyght was a little less tha..."
4,5,adjudicated,Adult,Summary: The young Niel Herbert idolizes Maria...


In [None]:
import os

# 60/20/20 split: hold out 40% of the rows, then halve that hold-out into
# dev and test. Fixed seeds keep the split reproducible across runs.
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('splits', exist_ok=True)

train_data.to_csv('splits/train.txt', sep='\t', index=False, header=False)
dev_data.to_csv('splits/dev.txt', sep='\t', index=False, header=False)
test_data.to_csv('splits/test.txt', sep='\t', index=False, header=False)

In [None]:
# Row counts of the three splits, in train / dev / test order.
num_training_data, num_dev_data, num_test_data = (
    len(split) for split in (train_data, dev_data, test_data)
)

print(num_training_data, num_dev_data, num_test_data)

300 100 100


In [None]:
def load_ordinal_data(filename, ordering):
    """Read a tab-separated split file and build ordinal threshold targets.

    Each line is split on tabs; column 2 (whitespace-stripped) is the label
    and column 3 is the text, kept exactly as read (including any trailing
    newline).

    Args:
        filename: Path of the UTF-8 encoded, tab-separated file to read.
        ordering: List of label values, lowest rank first.

    Returns:
        A tuple ``(X, Y, orig_Y)`` where ``X`` is the list of texts,
        ``Y`` is a list with one target list per entry of ``ordering``
        (``Y[i][j]`` is 1 when example ``j``'s label ranks strictly above
        ``ordering[i]``, else 0), and ``orig_Y`` is the list of labels.

    Raises:
        IndexError: If a line has fewer than four tab-separated columns.
        ValueError: If a label is not present in ``ordering``.
    """
    texts = []
    labels = []
    # One binary target column per ordinal value (the last one is always 0,
    # since no label can rank above the highest value).
    threshold_targets = [[] for _ in ordering]

    with open(filename, encoding="utf-8") as handle:
        for row in handle:
            fields = row.split("\t")
            label = fields[2].strip()
            text = fields[3]

            texts.append(text)

            rank = ordering.index(label)
            for threshold, column in enumerate(threshold_targets):
                column.append(1 if rank > threshold else 0)
            labels.append(label)

    return texts, threshold_targets, labels

In [None]:
class OrdinalClassifier:
    """Ordinal classifier built from one binary logistic regression per threshold.

    With ``K = len(ordinal_values)`` ordered labels, ``K - 1`` binary models
    are trained; model ``i`` is fit on the target list ``trainY[i]`` and its
    positive-class probability is used in ``test`` as P(label ranks above
    ``ordinal_values[i]``). Per-class probabilities are obtained from
    differences of consecutive threshold probabilities.

    Args:
        ordinal_values: Ordered list of label values, lowest rank first.
        feature_method: Callable taking one text and returning a mapping
            ``{feature: value}``.
        trainX, devX, testX: Iterables of raw texts; each is featurized and
            converted to a sparse matrix on construction.
        trainY, devY, testY: Per-threshold binary target lists; index ``i``
            holds the targets for threshold ``i`` (only the first ``K - 1``
            are used by ``train``).
        orig_trainY, orig_devY, orig_testY: Original label values per example;
            ``orig_testY`` is used to score predictions in ``test``.
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        self.ordinal_values = ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # A feature is kept only if it occurs in at least this many training examples.
        self.min_feature_count = 2
        # One slot per threshold between consecutive ordinal values.
        self.log_regs = [None] * (len(self.ordinal_values) - 1)

        self.trainY = trainY
        self.devY = devY
        self.testY = testY

        self.orig_trainY = orig_trainY
        self.orig_devY = orig_devY
        self.orig_testY = orig_testY

        # The training pass must run first: it builds the feature vocabulary
        # that the dev and test passes reuse.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of results."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training=False):
        """Featurize texts and return them as a sparse (documents x features) matrix.

        Args:
            X_data: Iterable of raw texts.
            training: When True, (re)build ``feature_vocab`` from this data,
                keeping features seen in at least ``min_feature_count``
                examples. When False, the existing vocabulary is used and
                unknown features are ignored.

        Returns:
            A ``scipy.sparse.dok_matrix`` with one row per text and one column
            per vocabulary feature.
        """
        data = self.featurize(X_data)

        if training:
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            fid = 0
            for feat in feature_doc_count:
                if feature_doc_count[feat] >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        F = len(self.feature_vocab)
        D = len(data)
        X = sparse.dok_matrix((D, F))
        for idx, feats in enumerate(data):
            for feat in feats:
                if feat in self.feature_vocab:
                    X[idx, self.feature_vocab[feat]] = feats[feat]

        return X

    def train(self):
        """Fit one logistic regression per threshold, selecting C by dev accuracy.

        For each threshold, models with C in {0.1, 1, 10, 100} are trained and
        the one with the highest accuracy on the dev set is stored in
        ``log_regs``. Ties keep the earliest (smallest C) candidate.
        """
        for idx in range(len(self.ordinal_values) - 1):
            best_dev_accuracy = None
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C=C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Always accept the first candidate so a model is stored even
                # when every candidate scores 0.0 on dev; afterwards only a
                # strictly better score replaces the current best.
                if best_model is None or development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model

    def test(self):
        """Return accuracy of the combined ordinal prediction on the test set.

        Each threshold model contributes P(label ranks above threshold i).
        Class probabilities are derived as:
            P(class 0)     = 1 - P(> 0)
            P(class k)     = P(> k-1) - P(> k)      for 0 < k < K-1
            P(class K-1)   = P(> K-2)
        and the predicted class is the argmax.

        Raises:
            ZeroDivisionError: If the test set is empty.
        """
        num_classes = len(self.ordinal_values)

        # Shape (K-1, num_test_examples): positive-class probability per threshold.
        preds = np.array([model.predict_proba(self.testX)[:, 1] for model in self.log_regs])

        cor = tot = 0
        for data_point in range(len(preds[0])):
            ordinal_preds = np.zeros(num_classes)
            ordinal_preds[0] = 1 - preds[0][data_point]
            for ordinal in range(1, num_classes - 1):
                ordinal_preds[ordinal] = preds[ordinal - 1][data_point] - preds[ordinal][data_point]
            ordinal_preds[num_classes - 1] = preds[len(preds) - 1][data_point]

            prediction = np.argmax(ordinal_preds)
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor += 1
            tot += 1

        return cor / tot

In [None]:
# Experiment note: stopwords .44 -> .47 (presumably the accuracy before and
# after adding stopword removal -- confirm which split this was measured on).
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# English stopword list as a set for O(1) membership checks.
stop_words = set(stopwords.words('english'))

#def binary_bow_featurize(text):
#    lemmatizer =  WordNetLemmatizer()
#    stop_words = set(stopwords.words('english'))
#    feats = {}
#    words = nltk.word_tokenize(text)
#
#    for word in words:
#        word=word.lower()
#        if word not in stop_words:
#          lemma = lemmatizer.lemmatize(word)
#          feats[lemma]=1

#    return feats

# Load the ELMo module (version 3) from its TensorFlow Hub URL; this object is
# what gets passed to featurize_with_elmo as `elmo_model`.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

def featurize_with_elmo(texts, elmo_model):
    """Embed each text as a single mean-pooled ELMo vector.

    Stopwords (module-level ``stop_words``, compared case-insensitively) are
    removed from the whitespace-tokenized text before embedding.

    Args:
        texts: Iterable of raw text strings.
        elmo_model: Loaded TF Hub ELMo module exposing a ``"default"``
            signature whose output dict contains an ``"elmo"`` entry.

    Returns:
        A NumPy array with one row per input text and one column per
        embedding dimension, i.e. shape ``(len(texts), embedding_dim)``.

    Raises:
        ValueError: If ``texts`` is empty (``np.stack`` needs at least one array).
    """
    embeddings_list = []
    for text in texts:
        filtered_words = [word for word in text.split() if word.lower() not in stop_words]
        filtered_text = ' '.join(filtered_words)

        # The text is sent as a batch of one; per the TF Hub ELMo documentation
        # the "elmo" output is (batch, num_tokens, dim).
        embeddings = elmo_model.signatures["default"](tf.constant([filtered_text]))["elmo"]
        # Average over the token axis of the single batch item. Averaging over
        # axis 0 of the full tensor would only collapse the size-1 batch axis,
        # leaving a (num_tokens, dim) matrix whose shape differs per text and
        # therefore cannot be stacked.
        embeddings_list.append(tf.reduce_mean(embeddings[0], axis=0))
    return np.stack(embeddings_list)


def confidence_intervals(accuracy, n, significance_level):
    """Normal-approximation confidence interval for an accuracy estimate.

    Args:
        accuracy: Observed accuracy (proportion correct).
        n: Number of evaluated examples.
        significance_level: Despite the name, this is used as the confidence
            level: passing 0.95 puts (1 - 0.95) / 2 = 0.025 in each tail.

    Returns:
        Tuple ``(lower, upper)`` = accuracy -/+ z * sqrt(accuracy * (1 - accuracy) / n).
        The bounds are not clipped to [0, 1].
    """
    # Probability mass left in each tail of the standard normal.
    tail_mass = (1 - significance_level) / 2
    # ppf of the lower tail is negative; negate to get the positive z-score.
    z_score = -norm.ppf(tail_mass)
    half_width = z_score * np.sqrt((accuracy * (1 - accuracy)) / n)
    return accuracy - half_width, accuracy + half_width

def run(trainingFile, devFile, testFile, ordinal_values):
    """Train and evaluate the ELMo-based ordinal classifier, printing test accuracy.

    Args:
        trainingFile: Path of the training split (tab-separated).
        devFile: Path of the development split, used for model selection.
        testFile: Path of the test split, used for the reported accuracy.
        ordinal_values: Ordered list of label values, lowest rank first.

    Prints the test accuracy of the dev-selected models together with a 95%
    normal-approximation confidence interval.
    """
    trainX, trainY, orig_trainY = load_ordinal_data(trainingFile, ordinal_values)
    devX, devY, orig_devY = load_ordinal_data(devFile, ordinal_values)
    testX, testY, orig_testY = load_ordinal_data(testFile, ordinal_values)

    def elmo_feature_method(text):
        """Return one text's ELMo embedding as a {dimension_index: value} mapping."""
        embedding = np.asarray(featurize_with_elmo([text], elmo))
        # Collapse every leading axis (batch and, if present, tokens) by
        # averaging, so each document yields exactly one fixed-length vector.
        pooled = embedding.reshape(-1, embedding.shape[-1]).mean(axis=0)
        return {dim: float(value) for dim, value in enumerate(pooled)}

    # OrdinalClassifier takes a per-text feature function plus the raw texts
    # and featurizes them itself; passing precomputed arrays in place of the
    # callable shifts every argument by one and fails with a TypeError.
    simple_classifier = OrdinalClassifier(
        ordinal_values,
        elmo_feature_method,
        trainX, trainY,
        devX, devY,
        testX, testY,
        orig_trainY, orig_devY, orig_testY,
    )
    simple_classifier.train()
    accuracy = simple_classifier.test()

    # testY[0] holds one target per test example, so its length is the test-set size.
    lower, upper = confidence_intervals(accuracy, len(testY[0]), 0.95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))



# Smoke test: embed a single sample summary and show the resulting array.
short_text = "Summary:Helen Hunt Jackson is probably most famous for her work on behalf of Native Americans' rights. However, this short volume presents a sonnet for each month of the year, devoted simply and beautifully to the shifting wonder of nature through the seasons.First:Still lie the sheltering snows, undimmed and white;And reigns the winter's pregnant silence still;No sign of spring, save that the catkins fill,Andwillow stems grow daily red and bright.These are the days when ancients held a riteOf expiation for the old year's ill,And prayer to purify the new year's will:Fit days, ere yet the spring rains blur the sight,Ere yet the bounding blood grows hot with haste,And dreaming thoughts grow heavy with a greedThe arden t summer's joy to have and taste;Fit days, to give to last year's losses heed,To reckon clear the new life's sterner need;Fit days, for Feast of Expiation placed Random:Still lie the sheltering snows, und"
# The featurizer expects a list of texts, so wrap the single string.
elmo_embeddings_short = featurize_with_elmo([short_text], elmo)
print(elmo_embeddings_short)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[[[-0.36306384 -0.46500692  0.02209128 ...  0.01264238  0.225127
   -0.41325927]
  [-0.07679628 -0.7409989   0.13116807 ... -0.13802241  0.23854387
   -0.3249876 ]
  [-0.40705174  0.12861726  0.02373165 ... -0.6150391   0.01650539
   -0.4929254 ]
  ...
  [-0.12258971 -0.32924876 -0.37300634 ... -0.19395804  0.3418563
    0.03798784]
  [ 0.09538865 -0.14077085  0.16110605 ... -0.03321084  0.75753534
    0.4751028 ]
  [ 0.06599832 -0.11553025  0.08054374 ...  0.0118159   0.06909603
    0.27351278]]]


- Bag-of-words (BoW): lemmatization, stopword removal
- Feature engineering: TF-IDF vectorization, word embeddings, part-of-speech (POS) tags
- Architecture: Bi-LSTM, attention mechanisms, BERT
- Self-training

In [None]:
# Label values ordered from lowest to highest rank.
ordinal_values = ["Child", "Adolescent", "Young Adult", "Adult"]

# Split files produced by the train/dev/test split cell.
trainingFile, devFile, testFile = ('splits/train.txt', 'splits/dev.txt', 'splits/test.txt')

run(
    trainingFile=trainingFile,
    devFile=devFile,
    testFile=testFile,
    ordinal_values=ordinal_values,
)

KeyboardInterrupt: 