In [4]:
import os
import numpy as np
import multiprocessing as mp

import nltk

import torch
from torch import nn

import torchtext.data as data
import torchtext.vocab as vocab
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import gensim
import gensim.downloader as api

  warn(f"Failed to load image Python extension: {e}")


In [39]:
a = np.ones(5)
('"%s",'*5)%tuple(a)

'"1.0","1.0","1.0","1.0","1.0",'

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)

print(mp.cpu_count(),' CPUs available')

Device: cuda
16  CPUs available


In [23]:
# read a text file
filelist = os.listdir('data/train')
filepath = 'data/train/'+filelist[50]

# read the raw text file
print('opening the file:', filepath)
with open(filepath, 'r', encoding='utf-8') as file:
    text = file.read()

opening the file: data/train/00D6341E8688.txt


## Essay Metrics
Various measurements taken for a given set of text.

### Burstiness

In [24]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/rachel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rachel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def token_word(essay):
    tokens = nltk.word_tokenize(essay.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens
def token_sent(essay):
    tokens = nltk.sent_tokenize(essay.lower())
    tokens = [token for token in tokens if token not in string.punctuation]
    return tokens

# The following function calculates burstiness of an essay
def burstiness(essay):
    sentences = token_sent(essay)
    num_words   = len(token_word(essay))  #Total number of words in text
    num_sents   = len(sentences)  #Total number of sentences in text
    avg_freq = num_words/num_sents #Average number of words per sentence 
    variance = sum((len(sentence.split()) - avg_freq) ** 2 for sentence in sentences) / len(sentences)
    return variance

In [26]:
burstiness(text)

103.63905325443787

### Seniment

In [27]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [28]:
sia_scores = sia.polarity_scores(text)
sia_scores

{'neg': 0.065, 'neu': 0.845, 'pos': 0.09, 'compound': 0.8956}

### Put it all in an Array

In [29]:
metrics = np.array([burstiness(text), sia_scores['pos'] , sia_scores['neg'], sia_scores['neu'], sia_scores['compound']])
print(metrics)

[1.03639053e+02 9.00000000e-02 6.50000000e-02 8.45000000e-01
 8.95600000e-01]


## Preprocessing
Numbers to words https://github.com/collin5/python-n2w

### Embeddings for text
Can use Pre-trained embeddings from gensim (https://www.scaler.com/topics/pytorch/text-representation-pytorch/)

Let's load up a big trained version of word2vec

In [30]:
%%time
w2v = api.load('word2vec-google-news-300')

CPU times: user 21.6 s, sys: 687 ms, total: 22.3 s
Wall time: 22.4 s


### Spell corrections and punctuation removal

In [31]:
# tokenize into words and remove punctuation
# using the pre-processing from the spell-check code
from nltk.tokenize import RegexpTokenizer
import contractions #library pertaining to contractions (things like "don't" and "you're")
from nltk.metrics.distance import jaccard_distance #distance we'll use to find the nearest correct word
from nltk.util import ngrams

nltk.download('words') #nltk's collection of words
from nltk.corpus import words 
from nltk.metrics.distance import jaccard_distance #distance we'll use to find the nearest correct word
from nltk.util import ngrams

import n2w

[nltk_data] Downloading package words to /home/rachel/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [32]:
# The following code loads the list of most common english words and adds numbers to it as I assume 
# we don't want to count the use of numbers as a spelling mistake.

words_into_list = words.words() #this is a text file with 1 word per line
words_into_list = words_into_list+[str(i) for i in range(0,1000000)] #add numbers to the list
words_lower = [word.lower() for word in words_into_list] #we will make all words lowercase
word_set = set(words_lower)
word_arr = np.array(list(word_set))

def preprocess_spelling(essay):
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+') #This tokenises strings that consist of characters and number, i.e. it removes other symbols
    text_no_contr = contractions.fix(essay)  #Expands all contractions in essays. For example, converts "you're" to "you are".
    words_no_punct= tokenizer.tokenize(text_no_contr.lower())  #Removes all non-letter and non-number symbols. Also makes everything lowercase.
    return words_no_punct


def corrected(essay):
    errors=[]
    clean_essay = preprocess_spelling(essay)
    correct_essay_words = clean_essay
    for word_index in range(0,len(clean_essay)): #loop through words in each essay
        word = clean_essay[word_index]
        if word not in word_set: #if a word is not contained in our word_set, correct it using jaccard
            temp = [(jaccard_distance(set(ngrams(word, 2)), set(ngrams(w, 2))),w) 
            for w in word_set if w[0]==word[0]] 
            correct_word = sorted(temp, key = lambda val:val[0])[0][1] #corrected word
            correct_essay_words[word_index] = correct_word
    correct_essay = " ".join(correct_essay_words)
    return correct_essay

In [33]:
words_into_list = words.words() #this is a text file with 1 word per line
#words_into_list = words_into_list+[str(i) for i in range(0,1000000)] #add numbers to the list
words_lower = [word.lower() for word in words_into_list] #we will make all words lowercase
word_set = set(words_lower)
word_arr = np.array(list(word_set))


def preprocess(essay, word_tokenize=False):
    """
    Do a lot of the pre-processing of the text
    input:
        essay (string)
    output:
        correct_essay_words (list of strings) cleaned/corrected essay words in order
    """
    
    clean_essay = preprocess_spelling(essay)
    correct_essay_words = clean_essay
    
    # use numpy to avoid the nested loops
    
    # find the indices where the words arent in the word set
    missing_word_inds = np.where(~np.isin(np.array(clean_essay),word_arr))[0]
    
    for word_index in missing_word_inds:
        word = clean_essay[word_index]

        
        try: 
            # first check if the word is in word2vec
            w2v.get_vector(word)
        except KeyError:
            # check if it's a number first
            num = n2w.convert(word)
            if num=='Input not a valid number':
                temp = [(jaccard_distance(set(ngrams(word, 2)), set(ngrams(w, 2))),w) for w in word_set if w[0]==word[0]]
                correct_word = sorted(temp, key = lambda val:val[0])[0][1] #corrected word
                correct_essay_words[word_index] = correct_word
            else:
                correct_essay_words[word_index] = num
            
    if word_tokenize:
        return correct_essay_words
    else:
        correct_essay = " ".join(correct_essay_words)
        return correct_essay

In [34]:
remap = {'a':'the', 'and':'also', 'of':'in', 'to':'at'}

In [35]:
%%time
processed_text = preprocess(text)

processed_text_words = processed_text.split()# split by whitespace
essay_words = [remap[word] if word in remap.keys() else word for word in processed_text_words]

CPU times: user 972 ms, sys: 10 ms, total: 982 ms
Wall time: 982 ms


In [36]:
%%time
vecs = []
missing_words =[]
for word in essay_words:
    try:
        vec = w2v.get_vector(word)
        vecs.append(vec)
    except KeyError:
        # this means that the word isn't in the w2v
        missing_words.append(word)
vecs = np.array(vecs)
print(vecs.shape)

#print all the words that are missing
print('missing', np.unique(np.array(missing_words)))

(462, 300)
missing ['addu' 'aldus' 'bogota' 'eveweed' 'lichenes' 'milliad' 'stret']
CPU times: user 0 ns, sys: 6.87 ms, total: 6.87 ms
Wall time: 6.49 ms


## Define a Model
https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

In [108]:
class Classifier(nn.Module):
    
    def __init__(self, N_text_layers=1, N_text_in = 300, N_text_out=128, use_LSTM=False, N_metrics=5):
        super().__init__()
        self.flatten = nn.Flatten()
        # RNN or LSTM part
        if use_LSTM:
            self.text_read = nn.LSTM(input_size= N_text_in, hidden_size=N_text_out, num_layers=N_text_layers)
        else:
            self.text_read = nn.RNN(input_size= N_text_in, hidden_size=N_text_out, num_layers=N_text_layers)
        # linear NN part
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(N_text_out + N_metrics, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 2), # last layer is outputting probabilities
        )
        
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, metrics):
        #x = self.flatten(x)
        
        # do the RNN or LSTM part
        # TODO: look into adding attention mechanism
        text_out = self.text_read(x)
        
        # concatenate
        x2 = torch.cat((text_out[1], metrics), axis=1)
        # text_out[1] is the final hidden state for each element in the batch.
        
        # put both the metric array 
        logits = self.linear_relu_stack(x2)
        
        # softmax to return probabilities
        return self.softmax(logits)

### Check that we can pass our inputs into our model

In [109]:
# create model
model = Classifier(N_metrics=len(metrics))
model = model.to(device)# put it on gpu
model

Classifier(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (text_read): RNN(300, 128)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=133, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=2, bias=True)
  )
  (softmax): Softmax(dim=1)
)

In [119]:
# convert to pytorch tensors and put them on gpu
text_in = torch.from_numpy(vecs).to(device).reshape(-1,300)
met_in =  torch.from_numpy(metrics).to(device).reshape(1,-1).float()
text_in.shape, met_in.shape

(torch.Size([330, 300]), torch.Size([1, 5]))

In [120]:
model.text_read(text_in)[1].shape

torch.Size([1, 128])

In [121]:
model.forward(text_in,met_in)

tensor([[0.9047, 0.0953]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

That worked and gave us two values between 0 and 1 like we wanted. I want to quickly make use it handles batches correctly.

### Inputting batches

In [137]:
# read a text file
filelist = os.listdir('train')
filepath = 'train/'+filelist[2]

# read the raw text file
print('opening the file:', filepath)
with open(filepath, 'r', encoding='utf-8') as file:
    text2 = file.read()
    
essay_words2 = preprocess_spelling(text2)
print(len(essay_words2))

opening the file: train/000E6DE9E817.txt
274


In [138]:
vecs2 = []
missing_words2 =[]
for word in essay_words2:
    try:
        vec = w2v.get_vector(word)
        vecs2.append(vec)
    except KeyError:
        # this means that there is a misspelling or the word is too generic
        missing_words2.append(word)
vecs2 = np.array(vecs2)
print(vecs2.shape)

# all the words that are missing
np.unique(np.array(missing_words2))

(244, 300)


array(['a', 'and', 'incasing', 'of', 'to'], dtype='<U8')

In [188]:
sia_scores2 = sia.polarity_scores(text2)
metrics2 = np.array([burstiness(text2), sia_scores2['pos'] , sia_scores2['neg'], sia_scores2['neu'], sia_scores2['compound']])

In [196]:
# convert to pytorch tensors and put them on gpu
text_in2 = torch.from_numpy(vecs2).to(device).reshape(-1,300)
met_in2 =  torch.from_numpy(metrics2).to(device).reshape(1,-1).float()

In [197]:
text_in_batch = torch.cat((text_in,text_in2), axis=0)
met_in_batch = torch.cat((met_in, met_in2), axis=0)

In [198]:
text_in.shape, text_in2.shape

(torch.Size([571, 300]), torch.Size([244, 300]))

In [199]:
met_in_batch.shape

torch.Size([2, 5])

In [200]:
text_in_batch.shape

torch.Size([815, 300])

Oh wait, this isn't right because we need to use padding to ensure that the essays are "all the same length" if we want to put things into batches. I'll come back to this, but we have an initial model that can (theoretically) be trained.