In [3]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re
from numpy import log, mean

required = {'spacy', 'scikit-learn', 'pandas', 'transformers==2.4.1'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)

import spacy
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [4]:
import random
# trying the google translate api
from googletrans import Translator
translator = Translator()
def translate(text):
    return translator.translate(text).text

# need to load russian spacy model
import stanza
from spacy_stanza import StanzaLanguage

snlp = stanza.Pipeline(lang="ru",processors='tokenize')
nlp = StanzaLanguage(snlp)

2020-08-03 09:33:39 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

2020-08-03 09:33:39 INFO: Use device: cpu
2020-08-03 09:33:39 INFO: Loading: tokenize
2020-08-03 09:33:39 INFO: Done loading processors!


In [5]:
# load data
import pickle
rnc = pickle.load(open("/Users/paigelee/Desktop/clancy/code/rnc-v4.pickle", "rb" ) )
print(rnc.keys())

dict_keys(['blogs_2013', 'fiction', 'public', 'science', 'speech'])


In [6]:
# average or median length of excerpt?
import statistics

file_lengths = []
for folder in rnc.keys():
    for file in rnc[folder].keys():
        file_length = len(rnc[folder][file]['contents']['sentencetext'])
        file_lengths.append(file_length)
        
median = statistics.median(file_lengths)
avg = mean(file_lengths)
print('FILE LENGTHS:')
print('median:', median)
print('average:', avg)

FILE LENGTHS:
median: 66.0
average: 179.7679856115108


In [7]:
# transform texts? make excerpts
def make_excerpts(rnc, excerpt_length, shuffled=True):
    excerpts = []
    labels = []

    for folder in rnc.keys():
        for file in rnc[folder].keys():
            metadict = rnc[folder][file]['metadata']
            excerpt = ""
            i = 0
            if 'sex' in metadict.keys():
                contents = rnc[folder][file]['contents']['sentencetext']
                gender = metadict['sex']
                for sentence in contents:
                    if i < excerpt_length:
                        excerpt = excerpt + sentence
                        i += 1
                    else:
                        excerpts.append(excerpt)
                        labels.append(gender)
                        excerpt = ""
                        i = 0
    if shuffled:
        random.seed(4)
        tuples = list(zip(excerpts, labels))
        random.shuffle(tuples)
        excerpts, labels = zip(*tuples)

    return excerpts, labels

In [8]:
excerpts, labels = make_excerpts(rnc, 5)
print('# excerpts:', len(excerpts))
print('# labels:', len(labels))

# excerpts: 9755
# labels: 9755


In [9]:
male_labeled = []
female_labeled = []

# how many male-labeled and female=labeled texts
for index in range(len(labels)):
    if labels[index] == 'муж':
        male_labeled.append(excerpts[index])
    else:
        female_labeled.append(excerpts[index])
print('# male authored texts:', len(male_labeled))
print('# female authored texts:', len(female_labeled))

# male authored texts: 7132
# female authored texts: 2623


In [12]:
# see a random excerpt and its label
i = random.choice(range(len(excerpts)))
sent = excerpts[i]
gender  = labels[i]
print(gender, translate(gender))
print()
print(sent)
print()
print(translate(sent))

жен wives

Всю свою жизнь, а именно 22 года, кот прожил в семье Бабаевых( г. Баку).В 1977 г. дочь Бабаевых Гюльчохра, тогда ещё школьница, принесла домой двухмесячного котёнка, который оказался очень общительным и сообразительным.Вскоре он стал любимцем всей семьи, но наибольшее внимание уделяла ему Гюльчохра Бабаева.Она много времени проводила с животным, постоянно с ним разговаривала.В результате многолетнего речевого( азербайджанский язык) контакта с человеком и тренинга животное научилось адекватно реагировать на обращённые к нему слова и правильно отвечать на вопросы.

All his life, namely 22 years, the cat lived in the Babayev family (Baku). In 1977, the Babayevs' daughter Gulchohra, then still a schoolgirl, brought home a two-month-old kitten, which turned out to be very sociable and quick-witted. Soon he became a favorite of the whole family. But Gulchohra Babayeva paid the most attention to him. She spent a lot of time with the animal, constantly talked to him. As a result of 

**Tokenizer**

In [13]:
# initialize tokenizer
def simple_tokenizer(doc):
    parsed = nlp(doc)
    return([t.lower_ for t in parsed if t.is_alpha])
# testing simple tokenizer
print(simple_tokenizer(sent))

['всю', 'свою', 'жизнь', 'а', 'именно', 'года', 'кот', 'прожил', 'в', 'семье', 'бабаевых', 'баку', 'в', 'дочь', 'бабаевых', 'гюльчохра', 'тогда', 'ещё', 'школьница', 'принесла', 'домой', 'двухмесячного', 'котёнка', 'который', 'оказался', 'очень', 'общительным', 'и', 'сообразительным', 'вскоре', 'он', 'стал', 'любимцем', 'всей', 'семьи', 'но', 'наибольшее', 'внимание', 'уделяла', 'ему', 'гюльчохра', 'много', 'времени', 'проводила', 'с', 'животным', 'постоянно', 'с', 'ним', 'разговаривала', 'в', 'результате', 'многолетнего', 'речевого', 'азербайджанский', 'язык', 'контакта', 'с', 'человеком', 'и', 'тренинга', 'животное', 'научилось', 'адекватно', 'реагировать', 'на', 'обращённые', 'к', 'нему', 'слова', 'и', 'правильно', 'отвечать', 'на', 'вопросы']


**Exploratory analysis on vectors**

In [51]:
## make different vectors
cv = CountVectorizer(tokenizer=simple_tokenizer, max_df=.99, min_df=.005)
tfidf = TfidfVectorizer(tokenizer=simple_tokenizer, max_df=.99, min_df=.005)
nmf = NMF(n_components=10)
lda = LatentDirichletAllocation(n_components=10)

In [52]:
# tfidf for nmf
tfidf_vecs = tfidf.fit_transform(excerpts).toarray()
nmf_vecs = nmf.fit_transform(tfidf_vecs)
# count for lda
count_vecs = cv.fit_transform(excerpts).toarray()
lda_vecs = lda.fit_transform(count_vecs)

In [53]:
tfidf_vecs.shape

(9755, 853)

In [54]:
count_vecs.shape

(9755, 853)

In [18]:
tfidf_vecs.shape

(9755, 87456)

In [58]:
cd vecs_853

/Users/paigelee/Desktop/ismt-117/final-project/vecs_853


In [59]:
# save numpy arrays as csv files
from numpy import savetxt
# define data
# save to csv file
for vecs, name in [(tfidf_vecs, 'tfidf_vecs'), (nmf_vecs, 'nmf_vecs'), (count_vecs, 'count_vecs'), (lda_vecs, 'lda_vecs')]:
    savetxt(name + '.csv', vecs, delimiter=',')

In [78]:
# get most frequent words in count vecs
sum_words = count_vecs.sum(axis=0) 
words_freq = [(word, sum_words[idx]) for word, idx in list(cv.vocabulary_.items())]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
for idx in range(10):
    print(words_freq[idx])

('и', 19036)
('в', 16744)
('не', 9388)
('на', 7664)
('что', 6333)
('с', 5776)
('а', 4841)
('я', 4688)
('как', 3769)
('это', 3301)


In [101]:
# summing over female authored texts
female_sum_words = tfidf_vecs[is_female].sum(axis=0)
female_words_freq = [(word, female_sum_words[idx]) for word, idx in list(tfidf.vocabulary_.items())]
female_words_freq = sorted(female_words_freq, key = lambda x: x[1], reverse=True)
# summing over male authored texts
male_sum_words = tfidf_vecs[~is_female].sum(axis=0)
male_words_freq = [(word, male_sum_words[idx]) for word, idx in list(tfidf.vocabulary_.items())]
male_words_freq = sorted(male_words_freq, key = lambda x: x[1], reverse=True)
for idx in range(10):
    print('female:', words_freq[idx], '\tmale:', male_words_freq[idx])

female: ('в', 259.46661631964685) 	male: ('и', 840.6485365813268)
female: ('и', 257.87422254389753) 	male: ('в', 760.8955912877203)
female: ('не', 177.5003221922877) 	male: ('не', 488.8817432348205)
female: ('на', 143.57850832478803) 	male: ('на', 456.6732561431596)
female: ('с', 126.93429321471551) 	male: ('я', 400.8459389269238)
female: ('а', 126.50432571409367) 	male: ('с', 380.48595083832)
female: ('что', 125.99934734230546) 	male: ('что', 379.07922961491363)
female: ('я', 115.0657606609694) 	male: ('а', 333.9767010535833)
female: ('это', 109.82014633150047) 	male: ('он', 273.90197370528057)
female: ('у', 95.23245844435249) 	male: ('как', 267.46363631689337)


In [81]:
# convert labels from string to bool to int
is_female_list = [gender == 'жен' for gender in labels]
is_female = np.array(is_female_list)
int_labels = [int(boolean) for boolean in is_female]
is_female_array = np.array(int_labels)
print('is_female:', is_female[:10])
print('is_female_array:', is_female_array[:10])

is_female: [False False False False False False  True False False  True]
is_female_array: [0 0 0 0 0 0 1 0 0 1]


In [105]:
# if not, try this:
# get top x words
top_words = 10
# for pos/neg set
for vectorizer, vecs, to_print  in [(cv, count_vecs, 'count_vecs'), (tfidf, tfidf_vecs, 'tfidf_vecs')]:
    for s, gender in [(is_female, 'female'), (~is_female, 'male')]:    
        # sum counts
        s_sum = vecs[s].sum(axis=0)
        # sort arguments
        s_sorted = np.argsort(s_sum)
        # print top words
        print(to_print, gender, [vectorizer.get_feature_names()[x] for x in s_sorted[-top_words:]])

count_vecs female ['как', 'это', 'я', 'а', 'с', 'что', 'на', 'не', 'в', 'и']
count_vecs male ['он', 'как', 'а', 'я', 'с', 'что', 'на', 'не', 'в', 'и']
tfidf_vecs female ['у', 'это', 'я', 'что', 'а', 'с', 'на', 'не', 'и', 'в']
tfidf_vecs male ['как', 'он', 'а', 'что', 'с', 'я', 'на', 'не', 'в', 'и']


**SVC on Vectors**

In [107]:
print('tfidf', tfidf_vecs.shape)
print('cv', count_vecs.shape)
print('nmf',nmf_vecs.shape)
print('lda', lda_vecs.shape)

tfidf (9755, 853)
cv (9755, 853)
nmf (9755, 10)
lda (9755, 10)


In [108]:
from sklearn.model_selection import train_test_split
random_state = 42
test_size = 0.3
for v in [count_vecs, tfidf_vecs, nmf_vecs, lda_vecs]:
    X_train, X_test, y_train, y_test = train_test_split(v, labels, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    svc = LinearSVC()
    svc.fit(X_train, y_train)
    print(accuracy_score(y_test, svc.predict(X_test)))



0.7642637512811753
0.7840792620430475
0.743423300307482
0.7423983600956611


In [109]:
for v in [np.concatenate([count_vecs, lda_vecs], axis=1),
          np.concatenate([tfidf_vecs, nmf_vecs], axis=1),
          np.concatenate([count_vecs, tfidf_vecs, lda_vecs, nmf_vecs], axis=1),
          np.concatenate([count_vecs, tfidf_vecs], axis=1)]:
    X_train, X_test, y_train, y_test = train_test_split(v, labels, 
                                                        test_size=test_size, random_state=42)
    svc = LinearSVC()
    svc.fit(X_train, y_train)
    print(accuracy_score(y_test, svc.predict(X_test)))



0.7639221045439016
0.7840792620430475




0.7560642295866075
0.7584557567475231




**Attempting an LSTM model using RusVectores**

In [110]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [111]:
def doc_to_index(docs, vocab):
    # transform docs into series of indices
    docs_idxs = []
    for d in docs:
        w_idxs = []
        for w in d:
            if w in vocab:
                w_idxs.append(vocab[w])
            else:
                # unknown token = 1
                w_idxs.append(1)
        docs_idxs.append(w_idxs)
    return(docs_idxs)

def pad_sequence(seqs, seq_len=200):
    # function for adding padding to ensure all seq same length
    features = np.zeros((len(seqs), seq_len),dtype=int)
    for i, seq in enumerate(seqs):
        if len(seq) != 0:
            features[i, -len(seq):] = np.array(seq)[:seq_len]
    return features

class SentimentNet(nn.Module):
    # sentiment classifier with single LSTM layer + Fully-connected layer, sigmoid activation and dropout
    # adapted from https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/
    def __init__(self,
                 weight_matrix=None,
                 vocab_size=1000, 
                 output_size=1,  
                 hidden_dim=512,
                 embedding_dim=400, 
                 n_layers=2, 
                 dropout_prob=0.5):
        super(SentimentNet, self).__init__()
        # size of the output, in this case it's one input to one output
        self.output_size = output_size
        # number of layers (default 2) one LSTM layer, one fully-connected layer
        self.n_layers = n_layers
        # dimensions of our hidden state, what is passed from one time point to the next
        self.hidden_dim = hidden_dim
        # initialize the representation to pass to the LSTM
        self.embedding, embedding_dim = self.init_embedding(
            vocab_size, 
            embedding_dim, 
            weight_matrix)
        # LSTM layer, where the magic happens
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=dropout_prob, batch_first=True)
        # dropout, similar to regularization
        self.dropout = nn.Dropout(dropout_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        # sigmoid activiation
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        # forward pass of the network
        batch_size = x.size(0)
        # transform input
        embeds = self.embedding(x)
        # run input embedding + hidden state through model
        lstm_out, hidden = self.lstm(embeds, hidden)
        # reshape
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout certain pct of connections
        out = self.dropout(lstm_out)
        # fully connected layer
        out = self.fc(out)
        # activation function
        out = self.sigmoid(out)
        # reshape
        out = out.view(batch_size, -1)
        out = out[:,-1]
        # return the output and the hidden state
        return out, hidden
    
    def init_embedding(self, vocab_size, embedding_dim, weight_matrix):
        # initializes the embedding
        if weight_matrix is None:
            if vocab_size is None:
                raise ValueError('If no weight matrix, need a vocab size')
            # if embedding is a size, initialize trainable
            return(nn.Embedding(vocab_size, embedding_dim),
                   embedding_dim)
        else:
            # otherwise use matrix as pretrained
            weights = torch.FloatTensor(weight_matrix)
            return(nn.Embedding.from_pretrained(weights),
                  weights.shape[1])
    
    def init_hidden(self, batch_size):
        # initializes the hidden state
        hidden = (torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device),
                  torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device))
        return hidden
    
def train_model(model, train_loader, val_loader, model_params, training_params):
    # utility for running the training process
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), 
                                 lr=training_params['learning_rate'])
    epochs = training_params['epochs']
    batch_size = training_params['batch_size']
    # print options
    counter = 0
    print_every = 5
    clip = 5
    valid_loss_min = np.Inf
    model.train()
    for i in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            h = tuple([e.data for e in h])
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            if counter%print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                for inp, lab in val_loader:
                    val_h = tuple([each.data for each in val_h])
                    inp, lab = inp.to(device), lab.to(device)
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.squeeze(), lab.float())
                    val_losses.append(val_loss.item())

                model.train()
                print("Epoch: {}/{}...".format(i+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
                if np.mean(val_losses) <= valid_loss_min:
                    torch.save(model.state_dict(), './state_dict.pt')
                    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                    valid_loss_min = np.mean(val_losses)
    return(model)
    
def assess_accuracy(model, test_loader, model_params, training_params):
    # utility for assessing accuracy
    batch_size = training_params['batch_size']
    model.load_state_dict(torch.load('./state_dict.pt'))
    h = model.init_hidden(batch_size)
    num_correct = 0
    model.eval()
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs, h)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        # take the correct labels, check against preds
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        # sum the number of correct
        num_correct += np.sum(correct)
    # calc accuracy
    test_acc = num_correct/len(test_loader.dataset)
    print('LSTM accuracy:', test_acc)

In [112]:
from sklearn.model_selection import train_test_split
random_state = 42
test_size = 0.3

# separate excerpts (lists) of strings and is_female_array of booleans into train, test, split
X_train, X_val_test, y_train, y_val_test = train_test_split(excerpts, is_female_array, test_size=test_size, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=test_size)

# check sizes
print('X_train:', len(X_train), ', y_train:', len(y_train))
print('X_val:', len(X_val), ', y_val:', len(y_val))
print('X_test:', len(X_test), ', y_test:', len(y_test))
print(X_train[:2], y_train[:2])

X_train: 6828 , y_train: 6828
X_val: 2048 , y_val: 2048
X_test: 879 , y_test: 879
['Глядели и вздыхали, вспоминая былое.Так было и нынче.Яков через отворённую форточку приказывал:Сапоги чисто промывайте!Не тягайте грязь!', 'Мы спустились, и Юрка из предосторожности поднял лестницу.В силках билась птица с разноцветным хвостом.Мила тут же, в зарослях, ощипала её, и мы отправились обратно.У меня нет слов, чтобы описать тот ужас, который охватил нас, когда мы услышали на поляне голоса пиратов.Скрываясь в высокой траве, нам удалось добраться незамеченными до самой поляны.'] [0 0]


In [113]:
# tokenize samples
parsed_train = [simple_tokenizer(str(d)) for d in X_train]
parsed_val = [simple_tokenizer(str(d)) for d in X_val]
parsed_test = [simple_tokenizer(str(d)) for d in X_test]

# assert parsed lists are looking good
print('len parsed_train:', len(parsed_train))
print(parsed_train[0])

KeyboardInterrupt: 

In [None]:
with open('parsed_lists.txt', 'w') as t:
    t.write(parsed_train)
    t.write(parsed_val)
    t.write(prased_test)

In [None]:
# this formulation works if you have previously tokenized
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=0.01)
tfidf = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=0.01)

# **important** just fit on trained: prevents information from test in training 
cv_train = cv.fit_transform(parsed_train)
tfidf_train = tfidf.fit_transform(parsed_train)

# get out the vocab (same for tfidf)
vocab = cv.vocabulary_
print("Size of vocab:", len(vocab))

**sketchy area of setup**

In [None]:
# need to adapt vocab, leave space for padding
vocab = cv.vocabulary_
vocab = dict([(v, vocab[v]+2) for v in vocab])

# is this a word2vec thing? hopefully not
vocab['_UNK'] = 1
vocab['_PAD'] = 0

In [None]:
# what does 'vocab look like'? can you get items?

In [None]:
# update the glove vecs accordingly
# populating an array with the vector values of stuff
rus_vecs = np.zeros(shape=(len(vocab), 300))
for k, v in vocab.items():
    if k in model.vocab.keys():
        rus_vecs[v] = model.wv[word]
    else:
        rus_vecs[v] = np.zeros(300).reshape(300)
# NOT SURE IF THIS IS CORRECT YET

** ok less sketchy **

In [None]:
model_params = {'weight_matrix': rus_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'embedding_dim': 400,
               'dropout_prob': 0.2}
training_params = {'learning_rate': 0.005,
                  'epochs': 1,
                  'batch_size': 100}

# create padded datasets for train, val, test
parsed_train = doc_to_index(parsed_train, vocab)
padded_train = pad_sequence(parsed_train)

parsed_val = doc_to_index(parsed_val, vocab)
padded_val = pad_sequence(parsed_val)

parsed_test = doc_to_index(parsed_test, vocab)
padded_test = pad_sequence(parsed_test)

# construct datasets for loading by PyTorch
train_data = TensorDataset(torch.from_numpy(padded_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(padded_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(padded_test), torch.from_numpy(y_test))

# you'll need to re-create loaders for changes to batch size
batch_size = training_params['batch_size']

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

In [None]:
train_model(SentimentNet(**model_params), train_loader, val_loader, model_params, training_params)
assess_accuracy(SentimentNet, test_loader, model_params, training_params)