In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import LogisticRegression

In [2]:
from score import load_dataset_fast, score, save_preds, score_preds, SCORED_PARTS

In [3]:
part2xy = load_dataset_fast('FILIMDB', parts=SCORED_PARTS+('train_unlabeled',))

Loading train set 
neg 7480
pos 7520
Loading dev set 
neg 5020
pos 4980
Loading test set 
unlabeled 25000
Loading dev-b set 
pos 994
neg 1006
Loading test-b set 
unlabeled 8599
Loading train_unlabeled set 
unlabeled 50000


In [4]:
train_ids, train_texts, train_labels = part2xy['train']
_, train_unlabeled_texts, _ = part2xy['train_unlabeled']

In [5]:
all_texts = list(text for _, text, _ in part2xy.values()) # это подается в pretrain!
total_texts = sum(len(text) for text in all_texts)
total_texts

110599

In [6]:
part2xy.keys() # test, test-b, train_unlabeled - нет ответов

dict_keys(['train', 'dev', 'test', 'dev-b', 'test-b', 'train_unlabeled'])

In [7]:
train_texts[2]

'After watching the first 20mn of Blanche(sorry I couldn\'t take more of it), I have now confirmed she does not. <br /><br />Basically, this "movie" is an insult to the real french actors participating in this farcical piece of junk. It starts from a concept successfully used in French comedies ("Deux heures moins le quart avant Jesus Christ", "La Folie des Grandeurs",...): a historical movie with anachronic tone / dialogues. This can give brilliant results if supported by brilliant actors and a "finesse" of direction avoiding the dreaded "heavy comedy" stigma.<br /><br />Unfortunately, the horsey-faced Lou Doillon ruins everything and Blanche, instead of a comedy, just turns into an horror movie. Horror to cinephiles who want to be puzzled and shocked watching fine actors such as Decaune, Zem or Rochefort struggling in the middle of this gaudy burlesque kitchy-prissy farce.'

In [8]:
train_labels[2]

'neg'

### 1-2. Предобработка и токенизация

In [9]:
def make_texts_better(texts):
    for i in range(0, len(texts)):
        texts[i] = texts[i].strip().lower()
       # texts[i] = ' '.join(texts[i])                    # строчка для отладки функции
        texts[i] = ' '.join(texts[i].split('<br />'))
        texts[i] = ' , '.join(texts[i].split(','))
        texts[i] = ' . '.join(texts[i].split('.'))
        texts[i] = ' " '.join(texts[i].split('"'))
        texts[i] = ' : '.join(texts[i].split(':'))
        texts[i] = ' ; '.join(texts[i].split(';'))
        texts[i] = ' ! '.join(texts[i].split('!'))
        texts[i] = ' ( '.join(texts[i].split('('))
        texts[i] = ' ) '.join(texts[i].split(')'))
        texts[i] = ' / '.join(texts[i].split('/'))
        texts[i] = ' - '.join(texts[i].split('-'))

In [10]:
%%time
for texts in all_texts:
    make_texts_better(texts)
train_texts[0]

Wall time: 3.9 s


'if the myth regarding broken mirrors would be accurate ,  everybody involved in this production would now face approximately 170 years of bad luck ,  because there are a lot of mirrors falling to little pieces here .  if only the script was as shattering as the glass ,  then  " the broken "  would have been a brilliant film .  now it\'s sadly just an overlong ,  derivative and dull movie with only just a handful of remarkable ideas and memorable sequences .  sean ellis made a very stylish and elegantly photographed movie ,  but the story is lackluster and the total absence of logic and explanation is really frustrating .  i got into a discussion with a friend regarding the basic concept and  " meaning "  of the film .  he thinks ellis found inspiration in an old legend claiming that spotting your doppelganger is a foreboding of how you\'re going to die .  interesting theory ,  but i\'m not familiar with this legend and couldn\'t find anything on the internet about this ,  neither .  p

In [11]:
def make_tokenized(texts):
    for i in range(0, len(texts)):
        texts[i] = texts[i].split(' ')
        texts[i] = list(filter(None, texts[i]))

In [12]:
%%time
for texts in all_texts:
    make_tokenized(texts)

Wall time: 5.82 s


In [13]:
print(train_texts[110])

['this', 'movie', 'is', 'a', 'terrible', 'waste', 'of', 'time', '.', 'although', 'it', 'is', 'only', 'an', 'hour', 'and', 'a', 'half', 'long', 'it', 'feels', 'somewhere', 'close', 'to', '4', '.', 'i', 'have', 'never', 'seen', 'a', 'movie', 'move', 'so', 'slowly', 'and', 'so', 'without', 'a', 'purpose', '.', 'this', 'is', 'also', 'a', '"', 'horror', '"', 'film', 'that', 'takes', 'place', 'a', 'lot', 'of', 'the', 'time', 'during', 'daylight', '.', 'my', 'friend', 'and', 'i', 'laughed', 'an', 'insane', 'amount', 'of', 'times', 'when', 'we', 'were', 'probably', 'supposed', 'to', 'be', 'scared', '.', 'the', 'only', 'thing', 'we', 'want', 'to', 'know', 'is', 'why', 'such', 'a', 'terrible', 'movie', 'was', 'released', 'in', 'so', 'many', 'countries', '.', 'it', 'cannot', 'be', 'that', 'high', 'in', 'demand', '.', 'the', 'supermodel', 'nicole', 'petty', 'should', 'stick', 'to', 'modeling', 'because', 'although', 'she', 'is', 'beautiful', 'she', 'lost', 'her', 'accent', 'so', 'many', 'times', '

##### Почему важно, чтобы тестовая и обучающая выборка обрабатывались одинаково?
Чтобы наиболее корректно определить класс, нужно чтобы данные были приведены к какому-то общему виду, и, полагаясь на этот общий вид, делается обработка.
<br/>
Например, мы приводим все буквы к нижнему регистру, чтобы выделить одинаковые слова, написанные в разном виде. Так как смысл у них один и тот же, то будет эффективнее иметь один общий вид для них, чтобы не терять смысла и не считать одинаковые слова за разные. Так же делается и с тестовой выборкой, аналогично обучающей выделяются нужные слова в нужном виде, чтобы классификация была как можно точнее.

### 3. Построить словарь

In [14]:
%%time
ngrams = {}

for texts in all_texts:
    for text in texts:
        # униграммы
        for ugram in text:
            if not (ugram in ngrams):
                ngrams[ugram] = 1
            else:
                ngrams[ugram] += 1
len(ngrams)

Wall time: 17.6 s


196603

In [15]:
%%time
# биграммы
for texts in all_texts:
    for text in texts:
        for j in range(1, len(text)):
            tupl = (text[j - 1], text[j])
            if not (tupl in ngrams):
                ngrams[tupl] = 1
            else:
                ngrams[tupl] += 1
len(ngrams)

Wall time: 49.1 s


3697763

In [16]:
%%time
# триграммы
for texts in all_texts:
    for text in texts:
        for j in range(2, len(text)):
            tupl = (text[j - 2], text[j - 1], text[j])
            if not (tupl in ngrams):
                ngrams[tupl] = 1
            else:
                ngrams[tupl] += 1
len(ngrams)

Wall time: 1min 3s


15508781

### 4. Отсечка до около миллиона элементов в словаре

In [17]:
%%time
words = list(ngrams.keys())  # удалить слишком редко встречающиеся слова
for key in words:
     if ngrams[key] <= 4:
        del ngrams[key]
len(ngrams)

Wall time: 10.5 s


1111730

In [18]:
%%time
sorted_grams = sorted(ngrams, key=lambda x: int(ngrams[x]), reverse=True)
# хочется удалить самые частовстречающиеся токены, не несущие смысла (не намекающие на тональность отзыва)

sorted_grams[:27]

Wall time: 1.32 s


['the',
 '.',
 ',',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 '-',
 '"',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 ('of', 'the'),
 ')',
 '(',
 ('.', 'the'),
 'on']

In [19]:
%%time
for item in sorted_grams[:27]:
    del ngrams[item]

Wall time: 1 ms


In [20]:
len(ngrams)

1111703

In [21]:
%%time
probs = np.empty(len(ngrams), dtype=float)
n = 0
for key in ngrams.keys():
    probs[n] = ngrams[key]
    ngrams[key] = n
    n += 1
n

Wall time: 1.39 s


1111703

In [22]:
probs

array([6.8121e+04, 3.2400e+02, 7.4900e+02, ..., 5.0000e+00, 5.0000e+00,
       5.0000e+00])

### 5. Нормализация для получения вероятностей

In [23]:
probs = probs ** 0.75
sum = np.sum(probs)
probs = probs / sum

In [24]:
np.sum(probs)

0.9999999999999997

### 6. Замена документов на набор n-грамм

In [25]:
num_texts = []
for texts in all_texts:
    num_texts.append(len(texts))
num_texts

[15000, 10000, 25000, 2000, 8599, 50000]

In [26]:
for i in range(1, len(all_texts)):
    num_texts[i] += num_texts[i - 1]
num_texts # номер в массиве, до которого идет каждый датасет

[15000, 25000, 50000, 52000, 60599, 110599]

In [27]:
total_texts = num_texts[len(all_texts) - 1]
total_texts

110599

In [28]:
one_all_texts = []
for texts in all_texts:
    one_all_texts.extend(texts)
    print(len(one_all_texts))

15000
25000
50000
52000
60599
110599


In [29]:
docs = np.empty(total_texts, dtype=np.ndarray)

In [30]:
%%time
for i in range(0, total_texts):
    docgrams = []
    text = one_all_texts[i]
    for word in text:
        if word in ngrams:
            docgrams.append(ngrams[word])
    for j in range(1, len(text)):
        tupl = (text[j - 1], text[j])
        if tupl in ngrams:
            docgrams.append(ngrams[tupl])
    for j in range(2, len(text)):
        tupl = (text[j - 2], text[j - 1], text[j])
        if tuple in ngrams:
            docgrams.append(ngrams[tupl])
    docs[i] = np.array(docgrams, dtype=int)

Wall time: 1min 38s


In [31]:
docs[1]

array([  263,   264,    18,   265,     6,   266,   267,   234,   268,
         269,   237,   270,   271,   272,   232,   273,   274,   275,
          90,    92,   276,   171,    53,    18,   277,   278,   277,
         279,   197,   280,   281,    76,   282,   283,    11,   284,
         271,   232,   285,   286,   271, 61931, 61932, 61933, 61934,
       61935, 61936, 61937, 61938, 61939, 61940, 61941, 61942, 61943,
       61944, 61945, 61946, 61947, 61948, 61949, 61950, 61951, 61952,
       61953, 61954, 61955, 61956, 61957, 61958, 61959, 61960, 61961,
       61962, 61963, 61964, 61965, 61937, 61966, 61967, 61968, 61969,
       61970, 61971, 61972, 61973, 61974, 61975, 61976, 61977, 61978,
       61979, 61980, 61981, 61982, 61983, 61984, 61985, 61986, 61987,
       61988, 61989, 61990])

### 7. Перемешать данные 

In [32]:
def shuffle_data(docs):
    total_texts = docs.size
    idxs = np.arange(total_texts)
    np.random.shuffle(idxs)
    
    for i in range(0, total_texts):
        np.random.shuffle(docs[i])
        
    docs_ids = np.empty(total_texts, dtype=np.ndarray)
    sh_docs = docs[idxs] # возьмем документы в порядке, заданном индексами в idxs
    for i in range(0, total_texts): # idxs[i] - настоящий индекс текущего документа
        docs_ids[i] = np.ones(sh_docs[i].size, dtype=int) * idxs[i]

    sh_docs = np.concatenate(sh_docs) # список индексов нграмм всех документов в одномерном массиве
    docs_ids = np.concatenate(docs_ids) # список индексов документов
    
    return sh_docs, docs_ids

In [33]:
%%time
sh_docs, docs_ids = shuffle_data(docs)

Wall time: 13.6 s


In [34]:
docs[1], docs_ids.size, sh_docs.size

(array([   76,   266,   281,   237, 61965, 61990,   286, 61969, 61985,
          271, 61968, 61961, 61953,   232, 61936,   270, 61950, 61974,
        61940, 61937, 61989,   232,    18, 61932,   234, 61939, 61938,
        61980, 61947, 61935, 61981, 61934, 61978, 61956,   274,   265,
           92,   197,    18, 61975, 61970, 61941,   284, 61954,   268,
        61967, 61960, 61943, 61976,   282, 61983, 61949,   264,   269,
        61986,   279, 61946, 61973, 61971, 61948, 61982, 61942, 61966,
          267,   263,   278,   171,   272,   283, 61944,   277, 61933,
          271, 61951, 61931, 61984, 61977,    53, 61964, 61962, 61979,
            6, 61972,   276,   277, 61963, 61987, 61955, 61952,   273,
        61937,   275, 61958,    11,   280, 61945,   271, 61988, 61959,
           90,   285, 61957]), 38628924, 38628924)

### 8-10. Разбить данные на батчи, сэмплировать негативные примеры, генератор батчей

In [35]:
def batch_generator(grams, probs, docs_ids, neg_samples, nb = 2, batch_size = 100):
    cur_batch = 0
    grams_sz = grams.size
    neg_sz = grams_sz * nb
    
    sh_idxs = np.arange(grams_sz)
    np.random.shuffle(sh_idxs)
    
    for i in range(0, grams_sz, batch_size):
        sz = min(batch_size, grams_sz - i)
        idxs = sh_idxs[i : i + sz]
        yield grams[idxs], docs_ids[idxs], np.ones(sz)   # заменить ли на 1 вместо массива единиц, и 0 соотв-но
        for k in range(nb):
            yield neg_samples[cur_batch : cur_batch + sz], docs_ids[idxs], np.zeros(sz) # nb раз создает негативные батчи
            cur_batch += sz
        

### 11. Класс Doc2Vec

In [36]:
def sigmoid(x):
    sgm = 1. / (1 + np.exp(-x))
    n = (sgm <= 0.0).astype(int) * 0.00000001
    sgm += n
    n = (sgm >= 1.0).astype(int) * 0.00000001
    sgm -= n
    return sgm

In [37]:
class Doc2Vec:
    def __init__(self, vocab_size, docs_size, embed_size=500):
        self.word_embs = np.random.uniform(-0.001, 0.001, (vocab_size, embed_size))
        self.doc_embs = np.random.uniform(-0.001, 0.001, (docs_size, embed_size))
    
    def train(self, words_idxs, docs_idxs, labels, eta=0.2):
                    
        words_batch = self.word_embs[words_idxs]
        docs_batch = self.doc_embs[docs_idxs]
        
        scal_pr = np.sum(words_batch * docs_batch, axis=1)
        sigm = sigmoid(scal_pr)
        loss = np.sum(- labels * np.log(sigm) - (1 - labels) * np.log(1 - sigm))
        
        koefs = np.diagflat(- labels + sigm)
        grad_w = np.dot(koefs, docs_batch)
        grad_d = np.dot(koefs, words_batch)
        
        words_batch -= eta * grad_w
        docs_batch -= eta * grad_d
        
        self.word_embs[words_idxs] = words_batch
        self.doc_embs[docs_idxs] = docs_batch
        
        return loss
        '''
        # код если бы обрабатывали по 1 паре (нграмма, документ)
       # eta = 0.2
        
        batch_size = words_idxs.size
        
        losses = 0.0
        
        for i in range(batch_size):
            wid = words_idxs[i]
            did = docs_idxs[i]
            
            scalar = np.dot(self.word_embs[wid], self.doc_embs[did])
            sigm = sigmoid(scalar)
            loss = - labels[i] * np.log(sigm) - (1 - labels[i]) * np.log(1 - sigm)
            losses += loss
            
            koef = - labels[i] + sigm
            grad_w = koef * self.doc_embs[did]
            grad_d = koef * self.word_embs[wid]
            
            self.word_embs[wid] -= eta * grad_w
            self.doc_embs[did] -= eta * grad_d
        
        return losses
    '''
    def classify(self, words_idxs, docs_idxs):
        words_batch = self.word_embs[words_idxs]
        docs_batch = self.doc_embs[docs_idxs]
        
        scal_pr = np.sum(words_batch * docs_batch, axis=1)
        sigm = sigmoid(scal_pr)
        labels = (sigm >= 0.5).astype(int)
             
        return labels    

In [38]:
%%time
doc2vec = Doc2Vec(probs.size, total_texts)

Wall time: 16.5 s


In [39]:
#train_ids, train_texts, train_labels = part2xy['dev-b']
cnt = 0
t_labels = []
num_labeled = []
for item in part2xy:
    if part2xy[item][2] != None:
        t_labels.extend(part2xy[item][2])
        num_labeled.append(cnt)
        print(len(t_labels), num_labeled)
    cnt += 1


15000 [0]
25000 [0, 1]
27000 [0, 1, 3]


In [40]:
train_labels = np.array(train_labels)
t_labels = np.array(t_labels)

train_labels, t_labels

(array(['neg', 'pos', 'neg', ..., 'neg', 'pos', 'pos'], dtype='<U3'),
 array(['neg', 'pos', 'neg', ..., 'neg', 'neg', 'neg'], dtype='<U3'))

In [41]:
train_labels = (train_labels == 'pos').astype(int)
t_labels = (t_labels == 'pos').astype(int)

train_labels, t_labels

(array([0, 1, 0, ..., 0, 1, 1]), array([0, 1, 0, ..., 0, 0, 0]))

In [None]:
%%time
max_epoch = 15
eta = 0.2
d = 1.
for epoch in range(max_epoch):
    print('epoch', epoch + 1)
    nb = 2
    eta = eta / 1.5
    print('eta =', eta)
    
    sh_docs, docs_ids = shuffle_data(docs)
    neg_samples = np.random.choice(len(probs), size=sh_docs.size*nb, p=probs)
    
    batch_gen = batch_generator(sh_docs, probs, docs_ids, neg_samples, nb)
    for batch in batch_gen:
        doc2vec.train(batch[0], batch[1], batch[2], eta)
    
    classifier = LogisticRegression(solver='lbfgs', random_state=0)
    t_docs = doc2vec.doc_embs[0:num_texts[1]]
    t_docs = np.vstack([t_docs, doc2vec.doc_embs[num_texts[2] : num_texts[3]]])
    classifier.fit(doc2vec.doc_embs[0:num_texts[0]], train_labels)

   # preds = classifier.predict(t_docs)
    print('train:', classifier.score(t_docs[0:num_texts[0]], t_labels[0:num_texts[0]]))
    print('dev:', classifier.score(t_docs[num_texts[0]:num_texts[1]], t_labels[num_texts[0]:num_texts[1]]))
    print('dev-b:', classifier.score(t_docs[num_texts[1]:], t_labels[num_texts[1]:]))

    print('\n Score:', classifier.score(t_docs, t_labels), '\n')

epoch 1
eta = 0.13333333333333333




train: 0.8258666666666666
dev: 0.8142
dev-b: 0.67

 Score: 0.81 

epoch 2
eta = 0.08888888888888889
train: 0.8766
dev: 0.8601
dev-b: 0.733

 Score: 0.8598518518518519 

epoch 3
eta = 0.05925925925925926
train: 0.8972666666666667
dev: 0.8793
dev-b: 0.7555

 Score: 0.8801111111111111 

epoch 4
eta = 0.03950617283950617
train: 0.9042666666666667
dev: 0.8915
dev-b: 0.7605

 Score: 0.8888888888888888 

epoch 5
eta = 0.02633744855967078
train: 0.9099333333333334
dev: 0.8928
dev-b: 0.771

 Score: 0.8932962962962963 

epoch 6
eta = 0.017558299039780522
train: 0.9109333333333334
dev: 0.8941
dev-b: 0.7735

 Score: 0.8945185185185185 

epoch 7
eta = 0.011705532693187014
train: 0.9120666666666667
dev: 0.8947
dev-b: 0.774

 Score: 0.8954074074074074 

epoch 8
eta = 0.007803688462124676
train: 0.9116666666666666
dev: 0.8951
dev-b: 0.774

 Score: 0.8953333333333333 

epoch 9
eta = 0.005202458974749784
train: 0.9123333333333333
dev: 0.8968
dev-b: 0.7755

 Score: 0.8964444444444445 

epoch 10
eta = 0.0

In [43]:
df = pd.DataFrame(doc2vec.doc_embs) # сохраняю для исследования

In [62]:
df.to_csv('doc_embs.csv', sep=',', header=False, index=None)

In [67]:
df = pd.DataFrame(doc2vec.word_embs)

In [71]:
%%time
df.to_csv('word_embs.csv', sep=',', header=False, index=None)

Wall time: 44min 13s


In [66]:
# на случай если хотим проверить на текущих эмбеддингах как обучается
classifier = LogisticRegression(solver='lbfgs', random_state=0)
t_docs = doc2vec.doc_embs[0:num_texts[1]]
t_docs = np.vstack([t_docs, doc2vec.doc_embs[num_texts[2] : num_texts[3]]])
classifier.fit(doc2vec.doc_embs[0:num_texts[0]], train_labels)

# preds = classifier.predict(t_docs)
print('score:', classifier.score(t_docs, t_labels), '\n')

score: 0.8846666666666667 





In [73]:
from fire import Fire
from time import time

In [78]:
# код из evaluate, запускаю только когда результат лучше существующего

del part2xy["train_unlabeled"] # после первого запуска комментирую эту строку
PREDS_FNAME = 'preds.tsv'
allpreds = []
for part, (ids, x, y) in part2xy.items():
    print('\nClassifying %s set with %d examples ...' % (part, len(x)))
    st = time()
    texts_sz = [0, 15000, 10000, 25000, 2000, 8599]
    idx = texts_sz.index(len(x))
    for i in range(1, 6):
        texts_sz[i] += texts_sz[i - 1]

    preds = classifier.predict(doc2vec.doc_embs[texts_sz[idx - 1] : texts_sz[idx]])
    preds = preds.astype(object)
    for i in range(preds.size):
        if preds[i] == 1:
            preds[i] = 'pos'
        else:
            preds[i] = 'neg'
        
    print('%s set classified in %.2fs' % (part, time() - st))
    allpreds.extend(zip(ids, preds))

    if y is None:
        print('no labels for %s set' % part)
    else:
        score(preds, y)

save_preds(allpreds, preds_fname=PREDS_FNAME)
print('\nChecking saved predictions ...')
score_preds(preds_fname=PREDS_FNAME, data_dir='FILIMDB')


Classifying train set with 15000 examples ...
train set classified in 0.03s
Number of correct/incorrect predictions: 13603/15000

Classifying dev set with 10000 examples ...
dev set classified in 0.05s
Number of correct/incorrect predictions: 8878/10000

Classifying test set with 25000 examples ...
test set classified in 0.05s
no labels for test set

Classifying dev-b set with 2000 examples ...
dev-b set classified in 0.00s
Number of correct/incorrect predictions: 1569/2000

Classifying test-b set with 8599 examples ...
test-b set classified in 0.02s
no labels for test-b set
Predictions saved to preds.tsv

Checking saved predictions ...
Loading train set 
neg 7480
pos 7520
Loading dev set 
neg 5020
pos 4980
Loading test set 
unlabeled 25000
Loading dev-b set 
pos 994
neg 1006
Loading test-b set 
unlabeled 8599
Number of correct/incorrect predictions: 13603/15000
train set accuracy: 90.69
Number of correct/incorrect predictions: 8878/10000
dev set accuracy: 88.78
no labels for test set

{'train': 90.68666666666667, 'dev': 88.78, 'dev-b': 78.45}