# Author Verification Using Common N-Gram Profiles of Text Documents #
### Создатели: Алла Горбунова, Лика Джиоева, Евгения Егорова, Елизавета Клыкова и Яна Шишкина 
#### C опорой на (Magdalena Jankowska, Evangelos Milios & Vlado Kešelj, 2014) ####

В папке с тетрадкой должны быть следующие папки:
* answers - содержит файлы truth.txt и truth-test.txt
* texts_train - содержит папки с данными для обучения
* texts_test - содержит папки с тестовыми данными

In [1]:
import os
import re
import pandas as pd
from pprint import pprint
from tqdm.auto import tqdm
from collections import Counter, OrderedDict

import nltk
from nltk import WordNetLemmatizer, ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

import string
from string import punctuation

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_texts(directory):  # получает тексты и информацию о них
    dirs = [directory + d + '/' for d in os.listdir(directory)]
    texts_info = []
    for d in dirs:
        files = [d + f for f in os.listdir(d)]
        lang = re.search('[A-Z]+', d).group().lower()
        author = re.search('[0-9]+', d.lower()).group()
        for f in files:
            info = []
            info.append(author)
            info.append(lang)
            with open(f, encoding='utf-8-sig') as f1:
                text = f1.read()
                info.append(text)
                info.append(len(text.split()))
            texts_info.append(info)
    return texts_info

In [3]:
def make_df(texts):  # создает датафрейм
    columns = ['author', 'lang', 'text', 'length']
    df = pd.DataFrame(texts, columns=columns)
    df.index.name = 'id'
    return df

In [4]:
def preprocess(text):  # возвращает список слов без пунктуации
    punct = list(re.sub("[-']", '', string.punctuation))
    other_punct = ['``', '\"\"', '...', '--', '–', '—',
                   '«', '»', '“', '”', '’', '***', '…', '•']
    all_punct = punct + other_punct
    rx = '[' + re.escape(''.join(all_punct)) + ']'
    word_list = word_tokenize((re.sub(rx, ' ', text.lower())))
    return word_list

In [5]:
def add_freq(ngrams):  # возвращает частотности n-грамм текста
    for key, num in ngrams.items():
        freq = num / len(ngrams)
        ngrams[key] = freq
    return ngrams

In [6]:
def make_word_ngrams(words):  # возвращает 1-2-3-граммы слов
    word_unigrams = dict(Counter(words).most_common())
    freq_unigrams = add_freq(word_unigrams)

    bigrams = [' '.join(b) for b in list(nltk.bigrams(words))]
    freq_bigrams = add_freq(dict(Counter(bigrams).most_common()))

    trigrams = [' '.join(b) for b in list(nltk.trigrams(words))]
    freq_trigrams = add_freq(dict(Counter(trigrams).most_common()))

    return freq_unigrams, freq_bigrams, freq_trigrams

In [7]:
def make_char_ngrams(text, n):  # возвращает 3-7-граммы символов
    my_ngrams = [''.join(ng) for ng in list(ngrams(text, n))]
    freq_ngrams = add_freq(dict(Counter(my_ngrams).most_common()))
    return freq_ngrams

#### Впомогательные функции
* Обрезка по минимальной длине профиля
* Заполнение нулями отсутствующих слов

In [8]:
def cut_profile(d, all_docs):  # обрезает все профили по самому короткому
    lens = [len(ngrams) for ngrams in all_docs]
    num = min(lens)
    d_cut = dict(Counter(d).most_common(num))
    return d_cut

In [9]:
def add_zeros(d, keys):  # заменяет пустые частотности на 0.0
    dif_keys = keys - set(d.keys())
    for d_key in dif_keys:
        d[d_key] = 0.0
    return d

In [10]:
def difference(d1_, d2_):  # вычисляет difference
    d1 = d1_.copy()
    d2 = d2_.copy()
    keys = set(d1.keys()) | set(d2.keys())
    add_zeros(d1, keys)
    add_zeros(d2, keys)
    summ = 0
    for key in keys:
        summ = summ + (2 * (d1[key] - d2[key]) / (d1[key] + d2[key])) ** 2
    return summ

In [11]:
def find_max_dif(known_docs_cut):  # вычисляет Dmax
    max_list = []
    for d_target in known_docs_cut:
        difs = [difference(d_target, d) for d in known_docs_cut]
        max_list.append(max(difs))
    return max_list

In [12]:
def find_mean_ratio(known_docs, unknown_doc):  # средний коэф. различия
    known_docs_cut = [cut_profile(d, known_docs) for d in known_docs]
    max_list = find_max_dif(known_docs_cut)
    dif_unknown = [difference(unknown_doc, d) for d in known_docs_cut]

    ratios = []
    for i in range(len(max_list)):
        r = dif_unknown[i]/max_list[i]
        ratios.append(r)

    return np.mean(ratios)

In [13]:
def classify(known_profile, unknown_profile, theta):  # выдает ответ
    mean_r = find_mean_ratio(known_profile, unknown_profile)
    if mean_r <= theta:
        return 'Y', mean_r
    else:
        return 'N', mean_r

### Основная функция

In [14]:
def verify_authorship(batch, theta, n_char=None, n_word=1):
    known_texts = batch.iloc[:-1].text
    if len(known_texts) == 1:
        text1 = known_texts.iloc[0][:len(known_texts.iloc[0])//2]
        text2 = known_texts.iloc[0][len(known_texts.iloc[0])//2:]
        known_texts = [text1, text2]
    known_words = [preprocess(text) for text in known_texts]

    d_char_ngrams = []
    if n_char is not None:
        for text in known_texts:
            d_char_ngrams.append(make_char_ngrams(text, n_char))

    d_unigrams, d_bigrams, d_trigrams = [], [], []
    for i, d in enumerate(known_words):
        unigrams, bigrams, trigrams = make_word_ngrams(d)
        d_unigrams.append(unigrams)
        d_bigrams.append(bigrams)
        d_trigrams.append(trigrams)
    d_word_ngrams = [d_unigrams, d_bigrams, d_trigrams]

    unknown_text = batch.iloc[-1].text
    unknown_words = preprocess(unknown_text)
    u_word_ngrams = make_word_ngrams(unknown_words)
    if n_char is not None:
        u_char_ngrams = make_char_ngrams(unknown_text, n_char)

    if n_char is not None:
        answer, mean_r = classify(d_char_ngrams, u_char_ngrams, theta)
    else:
        answer, mean_r = classify(
            d_word_ngrams[n_word - 1], u_word_ngrams[n_word - 1], theta)
    return answer, mean_r

#### Функции для обучения и предсказания
\+ make_batches для выделения групп

In [15]:
def make_batches(lang_df):  # создает список папок на одном языке
    batches = []

    authors = sorted(list(set(lang_df['author'].values)))
    for author in authors:
        batch_df = lang_df[lang_df['author'] == author]
        batches.append(batch_df)
    return batches

In [16]:
def train(df, answers, n_chars=None, n_words=1):  # подбор теты
    batches = make_batches(df)
    X = []
    for batch in batches:
        answer, mean_r = verify_authorship(batch, 1, n_chars, n_words)
        X.append(mean_r)
    X = np.array(X)
    X = X[:, np.newaxis]

    y = np.array(answers)
    y[y == 'Y'] = 1
    y[y == 'N'] = 0
    y = y.astype(int)
    clf = LogisticRegression(random_state=0).fit(X, y)
    theta = (clf.intercept_/-clf.coef_)[0][0]
    return theta

In [17]:
def predict(df, theta, n_chars=None, n_words=1):  # выдает ответы
    batches = make_batches(df)
    predictions = []
    for batch in batches:
        answer, mean_r = verify_authorship(batch, theta, n_chars, n_words)
        predictions.append(answer)
    return predictions

#### Перебор типов n-грамм и подсчет accuracy

In [18]:
texts = get_texts('./texts_train/')
df = make_df(texts)

test_texts = get_texts('./texts_test/')
test_df = make_df(test_texts)

In [19]:
def get_answers(truth_file):  # получает ответы из файла
    with open(truth_file, 'r', encoding='utf-8-sig') as f:
        truth_text = f.read()

    answers = []
    answers_temp = truth_text.split('\n')
    for ans in answers_temp:
        if ans != '':
            answers.append(ans.split()[1])
    return answers

In [20]:
def evaluate(lang_df, test_lang_df, answers, answers_test, eval_type):
    # eval_type = 'char' or 'word'
    if eval_type == 'char':
        r_start = 3
        r_finish = 11
        arg = 'n_chars'
    if eval_type == 'word':
        r_start = 1
        r_finish = 4
        arg = 'n_words'
    preds = []
    for i in range(r_start, r_finish):
        print(f'Train on {eval_type} {i}-grams...')
        kwarg = {arg : i}
        theta = train(lang_df, answers, **kwarg)
        predictions = predict(lang_df, theta, **kwarg)
        accuracy = accuracy_score(answers, predictions)
        print(f'Accuracy on train:\t{(accuracy*100):.2f}%')
        print(f'Theta:\t\t\t{theta:.3f}')

        predictions = predict(test_lang_df, theta, **kwarg)
        accuracy = accuracy_score(answers_test, predictions)
        print(f'Accuracy on test:\t{(accuracy*100):.2f}%')
        preds.append(predictions) # это тестовые
    return preds

In [21]:
lang_list = sorted(list(set(df['lang'].values)))
print(lang_list)

['en', 'gr', 'sp']


In [22]:
answers = get_answers('./answers/truth.txt')
answers_test = get_answers('./answers/truth-test.txt')

In [23]:
def lang_testing(lang, lang_short, slicing, df, test_df, eval_type):
    print(f'Evaluating on {lang} data...\n')
    lang_df = df[df['lang'] == lang_short]
    lang_test_df = test_df[test_df['lang'] == lang_short]
    preds = evaluate(lang_df,
                     lang_test_df,
                     answers[slicing[0]],
                     answers_test[slicing[1]],
                     eval_type)
    return preds

In [24]:
def ensemble(lang, all_preds, answers_test):
    # тестирует ансабль классификаторов
    print(f'Calculating ensembles on {lang} data...\n')
    preds = np.array(all_preds).T
    best_preds = [Counter(pred).most_common(1)[0][0] for pred in preds]
    accuracy = accuracy_score(answers_test, best_preds)
    print(f'Accuracy on test:\t{(accuracy*100):.2f}%')

In [25]:
# English
all_preds_char = lang_testing(
    'English', 'en', [slice(None, 10), slice(None, 30)], df, test_df, 'char')
all_preds_word = lang_testing(
    'English', 'en', [slice(None, 10), slice(None, 30)], df, test_df, 'word')
ensemble('English char', all_preds_char, answers_test[:30])
ensemble('English word', all_preds_word, answers_test[:30])

Evaluating on English data...

Train on char 3-grams...
Accuracy on train:	80.00%
Theta:			1.103
Accuracy on test:	60.00%
Train on char 4-grams...
Accuracy on train:	80.00%
Theta:			1.077
Accuracy on test:	60.00%
Train on char 5-grams...
Accuracy on train:	70.00%
Theta:			1.068
Accuracy on test:	63.33%
Train on char 6-grams...
Accuracy on train:	70.00%
Theta:			1.063
Accuracy on test:	63.33%
Train on char 7-grams...
Accuracy on train:	70.00%
Theta:			1.058
Accuracy on test:	63.33%
Train on char 8-grams...
Accuracy on train:	70.00%
Theta:			1.054
Accuracy on test:	60.00%
Train on char 9-grams...
Accuracy on train:	70.00%
Theta:			1.048
Accuracy on test:	60.00%
Train on char 10-grams...
Accuracy on train:	70.00%
Theta:			1.044
Accuracy on test:	60.00%
Evaluating on English data...

Train on word 1-grams...
Accuracy on train:	70.00%
Theta:			1.105
Accuracy on test:	60.00%
Train on word 2-grams...
Accuracy on train:	70.00%
Theta:			1.045
Accuracy on test:	56.67%
Train on word 3-grams...
Ac

In [26]:
# Greek
all_preds_char = lang_testing(
    'Greek', 'gr', [slice(10, 30), slice(30, 60)], df, test_df, 'char')
all_preds_word = lang_testing(
    'Greek', 'gr', [slice(10, 30), slice(30, 60)], df, test_df, 'word')
ensemble('Greek char', all_preds_char, answers_test[30:60])
ensemble('Greek word', all_preds_word, answers_test[30:60])

Evaluating on Greek data...

Train on char 3-grams...
Accuracy on train:	75.00%
Theta:			1.144
Accuracy on test:	63.33%
Train on char 4-grams...
Accuracy on train:	85.00%
Theta:			1.185
Accuracy on test:	66.67%
Train on char 5-grams...
Accuracy on train:	85.00%
Theta:			1.202
Accuracy on test:	63.33%
Train on char 6-grams...
Accuracy on train:	80.00%
Theta:			1.210
Accuracy on test:	60.00%
Train on char 7-grams...
Accuracy on train:	80.00%
Theta:			1.214
Accuracy on test:	60.00%
Train on char 8-grams...
Accuracy on train:	80.00%
Theta:			1.216
Accuracy on test:	60.00%
Train on char 9-grams...
Accuracy on train:	75.00%
Theta:			1.216
Accuracy on test:	60.00%
Train on char 10-grams...
Accuracy on train:	75.00%
Theta:			1.215
Accuracy on test:	56.67%
Evaluating on Greek data...

Train on word 1-grams...
Accuracy on train:	85.00%
Theta:			1.224
Accuracy on test:	63.33%
Train on word 2-grams...
Accuracy on train:	75.00%
Theta:			1.217
Accuracy on test:	53.33%
Train on word 3-grams...
Accura

In [27]:
# Spanish
all_preds_char = lang_testing(
    'Spanish', 'sp', [slice(30, None), slice(60, None)], df, test_df, 'char')
all_preds_word = lang_testing(
    'Spanish', 'sp', [slice(30, None), slice(60, None)], df, test_df, 'word')
ensemble('Spanish char', all_preds_char, answers_test[60:])
ensemble('Spanish word', all_preds_word, answers_test[60:])

Evaluating on Spanish data...

Train on char 3-grams...
Accuracy on train:	60.00%
Theta:			2.205
Accuracy on test:	52.00%
Train on char 4-grams...
Accuracy on train:	60.00%
Theta:			2.260
Accuracy on test:	52.00%
Train on char 5-grams...
Accuracy on train:	60.00%
Theta:			2.322
Accuracy on test:	52.00%
Train on char 6-grams...
Accuracy on train:	60.00%
Theta:			2.362
Accuracy on test:	52.00%
Train on char 7-grams...
Accuracy on train:	60.00%
Theta:			2.362
Accuracy on test:	52.00%
Train on char 8-grams...
Accuracy on train:	60.00%
Theta:			2.375
Accuracy on test:	52.00%
Train on char 9-grams...
Accuracy on train:	60.00%
Theta:			2.396
Accuracy on test:	52.00%
Train on char 10-grams...
Accuracy on train:	60.00%
Theta:			2.408
Accuracy on test:	52.00%
Evaluating on Spanish data...

Train on word 1-grams...
Accuracy on train:	60.00%
Theta:			2.380
Accuracy on test:	52.00%
Train on word 2-grams...
Accuracy on train:	60.00%
Theta:			2.543
Accuracy on test:	52.00%
Train on word 3-grams...
Ac