In [9]:
import regex as re

In [45]:
from collections import Counter, OrderedDict

In [47]:
import numpy as np
import math

In [27]:
from stop_words import get_stop_words
# Russian stop-word list from the stop_words package; it is used further down
# to drop function words from the reference-corpus tokens.
stop_words = get_stop_words('russian')

In [28]:
# Open both inputs for reading as text.
# The court file is decoded with the platform default encoding, while the
# reference corpus is explicitly decoded as cp1251.
court = open('court-V-N.csv', mode='r')
ref = open('Sc_part1.txt', mode='r', encoding='cp1251')

In [29]:
# Read both files completely into lists of lines and then release the handles.
# Nothing below touches `ref` / `court` again, so leaving them open only leaks
# file descriptors for the lifetime of the kernel. Re-running this cell without
# re-opening the files now fails loudly instead of silently producing empty
# lists from already-exhausted handles.
try:
    ref_lines = ref.readlines()
    court_lines = court.readlines()
finally:
    ref.close()
    court.close()

In [31]:
# Extract candidate tokens from the reference corpus: every run of uppercase
# Cyrillic letters in the range А-Я that is immediately followed by a period
# (the period stays part of the match and is stripped in a later step).
# The pattern is a raw string so that `\.` reaches the regex engine as written
# instead of relying on Python passing an invalid string escape through.
# NOTE(review): the range А-Я does not include Ё - confirm the corpus never
# uses it, otherwise such words are matched only from the letter after Ё.
ref_tokens_ = re.findall(r'[А-Я]+?\.', ' '.join(ref_lines))

In [34]:
# Normalise the raw reference matches (drop the surrounding periods, lowercase)
# and discard stop words.
# The stop words are hoisted into a frozenset once: testing membership against
# the original sequence for every token is a linear scan per token, which is
# needlessly slow for a corpus of this size. The resulting list is identical.
_stop_word_set = frozenset(stop_words)
ref_tokens = [
    word
    for word in (raw.strip('.').lower() for raw in ref_tokens_)
    if word not in _stop_word_set
]

In [49]:
# Reference-corpus size: number of tokens kept in ref_tokens.
len_ref = len(ref_tokens)

In [36]:
# Flatten the court file into one token list: every line is split on commas
# and all resulting fields are concatenated in file order.
court_tokens = [field for row in court_lines for field in row.split(',')]

In [37]:
# Clean each court token in place: trim surrounding whitespace (including the
# newline left on the last field of every line) and lowercase it.
for i, raw_token in enumerate(court_tokens):
    court_tokens[i] = raw_token.strip().lower()

In [50]:
# Court-corpus size: number of comma-separated fields collected in court_tokens
# (no stop-word filtering is applied to this corpus).
len_court = len(court_tokens)

In [39]:
# Per-token occurrence counts for each corpus; looking up a token that never
# occurred in a Counter yields 0 rather than raising.
court_freq = Counter(court_tokens)
ref_freq = Counter(ref_tokens)

In [51]:
len_court, len_ref

(5100, 871777)

In [40]:
court_freq.most_common(10)

[('суд', 1047),
 ('решение', 93),
 ('признать', 81),
 ('иск', 63),
 ('дело', 56),
 ('удовлетворить', 56),
 ('арест', 53),
 ('вынести', 39),
 ('ходатайство', 38),
 ('год', 36)]

In [41]:
ref_freq.most_common(10)

[('россия', 11058),
 ('компания', 7416),
 ('рынок', 5857),
 ('проект', 5756),
 ('акция', 4023),
 ('москва', 3878),
 ('суд', 3816),
 ('украина', 3790),
 ('развитие', 3261),
 ('российский', 2999)]

In [61]:
def llhood_word(word, corp_counter, ref_counter, len_corp, len_ref):
    """Log-likelihood score of ``word`` between a corpus and a reference corpus.

    With ``a`` and ``b`` the counts of the word in the corpus and in the
    reference, and ``E1``/``E2`` the counts expected if the word were spread
    proportionally to the corpus sizes, the score is
    ``2 * (a*ln(a/E1) + b*ln(b/E2))``.

    Args:
        word: the token to score.
        corp_counter: mapping token -> count for the studied corpus.
        ref_counter: mapping token -> count for the reference corpus.
        len_corp: total number of tokens in the studied corpus.
        len_ref: total number of tokens in the reference corpus.

    Returns:
        The score as a float. A count of zero contributes nothing to the sum
        (the ``0 * ln 0 = 0`` convention), so a word missing from one corpus
        still gets a finite score and a word missing from both gets ``0.0``.
        ``float('-inf')`` is returned only for inconsistent input on which the
        formula cannot be evaluated (e.g. a positive count paired with an
        expected count of zero, or a negative count).

    Raises:
        ZeroDivisionError: if ``len_corp + len_ref`` is zero.
    """
    freq_word_corp = corp_counter[word]
    freq_word_ref = ref_counter[word]
    e_corp = len_corp*(freq_word_corp+freq_word_ref)/(len_corp+len_ref)
    e_ref = len_ref*(freq_word_corp+freq_word_ref)/(len_corp+len_ref)

    def _term(observed, expected):
        # x*ln(x) tends to 0 as x -> 0, so an unobserved word adds nothing;
        # evaluating math.log(0) directly would raise ValueError instead.
        if observed == 0:
            return 0.0
        return observed*math.log(observed/expected)

    try:
        return 2*(_term(freq_word_corp, e_corp) + _term(freq_word_ref, e_ref))
    # In theory this should not happen, because only words present in both
    # corpora are scored; it is kept as a guard against inconsistent input.
    except (ValueError, ZeroDivisionError):
        return float('-inf')

In [130]:
def llhood(corp_counter, ref_counter, len_corp, len_ref):
    """Score every word that occurs in both corpora with ``llhood_word``.

    Words of ``corp_counter`` that are not keys of ``ref_counter`` are skipped.

    Returns:
        OrderedDict mapping word -> log-likelihood score, ordered from the
        highest score to the lowest.
    """
    shared_scores = {
        term: llhood_word(term, corp_counter, ref_counter, len_corp, len_ref)
        for term in corp_counter
        if term in ref_counter
    }
    ranked = sorted(shared_scores.items(), key=lambda item: item[1], reverse=True)
    return OrderedDict(ranked)

In [131]:
def weirdness_word(word, corp_counter, ref_counter, len_corp, len_ref):
    """Ratio of the word's relative frequency in the corpus to that in the reference.

    Args:
        word: the token to score.
        corp_counter: mapping token -> count for the studied corpus.
        ref_counter: mapping token -> count for the reference corpus.
        len_corp: total number of tokens in the studied corpus.
        len_ref: total number of tokens in the reference corpus.

    Returns:
        ``(count_corp / len_corp) / (count_ref / len_ref)``, or ``0`` whenever
        any of the divisions hits a zero denominator (an empty corpus or a
        word that never occurs in the reference).
    """
    # NOTE(review): a word absent from the reference gets 0 and therefore
    # sorts below every shared word - confirm this ranking is intended.
    try:
        share_in_corp = corp_counter[word] / len_corp
        share_in_ref = ref_counter[word] / len_ref
        return share_in_corp / share_in_ref
    except ZeroDivisionError:
        return 0

In [132]:
def weirdness(corp_counter, ref_counter, len_corp, len_ref):
    """Score every word of the corpus with ``weirdness_word``.

    Unlike ``llhood``, no word is skipped: every key of ``corp_counter`` is
    scored, whether or not it occurs in ``ref_counter``.

    Returns:
        OrderedDict mapping word -> weirdness score, ordered from the highest
        score to the lowest.
    """
    scores = {
        term: weirdness_word(term, corp_counter, ref_counter, len_corp, len_ref)
        for term in corp_counter
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return OrderedDict(ranked)

In [133]:
# Log-likelihood ranking of the court vocabulary against the reference corpus.
llhood_dict = llhood(
    corp_counter=court_freq,
    ref_counter=ref_freq,
    len_corp=len_court,
    len_ref=len_ref,
)

In [134]:
# Weirdness ranking of the court vocabulary against the reference corpus.
weirdness_dict = weirdness(
    corp_counter=court_freq,
    ref_counter=ref_freq,
    len_corp=len_court,
    len_ref=len_ref,
)

In [141]:
def get_likelihood_stats(num, which, llhood_dict, weirdness_dict, corp_freq, ref_freq):
    """Print a small report for the words at one end of the log-likelihood ranking.

    For every selected word the report shows its count in the studied corpus,
    its count in the reference corpus, its log-likelihood score and 1-based
    position in ``llhood_dict``, and its weirdness score and 1-based position
    in ``weirdness_dict``. Each word's record is followed by a blank line.

    Args:
        num: how many words to report; a non-positive value prints nothing.
        which: ``'first'`` reports the first ``num`` entries of
            ``llhood_dict``; any other value reports the last ``num`` entries.
        llhood_dict: ordered mapping word -> log-likelihood score.
        weirdness_dict: ordered mapping word -> weirdness score.
        corp_freq: mapping word -> count in the studied corpus.
        ref_freq: mapping word -> count in the reference corpus.

    Raises:
        KeyError: if a selected word is missing from one of the mappings.
    """
    ranked_pairs = list(llhood_dict.items())
    if num <= 0:
        # Guard needed because ranked_pairs[-0:] is the whole list, which
        # would print every word when zero words were requested.
        needed = []
    elif which == 'first':
        needed = ranked_pairs[:num]
    else:
        needed = ranked_pairs[-num:]
    if not needed:
        return

    # Build both position tables once instead of rebuilding a key list and
    # scanning it linearly for every printed word.
    llhood_rank = {word: pos for pos, word in enumerate(llhood_dict, start=1)}
    weirdness_rank = {word: pos for pos, word in enumerate(weirdness_dict, start=1)}

    for word, score in needed:
        print('Word: '+str(word))
        print('CountSpec: '+str(corp_freq[word]))
        print('CountRef: '+str(ref_freq[word]))
        print('LogLikelihood: '+str(score))
        print('L Range: '+str(llhood_rank[word]))
        print('Weirdness: '+str(weirdness_dict[word]))
        print('W Range: '+str(weirdness_rank[word])+'\n')

In [142]:
get_likelihood_stats(10, 'first', llhood_dict, weirdness_dict, court_freq, ref_freq)

Word: суд
CountSpec: 1047
CountRef: 3816
LogLikelihood: 5756.389170072415
L Range: 1
Weirdness: 46.90007599580713
W Range: 72

Word: признать
CountSpec: 81
CountRef: 380
LogLikelihood: 409.7054991385812
L Range: 2
Weirdness: 36.436499999999995
W Range: 87

Word: удовлетворить
CountSpec: 56
CountRef: 76
LogLikelihood: 397.415910333734
L Range: 3
Weirdness: 125.95333333333333
W Range: 19

Word: решение
CountSpec: 93
CountRef: 1272
LogLikelihood: 293.03638944164834
L Range: 4
Weirdness: 12.497727987421383
W Range: 254

Word: арест
CountSpec: 53
CountRef: 462
LogLikelihood: 209.6061316254691
L Range: 5
Weirdness: 19.609617604617604
W Range: 172

Word: иск
CountSpec: 63
CountRef: 880
LogLikelihood: 196.16263479900107
L Range: 6
Weirdness: 12.237511363636363
W Range: 256

Word: ходатайство
CountSpec: 38
CountRef: 242
LogLikelihood: 171.6250284329522
L Range: 7
Weirdness: 26.84129476584022
W Range: 128

Word: вынести
CountSpec: 39
CountRef: 370
LogLikelihood: 148.3233529721466
L Range: 8
Weir

In [143]:
get_likelihood_stats(10, 'last', llhood_dict, weirdness_dict, court_freq, ref_freq)

Word: раскрывать
CountSpec: 1
CountRef: 167
LogLikelihood: 0.0005355086840733922
L Range: 1250
Weirdness: 1.023572854291417
W Range: 1008

Word: появиться
CountSpec: 2
CountRef: 347
LogLikelihood: 0.0004427250139963576
L Range: 1251
Weirdness: 0.9852257444764648
W Range: 1016

Word: очередь
CountSpec: 2
CountRef: 337
LogLikelihood: 0.0004079042864255228
L Range: 1252
Weirdness: 1.0144609297725025
W Range: 1012

Word: лишить
CountSpec: 1
CountRef: 174
LogLikelihood: 0.00031553632261371867
L Range: 1253
Weirdness: 0.9823946360153256
W Range: 1017

Word: письмо
CountSpec: 1
CountRef: 168
LogLikelihood: 0.000296825323790717
L Range: 1254
Weirdness: 1.0174801587301587
W Range: 1009

Word: покушение
CountSpec: 1
CountRef: 168
LogLikelihood: 0.000296825323790717
L Range: 1255
Weirdness: 1.0174801587301587
W Range: 1010

Word: банковский
CountSpec: 1
CountRef: 168
LogLikelihood: 0.000296825323790717
L Range: 1256
Weirdness: 1.0174801587301587
W Range: 1011

Word: отдел
CountSpec: 1
CountRef: 1

In [150]:
# Hand-picked set of terms that the top log-likelihood words are compared with.
by_hand = {
    'суд', 'ходатайство', 'срок', 'арест', 'адвокат',
    'иск', 'истец', 'уголовный', 'дело', 'подсудимый',
}

In [151]:
# The ten highest-ranked words of the log-likelihood ordering, as a set.
ranked_words = list(llhood_dict)
top_llhood = set(ranked_words[:10])

In [153]:
# precision: share of the top log-likelihood words that are also in the
# hand-picked set
len(by_hand & top_llhood) / len(top_llhood)

0.4