In [1]:
import os
import re
from random import randint

import gensim
from gensim.models import word2vec



In [2]:
# Data directory used by this run; swap in the commented line to use the
# 'data/00_dec' folder instead.
# data_folder = os.path.join('data', '00_dec')
data_folder = os.path.join('data', '17_dec')

In [3]:
# Directory the trained word2vec models are saved into.
word2vec_folder = 'w2v_files'
# Create it up front so the later model.save(...) calls do not fail on a
# fresh checkout where the folder does not exist yet.
os.makedirs(word2vec_folder, exist_ok=True)

In [4]:
def _data_file(file_name):
    """Return the path of ``file_name`` inside ``data_folder``."""
    return os.path.join(data_folder, file_name)


# Paths of the log / message files that live in the data folder.
# raw_log_file = _data_file('log_dratuti.txt')
clean_log_file = _data_file('clean_log_dratuti.txt')
raw_msgs_file = _data_file('raw_msgs.txt')
ltrs_only_msgs_file = _data_file('ltrs_only_msgs.txt')
no_stopwords_msgs_file = _data_file('no_stopwords_msgs.txt')
normalized_no_stopwords_msgs_file = _data_file('normalized_no_stopwords_msgs.txt')

In [5]:
def write_msgs(msgs, filename):
    """Write tokenized messages to ``filename``, one message per line.

    Args:
        msgs: iterable of messages, each an iterable of word strings.
        filename: path of the file to (over)write.

    The words of a message are joined with single spaces. The file is always
    written as UTF-8 so that non-ASCII (e.g. Cyrillic) text does not depend on
    the platform's default encoding.
    """
    with open(filename, 'w', encoding='utf-8') as lf:
        lf.write('\n'.join(' '.join(msg) for msg in msgs))


def read_msgs(filename):
    """Read messages written by ``write_msgs``.

    Args:
        filename: path of a UTF-8 text file with one message per line.

    Returns:
        A list with one entry per line; each entry is the list of
        whitespace-separated words on that line (empty for a blank line).
    """
    with open(filename, encoding='utf-8') as lf:
        # Iterating the file object yields the same lines as readlines()
        # without building the intermediate list.
        msgs = [line.split() for line in lf]
    return msgs

# `Word2vec`

In [6]:
def print_similar_words(words, model):
    """Print the most similar words (with similarity scores) for each word.

    Args:
        words: iterable of query words.
        model: object exposing ``similar_by_word(word)`` that returns
            ``(word, similarity)`` pairs.

    For every query a ``--- word ---`` header is printed, followed by one
    ``neighbour (score)`` line per result (score rounded to 4 digits) and a
    blank line. A query for which the model raises ``KeyError`` (e.g. a word
    that is not in the vocabulary) is reported and skipped, so one missing
    word no longer aborts the remaining queries.
    """
    for w in words:
        print('--- ' + w + ' ---')
        try:
            neighbours = model.similar_by_word(w)
        except KeyError:
            print('(not in vocabulary)')
        else:
            for sim_w in neighbours:
                print('{} ({})'.format(sim_w[0], round(sim_w[1], 4)))
        print()

# Probe word lists; each list is passed to print_similar_words for every
# trained model so the nearest neighbours can be compared between models.
test_words_1 = ['пидор', 'хуй', 'тупка', 'wtf', 'няшка']
# Going by the variable name, these are personal names.
test_words_names = ['валентин', 'юра', 'тимофей']
# Going by the variable name, these are meant to be near-synonyms.
test_words_synon = ['пидор', 'пидр', 'гей']

# Letters-only messages (stopwords kept)

In [7]:
# Load the letters-only corpus: one list of words per line of the file.
msgs = read_msgs(ltrs_only_msgs_file)

In [8]:
# %%time

# Hyper-parameters handed to Word2Vec for the letters-only corpus.
num_features = 300    # length of every word vector
min_word_count = 5    # minimum word count
num_workers = 4       # parallel training threads
context = 10          # size of the context window
downsampling = 0.5    # down-sampling setting for frequent words
# NOTE(review): gensim documents the useful range of `sample` as (0, 1e-5);
# 0.5 looks unusually high -- confirm this value is intended.

model = word2vec.Word2Vec(
    msgs,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Persist the trained model, then show the (vocabulary size, vector size) shape.
model.save(os.path.join(word2vec_folder, 'model_w2v'))
print(model.syn0.shape)

(8504, 300)


In [9]:
# Nearest neighbours of every probe word according to the letters-only model.
for probe_words in (test_words_1, test_words_names, test_words_synon):
    print_similar_words(probe_words, model)

--- пидор ---
виктор (0.9897)
заебал (0.9784)
вербов (0.9741)
ниоч (0.9727)
котейка (0.9722)
борис (0.9717)
солнышко (0.9714)
расскажи (0.9682)
инфа (0.9643)
баг (0.9626)

--- хуй ---
вопрос (0.9435)
сука (0.9264)
денег (0.9182)
уж (0.9176)
ней (0.9143)
пишешь (0.9141)
грамматику (0.9124)
нормально (0.9108)
мой (0.91)
бонусы (0.9084)

--- тупка ---
аня (0.9368)
няшка (0.9348)
тимофей (0.8865)
споки (0.8811)
поговорите (0.88)
юра (0.8773)
солнышко (0.8772)
сучка (0.8746)
валентин (0.8746)
иисусе (0.8741)

--- wtf ---
mp (0.993)
root (0.9925)
lets (0.9917)
button (0.9916)
post (0.9915)
module (0.9915)
law (0.9913)
media (0.9912)
option (0.991)
things (0.9909)

--- няшка ---
тимофей (0.963)
дерзкий (0.9606)
босс (0.9527)
грубый (0.9505)
ах (0.949)
находишь (0.9445)
слишком (0.9392)
споки (0.939)
борис (0.9369)
любишь (0.9366)

--- валентин ---
бооом (0.9826)
эй (0.9747)
транс (0.9745)
прекращай (0.9737)
манда (0.9691)
крашеная (0.9686)
велик (0.9677)
трактор (0.963)
оригинальный (0.9624)


# No stopwords

In [10]:
# Load the no-stopwords corpus: one list of words per line of the file.
msgs_nosw = read_msgs(no_stopwords_msgs_file)

In [11]:
# %%time

# Hyper-parameters handed to Word2Vec for the no-stopwords corpus.
num_features = 300    # length of every word vector
min_word_count = 5    # minimum word count
num_workers = 4       # parallel training threads
context = 10          # size of the context window
downsampling = 0.5    # down-sampling setting for frequent words
# NOTE(review): gensim documents the useful range of `sample` as (0, 1e-5);
# 0.5 looks unusually high -- confirm this value is intended.

model_nosw = word2vec.Word2Vec(
    msgs_nosw,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Persist the trained model, then show the (vocabulary size, vector size) shape.
model_nosw.save(os.path.join(word2vec_folder, 'model_w2v_no_stopwords'))
print(model_nosw.syn0.shape)

(8353, 300)


In [12]:
# Nearest neighbours of every probe word according to the no-stopwords model.
for probe_words in (test_words_1, test_words_names, test_words_synon):
    print_similar_words(probe_words, model_nosw)

--- пидор ---
чот (0.9998)
хотят (0.9998)
брать (0.9997)
еду (0.9997)
та (0.9996)
хватит (0.9996)
всем (0.9996)
попробуй (0.9995)
лал (0.9995)
бесит (0.9995)

--- хуй ---
знает (0.9998)
нужен (0.9997)
сразу (0.9997)
порно (0.9997)
лишь (0.9996)
ебать (0.9996)
анус (0.9996)
музыка (0.9996)
ладно (0.9996)
какие (0.9996)

--- тупка ---
няшка (0.9993)
мид (0.9988)
шеф (0.9987)
катя (0.9981)
помоги (0.998)
магаз (0.9976)
голубцов (0.9976)
игрок (0.9975)
секси (0.9974)
поехавший (0.9971)

--- wtf ---
scala (0.9997)
go (0.9996)
errors (0.9996)
already (0.9995)
person (0.9995)
author (0.9995)
insert (0.9995)
html (0.9994)
black (0.9994)
null (0.9994)

--- няшка ---
мид (0.9994)
тупка (0.9993)
шеф (0.9991)
катя (0.9987)
помоги (0.9986)
кушать (0.9982)
geekbrains (0.998)
секси (0.998)
голубцов (0.998)
игрок (0.9979)

--- валентин ---
иван (0.9997)
бар (0.9995)
пятницу (0.9995)
субботу (0.9995)
думает (0.9994)
тобой (0.9994)
обладает (0.9994)
твоей (0.9994)
дай (0.9993)
мной (0.9993)

--- юра ---

## Normalized with `pymorphy2`, no stopwords

In [13]:
import pymorphy2
# Module-level analyzer instance; to_norm uses it to look up normal forms.
morph = pymorphy2.MorphAnalyzer()

# Manual overrides applied to the normal form returned by the analyzer.
_NORM_FIXES = {
    'пидора': 'пидор',
    'пидра': 'пидр',
    'вегана': 'веган',
    'нихуй': 'нихуя',
    'хуйль': 'хуйли',
    'хула': 'хуле',
    'штол': 'штоле',
    'гея': 'гей',
}


def to_norm(word, morph_analyzer=None):
    """Return the normalized form of ``word``.

    Args:
        word: a single token.
        morph_analyzer: object whose ``parse(word)`` returns a sequence of
            parses exposing ``normal_form``. When omitted (or ``None``) the
            module-level ``morph`` analyzer is used.

    Returns:
        ``word`` itself for 'чай' (which is never sent to the analyzer);
        otherwise the ``normal_form`` of the analyzer's first parse, replaced
        by its manual override from ``_NORM_FIXES`` when one exists.
    """
    # NOTE(review): 'чай' bypasses the analyzer entirely -- presumably its
    # first parse is not the wanted one; confirm.
    if word == 'чай':
        return word

    # Resolve the default lazily so the analyzer that was passed in is really
    # the one used (the parameter used to be ignored in favour of the global).
    if morph_analyzer is None:
        morph_analyzer = morph

    norm = morph_analyzer.parse(word)[0].normal_form
    return _NORM_FIXES.get(norm, norm)

def normalize_msg(msg):
    """Lazily yield the normalized form of every word in ``msg``.

    Args:
        msg: either a string, which is split on whitespace, or an iterable
            of word strings.

    Yields:
        ``to_norm(word)`` for each word, in order.
    """
    # isinstance also accepts str subclasses, which `type(msg) == str` would
    # have iterated character by character instead of splitting into words.
    words = msg.split() if isinstance(msg, str) else msg

    for w in words:
        yield to_norm(w)

In [14]:
# Load the normalized no-stopwords corpus: one list of words per line of the file.
msgs_nosw_norm = read_msgs(normalized_no_stopwords_msgs_file)

In [15]:
# %%time

# Hyper-parameters handed to Word2Vec for the normalized no-stopwords corpus.
num_features = 300    # length of every word vector
min_word_count = 5    # minimum word count
num_workers = 4       # parallel training threads
context = 10          # size of the context window
downsampling = 0.5    # down-sampling setting for frequent words
# NOTE(review): gensim documents the useful range of `sample` as (0, 1e-5);
# 0.5 looks unusually high -- confirm this value is intended.

model_nosw_norm = word2vec.Word2Vec(
    msgs_nosw_norm,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Persist the trained model, then show the (vocabulary size, vector size) shape.
model_nosw_norm.save(os.path.join(word2vec_folder, 'model_w2v_nosw_norm'))
print(model_nosw_norm.syn0.shape)

(7141, 300)


In [16]:
# The probe words are normalized first, then looked up in the model trained
# on the normalized corpus.
for probe_words in (test_words_1, test_words_names, test_words_synon):
    print_similar_words([to_norm(w) for w in probe_words], model_nosw_norm)

--- пидор ---
чатик (0.9997)
чот (0.9994)
телеграм (0.9994)
блядь (0.9992)
сиг (0.9992)
орать (0.9992)
хуйн (0.9992)
котейка (0.9992)
пофига (0.9992)
ща (0.9991)

--- хуй ---
нравиться (0.999)
интересно (0.9989)
жить (0.9989)
никто (0.9988)
видеть (0.9988)
твой (0.9988)
норма (0.9986)
лола (0.9986)
обсуждать (0.9985)
написать (0.9984)

--- тупка ---
плз (0.9993)
кончиться (0.9991)
магаз (0.9991)
кот (0.9991)
дот (0.9991)
кушать (0.9991)
игрок (0.999)
бухать (0.999)
конф (0.999)
покер (0.999)

--- wtf ---
oh (0.9997)
scala (0.9997)
already (0.9996)
private (0.9996)
data (0.9996)
go (0.9995)
times (0.9995)
idea (0.9995)
high (0.9995)
bn (0.9995)

--- няшка ---
голубец (0.9997)
шеф (0.9997)
бомж (0.9996)
опустить (0.9995)
отпиздить (0.9995)
мид (0.9995)
косарь (0.9994)
форсить (0.9994)
бухать (0.9993)
дот (0.9993)

--- валентин ---
гриб (0.9996)
пивко (0.9995)
продаваться (0.9995)
борис (0.9995)
бить (0.9995)
йо (0.9995)
мороженое (0.9995)
утром (0.9995)
гайза (0.9995)
гулять (0.9994)

--

# Messages with words from the last `w2v` model

In [17]:
# Number of messages in each of the three corpora, one count per line.
for corpus in (msgs, msgs_nosw, msgs_nosw_norm):
    print(len(corpus))

79307
75338
75338


In [18]:
# All words in the vocabulary of the normalized no-stopwords model.
ws = [word for word in model_nosw_norm.vocab.keys()]
print(len(ws))

7141


In [19]:
# For every distinct normalized message, pair the integer percentage of its
# words that are in the model vocabulary with the message text.
# A set makes each membership test O(1); `w in ws` on the list rescans the
# whole vocabulary for every word of every message.
ws_lookup = set(ws)

msgs_with_popular = list({
    (sum(w in ws_lookup for w in msg) * 100 // len(msg), ' '.join(msg))
    for msg in msgs_nosw_norm
    if msg  # an empty message would otherwise divide by zero
})

In [20]:
# Sort in place, descending: tuples compare by percentage first, then by text.
msgs_with_popular.sort(reverse = True)

In [21]:
# msgs_with_popular[:20]

# msgs_with_popular[-20:]

-------

In [22]:
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(analyzer = "word",
#                              tokenizer = None,
#                              preprocessor = None,
#                              stop_words = sw,
#                              max_features = 1000)

# %%time
# data_features = vectorizer.fit_transform(text_ltrs)

# feats = data_features.toarray()

# sum(sum(feats))

# vectorizer.get_feature_names()[-5:]