In [1]:
import multiprocessing
import pickle
import re
from functools import lru_cache
from time import time
import numpy as np
import pandas as pd
import pymorphy2
from gensim.models import KeyedVectors
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from gensim.test.utils import datapath
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.spatial.distance import cosine
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

np.random.seed(42)

# Preprocessing

In [2]:
w2v_vectors = KeyedVectors.load_word2vec_format("185/model.bin", binary=True)

w2v_vectors.vocab = dict(zip(list(map(lambda x: x.split('_')[0], w2v_vectors.vocab.keys())), w2v_vectors.vocab.values()))

In [3]:
# morph analyzer for text lemmatization
morph = pymorphy2.MorphAnalyzer()


# function for performing parallel computing on cpu
def parallelization(func, massive, jobs=None, tq=True):
    num_cores = multiprocessing.cpu_count() if jobs is None else jobs
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results


def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]


def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def get_words(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 3, re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)


def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in get_words(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words


def preprocess(texts, dump=True):
    preprocessed_texts = parallelization(text2canonicals, texts)
    vectorizer = TfidfVectorizer()
    
    texts = list(map(lambda x: ' '.join(x), preprocessed_texts))

    if dump:
        vectorizer = vectorizer.fit(texts)
        with open('pickles/vectorizer.pkl', 'wb') as f:
            pickle.dump(vectorizer, f)
    else:
        with open('pickles/vectorizer.pkl', 'rb') as f:
            vectorizer = pickle.load(f)
            
    tfifd_vectorized = vectorizer.transform(texts).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))

    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors

    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings

def preprocess_single_text(text):
    # embedding vectors weighted with tfidf
    preprocessed_text = text2canonicals(text)
    length = len(preprocessed_text) if len(preprocessed_text) > 0 else 1
    
    preprocessed_text = ' '.join(preprocessed_text)
    vectorizer = pickle.load(open('pickles/vectorizer.pkl', 'rb'))
    tfifd_vectorized = vectorizer.transform([preprocessed_text]).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))
    
    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors
    weighted_embeddings /= length
    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings

def get_text_vectors(text):
    cnt = 0
    matrix = np.zeros((len(text), 300))
    for i,word in enumerate(text):
        try:
            vector = w2v_vectors[word]
        except KeyError:
            cnt += 1
            vector = np.zeros((300,))
        matrix[i] = vector
    #print('cached {} exeptions'.format(cnt))
    return matrix

In [4]:
comments = (pd
            .read_pickle("data/new_data.pkl")
            .rename(columns={"message": "content"})
            .assign(local_datetime = lambda x: x["local_datetime"].str[:10])
            .astype({"local_datetime": "datetime64"})
            )
comments.head()

Unnamed: 0,sp_id,local_datetime,content,lemma
5,2,2018-04-06,giving you money back. thanks,"[giving, you, money, back, thanks]"
6,2,2018-04-06,giving you money back. thanks,"[giving, you, money, back, thanks]"
7,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]"
8,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]"
9,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]"


# Embeddings and clusters

In [5]:
vectors = preprocess(comments["content"].values, dump=True)
n_components = 50
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(vectors)
with open('pickles/kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
comments = comments.assign(cluster = kmeans.labels_)
comments.head(100)

100%|██████████| 33183/33183 [00:11<00:00, 2880.29it/s]


Unnamed: 0,sp_id,local_datetime,content,lemma,cluster
5,2,2018-04-06,giving you money back. thanks,"[giving, you, money, back, thanks]",43
6,2,2018-04-06,giving you money back. thanks,"[giving, you, money, back, thanks]",43
7,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]",43
8,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]",43
9,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]",43
10,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]",43
11,2,2018-04-06,for Ivan's birthday,"[for, ivan, birthday]",43
12,2,2018-04-10,test,[test],36
13,2,2018-04-12,test,[test],36
14,2,2018-04-12,test,[test],36


# Evaluation

## Random examples from clusters

In [6]:
for clusters in range(n_components):
    print('Cluster #', clusters)
    try:
        print(comments.query('cluster == @clusters').sample(3, random_state=0)[['content']])
    except ValueError:
        print(comments.query('cluster == @clusters').sample(3, random_state=0, replace=True)[['content']])
    print()

Cluster # 0
                        content
655202                     кофе
555216  Саня балабанов за пивас
435849               на тортики

Cluster # 1
                                   content
677699                              За Мед
491018  Миляуше Хамитовне Яна Хузягалеева 
630716                              Приват

Cluster # 2
                    content
663395         На  Сентября
556472  На  сентября + торт
659166         За Сентябрь 

Cluster # 3
       content
58541    такси
461611   такси
417221   Такси

Cluster # 4
                 content
543757           Спасибо
587576  Спасибо вз вз вз
406128        Спасибо вз

Cluster # 5
                           content
620107        Алексей Валентинович
407327  Цветная Людмила Алексеевна
417183        Алексей Анатольевич 

Cluster # 6
              content
407824           Долг
413810  Возврат Долга
663075  Отдаю долг ❤️

Cluster # 7
                   content
25174              перевод
683420             перевод
465483  Денежные

## Cluster remapping

In [7]:
clusters_names = {
    0: "Обед",
    1: "Спасибо",
    2: "Привет",
    3: "Перевод",
    4: "Покупка",
    5: "Мало",
    6: "Центральный",
    7: "Деньги",
    8: "Люблю",
    9: "Квартира",
    10: "Шаурма",
    11: "иммя",
    12: "иммя",
    13: "test",
    14: "иммя",
    15: "Такси",
    16: "Тест",
    17: "Проверка связи",
    18: "Штраф",
    19: "Стол",
    20: "иммя",
    21: "Костюм",
    22: "Долг",
    23: "Подарок",
    24: "Возврат",
    25: "иммя",
    26: "Себе",
    27: "Оплата",
    28: "Проба",
    29: "иммя",
    30: "Подарки",
    31: "иммя",
    32: "Лови",
    33: "Перевёл",
    34: "С Днём рождения",
    35: "Мама",
    36: "На счёт",
    37: "иммя",
    38: "Проезд",
    39: "Пиво",
    40: "Пришли?",
    41: "иммя",
    42: "Мойка",
    43: "Кредит",
    44: "иммя",
    45: "Проверка",
    46: "иммя",
    47: "Напиши если пришли",
    48: "иммя",
    49: "иммя"
}

In [8]:
# map_clusters = lambda x: 0 if clusters_names[x] == "иммя" else x
# comments["cluster_fixed"] = comments["cluster"].map(map_clusters)

# comments["cluster_name"] = comments["cluster"].map(clusters_names)

In [9]:
# comments.sample(100)

In [10]:
comments.to_pickle("pickles/comments.pkl")

# Inference

## Get cluster and random comments

In [None]:
comments.query

In [5]:

import multiprocessing
import pickle
import re
from functools import lru_cache
from time import time
import numpy as np
import pandas as pd
import pymorphy2
from gensim.models import KeyedVectors
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from gensim.test.utils import datapath
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.spatial.distance import cosine
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

np.random.seed(42)

# Start service with these models in memory
w2v_vectors = KeyedVectors.load_word2vec_format("185/model.bin", binary=True)
w2v_vectors.vocab = dict(zip(list(map(lambda x: x.split('_')[0], w2v_vectors.vocab.keys())), w2v_vectors.vocab.values()))
kmeans = pickle.load(open('pickles/kmeans.pkl', 'rb'))
comments = pickle.load(open('pickles/comments.pkl', 'rb'))
# morph analyzer for text lemmatization
morph = pymorphy2.MorphAnalyzer()


# function for performing parallel computing on cpu
def parallelization(func, massive, jobs=None, tq=True):
    num_cores = multiprocessing.cpu_count() if jobs is None else jobs
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results


def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]


def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def get_words(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 3, re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)


def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in get_words(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words


def preprocess(texts, dump=True):
    preprocessed_texts = parallelization(text2canonicals, texts)
    vectorizer = TfidfVectorizer()
    
    texts = list(map(lambda x: ' '.join(x), preprocessed_texts))

    if dump:
        vectorizer = vectorizer.fit(texts)
        with open('pickles/vectorizer.pkl', 'wb') as f:
            pickle.dump(vectorizer, f)
    else:
        with open('pickles/vectorizer.pkl', 'rb') as f:
            vectorizer = pickle.load(f)
            
    tfifd_vectorized = vectorizer.transform(texts).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))

    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors

    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings

def preprocess_single_text(text):
    # embedding vectors weighted with tfidf
    preprocessed_text = text2canonicals(text)
    length = len(preprocessed_text) if len(preprocessed_text) > 0 else 1
    
    preprocessed_text = ' '.join(preprocessed_text)
    vectorizer = pickle.load(open('pickles/vectorizer.pkl', 'rb'))
    tfifd_vectorized = vectorizer.transform([preprocessed_text]).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))
    
    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors
    weighted_embeddings /= length
    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings

def get_text_vectors(text):
    cnt = 0
    matrix = np.zeros((len(text), 300))
    for i,word in enumerate(text):
        try:
            vector = w2v_vectors[word]
        except KeyError:
            cnt += 1
            vector = np.zeros((300,))
        matrix[i] = vector
    #print('cached {} exeptions'.format(cnt))
    return matrix


def infer_cluster(text):
    vector = preprocess_single_text(text)
    cluster = kmeans.predict(vector)
    try:
        random_3 = comments.query('cluster == @cluster').sample(3, random_state=0)['content'].to_list()
    except ValueError:
        random_3 = comments.query('cluster == @cluster').sample(3, random_state=0, replace=True).to_list()
    return cluster[0], random_3


def get_top_clusters_overall(comments=comments):
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x/len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies


def get_top_clusters_month(comments=comments):
    comments = comments[(comments["local_datetime"].dt.month) == 7 & (comments["local_datetime"].dt.year == 2019)]
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x/len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies


def get_top_clusters_week(comments=comments):
    comments = comments[("2019-07-25" <= comments["local_datetime"]) & (comments["local_datetime"] <= "2019-07-31")]
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x / len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies

Overwriting cluster_analysis.py


In [3]:
from cluster_analysis import infer_cluster as infer

In [18]:
clust = 48
try:
    print(comments.query('cluster == @clust').sample(3, random_state=0)[['content']])
except ValueError:
    print(comments.query('cluster == @clust').sample(3, random_state=0, replace=True)[['content']])

                       content
53037    за квартиру Булякова 
551036             За Квартиру
616601                Квартира


In [3]:
def infer_cluster(text):
    vector = preprocess_single_text(text)
    cluster = kmeans.predict(vector)
    try:
        random_3 = comments.query('cluster == @cluster').sample(3, random_state=0)['content'].to_list()
    except ValueError:
        random_3 = comments.query('cluster == @cluster').sample(3, random_state=0, replace=True).to_list()
    return cluster[0], random_3

In [4]:
infer("За Квартиру плт")

(48, [' за квартиру Булякова ', 'За Квартиру', 'Квартира'])

## Cluster frequency lookup

In [24]:
comments["cluster"].value_counts()[:3].map(lambda x: str(round(x/len(comments) * 100, 1))+"%").to_dict()

{1: '19.3%', 15: '7.4%', 41: '6.6%'}

In [27]:
def get_top_clusters_overall(comments=comments):
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x/len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies

In [28]:
get_top_clusters_overall()

{1: '19.3%', 15: '7.4%', 41: '6.6%'}

In [39]:
def get_top_clusters_month(comments=comments):
    comments = comments[(comments["local_datetime"].dt.month) == 7 & (comments["local_datetime"].dt.year == 2019)]
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x/len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies

In [40]:
get_top_clusters_month()

{1: '23.2%', 41: '7.8%', 15: '6.9%'}

In [43]:
def get_top_clusters_week(comments=comments):
    comments = comments[("2019-07-25" <= comments["local_datetime"]) & (comments["local_datetime"] <= "2019-07-31")]
    cluster_frequencies = (comments["cluster"]
                           .value_counts()[:3]
                           .map(lambda x: str(round(x/len(comments) * 100, 1))+"%")
                           .to_dict()
                           )
    return cluster_frequencies

In [44]:
get_top_clusters_week()

{1: '20.3%', 15: '7.5%', 41: '7.5%'}