In [None]:
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

Read chat to DF

In [None]:
# Location of the chat export, relative to the notebook's working directory.
chat_export_path = '../Data/dva_dimona_vovonjaka.json'
data = pd.read_json(chat_export_path)

In [None]:

# `pd.json_normalize` is the supported top-level function; the
# `pandas.io.json.json_normalize` alias used before is deprecated.
# `.iloc[1]` takes the second entry of the `chats` column by position, which is
# what `data.chats[1]` resolves to for a default or string-labelled index
# (NOTE(review): confirm the export layout if the index is ever integer-labelled).
allChats = pd.json_normalize(data.chats.iloc[1])

In [None]:
# Flatten the message records of the first chat into one row per message.
# Uses the supported `pd.json_normalize` instead of the deprecated
# `pandas.io.json.json_normalize` alias.
chat = pd.json_normalize(allChats.messages.iloc[0])
chat.info()

DF cleanup

In [None]:
# Keep only rows whose type is 'message' and only the columns used later.
# `.loc[...]` followed by `.copy()` yields an independent frame, so the column
# assignments below do not write into a slice of the raw frame.
# NOTE: this cell consumes the `type` column, so it must be re-run together
# with the cell that creates `chat`.
message_columns = ['id', 'date', 'edited', 'text', 'from', 'media_type',
                   'sticker_emoji', 'reply_to_message_id']
chat = chat.loc[chat['type'] == 'message', message_columns].copy()

# Vectorised parsing instead of a per-row pd.Timestamp call.
chat['date'] = pd.to_datetime(chat['date'])
chat['edited'] = pd.to_datetime(chat['edited'])

# This timestamp is treated as a "no value" placeholder (presumably how the
# export marks never-edited messages -- TODO confirm). Masking only the
# `edited` column turns it into NaT and keeps the column datetime-typed,
# whereas a frame-wide replace with pd.NA falls back to object dtype.
never_edited = pd.Timestamp('1970-01-01T01:00:00')
chat['edited'] = chat['edited'].mask(chat['edited'] == never_edited)
chat.info()

In [None]:
# Quick look at the first 20 messages: what was said, by whom, and when.
preview_columns = ['text', 'from', 'date']
chat[preview_columns].head(20)

Statistics about message types

In [None]:
# One bar chart per message category, each counting messages per sender.
# The third panel counts every message whose `text` is not a plain string
# (the same predicate as before), which the title labels as links.
non_plain_text = chat['text'].apply(lambda value: not isinstance(value, str))
panels = [
    ('Количество сообщений', chat['from']),
    ('Количество стикеров', chat.loc[chat['media_type'] == 'sticker', 'from']),
    ('Количество ссылок', chat.loc[non_plain_text, 'from']),
    ('Количество видео', chat.loc[chat['media_type'] == 'video_file', 'from']),
]

fig, axes = plt.subplots(ncols=len(panels), figsize=(12, 4))
plt.subplots_adjust(wspace=0.5, hspace=0.5)

for ax, (title, senders) in zip(axes, panels):
    counts = senders.value_counts()
    if counts.empty:
        # Nothing to draw for this category (pandas may raise on an empty
        # bar plot); keep the titled, empty panel instead.
        ax.set_title(title)
        continue
    counts.plot(kind='bar', title=title, ax=ax)

Extract data from non-text messages

In [None]:
# Some messages store `text` as a list mixing plain strings and dicts that
# carry a 'type' and a 'text'. Split those lists into plain fragments, links
# and mentions.
rich_texts = chat.loc[chat['text'].apply(lambda value: isinstance(value, list)), 'text']

# Collect into Python lists: appending is O(1) and keeps chat order, whereas
# np.append(item, array) copied the array every time and prepended the item.
plain_parts, mention_parts, link_parts = [], [], []
for parts in rich_texts:
    for part in parts:
        if isinstance(part, str):
            plain_parts.append(part)
        elif isinstance(part, dict):
            kind = part.get('type')  # tolerate dicts without a 'type' key
            if kind == 'link':
                link_parts.append(part['text'])
            elif kind == 'mention':
                mention_parts.append(part['text'])

# Expose the results under the original names and as NumPy arrays, as before.
msgs = np.array(plain_parts)
mentions = np.array(mention_parts)
links = np.array(link_parts)

print('Messages: ', msgs)
print()
print('Mentions: ', mentions)
print()
print('Links: ', links)

### Define the tokenization function

In [None]:
import nltk
from pymystem3 import Mystem
from string import punctuation

# One-time setup: uncomment to download NLTK resources
# (presumably the ones behind stopwords.words() and sent_tokenize() -- confirm).
# nltk.download('stopwords')
# nltk.download('punkt')

# Module-level Mystem instance; tokenize() below calls its lemmatize() method.
mystem = Mystem()
def tokenize(text: str, stopWords=()):
    """Lemmatize `text` and return the cleaned lemmas that start with a Cyrillic letter.

    Args:
        text: Raw text; it is lower-cased before being passed to
            `mystem.lemmatize`.
        stopWords: Any container supporting `in`; lemmas found in it are
            dropped. Defaults to an empty tuple so the function can be called
            with the text alone.

    Returns:
        A list of stripped lemmas in their original order. A lemma is kept only
        if, after cleanup, it is non-empty, is not a stop word and begins with a
        letter in 'А'..'я' or with 'ё'/'Ё'. Numbers, Latin words and leftover
        punctuation are therefore excluded.
    """
    # Build the character cleanup once: the guillemets are deleted, every
    # ASCII punctuation mark except '-' (kept for hyphenated words) becomes a space.
    cleanup = {ord(mark): ' ' for mark in punctuation if mark != '-'}
    cleanup[ord('«')] = None
    cleanup[ord('»')] = None

    result = []
    for lemma in mystem.lemmatize(text.lower()):
        token = lemma.translate(cleanup).strip()
        if not token or token in stopWords:
            continue
        # Test the first character only. Comparing the whole string with 'я'
        # rejected every word that starts with 'я' but is longer than one
        # letter, and 'ё' lies outside the 'А'..'я' code-point range.
        first = token[0]
        if 'А' <= first <= 'я' or first in 'ёЁ':
            result.append(token)
    return result

Define stop-words

In [None]:
from nltk.corpus import stopwords

# NLTK's Russian stop-word list followed by extra filler words and stray
# symbols chosen for this chat.
extra_stop_words = ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', '–', 'к', 'на', '...', '✔', '•', '’', 'че', 'ток', 'шо', 'тип']
stop_words = stopwords.words('russian') + extra_stop_words

Define functions for getting a user's vocabulary

In [None]:
from collections import Counter

def getTokens(data, stopWords=()):
    """Tokenize every piece of plain text found in a collection of message texts.

    Args:
        data: Iterable of message `text` values (e.g. a pandas Series). A value
            is either a plain string or a list mixing strings and dicts; only
            the strings are tokenized, anything else is ignored.
        stopWords: Container of lemmas to drop; forwarded to `tokenize`.

    Returns:
        A list of tokens: first those from plain-string messages, then those
        from the string fragments of list-valued messages, each group in the
        order the messages appear in `data`.
    """
    plain_texts = []
    fragment_texts = []
    for value in data:
        if isinstance(value, str):
            plain_texts.append(value)
        elif isinstance(value, list):
            # Appending keeps fragments in chat order; the previous
            # np.append(item, msgs) prepended them, reversing the order.
            fragment_texts.extend(part for part in value if isinstance(part, str))

    tokens = tokenize(' '.join(plain_texts), stopWords)
    tokens.extend(tokenize(' '.join(fragment_texts), stopWords))
    return tokens

def getUserVocabulary(sender:str, stopWords=[]):
    """Count how often each token occurs in the messages sent by `sender`.

    Reads the notebook-level `chat` frame, selects the rows whose `from` equals
    `sender`, tokenizes their `text` with `getTokens` and returns the counts as
    a `collections.Counter`.
    """
    sent_by_user = chat['from'] == sender
    user_texts = chat[sent_by_user].text
    return Counter(getTokens(user_texts, stopWords))

### Most frequently used words per user

In [None]:
# One panel per participant: the 20 most frequent lemmas, without stop-word filtering.
senders = ['Dez Dezsson', 'Дима Лацюга', 'Вова Свинухов']

fig, axes = plt.subplots(ncols=len(senders), figsize=(15, 4))
plt.subplots_adjust(wspace=0.5, hspace=0.5)

for ax, sender in zip(axes, senders):
    top_words = getUserVocabulary(sender).most_common(20)
    ax.set_title(sender)
    if not top_words:
        # zip(*[]) yields nothing to unpack; leave the titled panel empty.
        continue
    words, counts = zip(*top_words)
    ax.barh(words, counts)

In [None]:
# One panel per participant: the 20 most frequent lemmas after removing `stop_words`.
senders = ['Dez Dezsson', 'Дима Лацюга', 'Вова Свинухов']

fig, axes = plt.subplots(ncols=len(senders), figsize=(15, 4))
plt.subplots_adjust(wspace=0.5, hspace=0.5)

for ax, sender in zip(axes, senders):
    top_words = getUserVocabulary(sender, stop_words).most_common(20)
    ax.set_title(sender)
    if not top_words:
        # zip(*[]) yields nothing to unpack; leave the titled panel empty.
        continue
    words, counts = zip(*top_words)
    ax.barh(words, counts)

In [None]:
# Vocabulary size = number of distinct lemmas left after stop-word removal.
vocab_senders = ['Dez Dezsson', 'Дима Лацюга', 'Вова Свинухов']
z_len, d_len, v_len = [len(getUserVocabulary(name, stop_words)) for name in vocab_senders]
plt.bar(vocab_senders, [z_len, d_len, v_len])
plt.title("Словарный запас")

In [None]:
# Number of messages sent by each participant.
ratio_senders = ['Dez Dezsson', 'Дима Лацюга', 'Вова Свинухов']
sender_column = chat['from']
z_count, d_count, v_count = [len(chat[sender_column == name]) for name in ratio_senders]

print(z_count, d_count, v_count)

# Distinct lemmas per message sent, using the vocabulary sizes computed earlier.
vocab_per_message = [
    size / total
    for size, total in zip((z_len, d_len, v_len), (z_count, d_count, v_count))
]
plt.bar(ratio_senders, vocab_per_message)
plt.title("Словарный запас vs колличество сообщений")

## Find n-grams

In [None]:
def findNGramms(data, n=1,most_common=20,stop_words=[]):
    """Build a frequency distribution of the n-grams found in `data`.

    `data` is tokenized with `getTokens(data, stop_words)`, the tokens are
    grouped into n-grams of length `n`, and the resulting `nltk.FreqDist` is
    returned after printing its total (`N()`) and distinct (`B()`) counts.
    `most_common` is accepted but not used by this function.
    """
    freq = nltk.FreqDist(nltk.ngrams(getTokens(data, stop_words), n))
    print ('Колличество токенов: ',freq.N())
    print ('Колличество уникальных токенов: ',freq.B())
    return freq

### N-граммы Димона

In [None]:
# 2-gram frequencies for this sender; plot() is asked for 20 samples.
dima_texts_2 = chat.loc[chat['from'] == 'Дима Лацюга', 'text']
dima_2grams = findNGramms(dima_texts_2, n=2, stop_words=stop_words)
dima_2grams.plot(20, title='Дима Лацюга 2-gramms')

In [None]:
# 3-gram frequencies for this sender; plot() is asked for 20 samples.
dima_texts_3 = chat.loc[chat['from'] == 'Дима Лацюга', 'text']
dima_3grams = findNGramms(dima_texts_3, n=3, stop_words=stop_words)
dima_3grams.plot(20, title='Дима Лацюга 3-gramms')

In [None]:
# 4-gram frequencies for this sender; plot() is asked for 20 samples.
dima_texts_4 = chat.loc[chat['from'] == 'Дима Лацюга', 'text']
dima_4grams = findNGramms(dima_texts_4, n=4, stop_words=stop_words)
dima_4grams.plot(20, title='Дима Лацюга 4-gramms')

### N-граммы Вовоняки

In [None]:
# 2-gram frequencies for this sender; plot() is asked for 20 samples.
vova_texts_2 = chat.loc[chat['from'] == 'Вова Свинухов', 'text']
vova_2grams = findNGramms(vova_texts_2, n=2, stop_words=stop_words)
vova_2grams.plot(20, title='Вова Свинухов 2-gramms')

In [None]:
# 3-gram frequencies for this sender; plot() is asked for 20 samples.
vova_texts_3 = chat.loc[chat['from'] == 'Вова Свинухов', 'text']
vova_3grams = findNGramms(vova_texts_3, n=3, stop_words=stop_words)
vova_3grams.plot(20, title='Вова Свинухов 3-gramms')

In [None]:
# 4-gram frequencies for this sender; plot() is asked for 20 samples.
vova_texts_4 = chat.loc[chat['from'] == 'Вова Свинухов', 'text']
vova_4grams = findNGramms(vova_texts_4, n=4, stop_words=stop_words)
vova_4grams.plot(20, title='Вова Свинухов 4-gramms')

### Мои N-граммы

In [None]:
# 2-gram frequencies for this sender; plot() is asked for 20 samples.
dez_texts_2 = chat.loc[chat['from'] == 'Dez Dezsson', 'text']
dez_2grams = findNGramms(dez_texts_2, n=2, stop_words=stop_words)
dez_2grams.plot(20, title='Dez Dezsson 2-gramms')


In [None]:
# 2-gram frequencies over every sender's messages; plot() is asked for 20 samples.
all_texts = chat['text']
all_2grams = findNGramms(all_texts, n=2, stop_words=stop_words)
all_2grams.plot(20, title='Наиболее частые 2-gramms')

In [None]:
import gensim
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Sample Russian paragraph used to try sentence-by-sentence tokenization.
s = 'Умирает старый пчеловод, посвятивший всю жизнь разведению пчёл и уходу за ними. Лёжа на смертном одре, он подзывает к себе трёх своих сыновей и говорит: «Дети мои! Запомните – всё в этой жизни хуйня – всё кроме пчёл» Затем, закрыв глаза и помолчав немного, добавляет: «А впрочем, и пчёлы тоже хуйня».'


# tokenize() takes the stop-word collection as its second argument; calling it
# with the text alone raised a TypeError. Pass an empty list to keep every lemma.
sentences = [tokenize(sent, []) for sent in sent_tokenize(s, 'russian')]
print(sentences)
# model = gensim.models.Word2Vec(sentences, size=150, window=5, min_count=5, workers=4)
# model.save('./w2v.model')
# print('saved')

# tokens = tokenize(s, [])
# tokens