<a href="https://colab.research.google.com/github/dyatelok/tg-chat-stat/blob/main/chat_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install emoji

In [None]:
pip install names

In [None]:
import re
from collections import Counter
import emoji
import numpy
import names
import pandas as pd
from matplotlib import pyplot
from wordcloud import WordCloud, STOPWORDS
import json
from pandas import json_normalize

In [None]:
# Load the chat export.
# The context manager closes the file even if reading fails, and the explicit
# encoding keeps non-ASCII message text from depending on the platform default.
with open("wiki_logs.json", "r", encoding="utf-8") as text_file:
    # read whole file to a string
    data = text_file.read()

# Parse the JSON document; the name avoids shadowing the built-in `dict`.
chat_export = json.loads(data)
# Flatten the records stored under the 'messages' key into a DataFrame.
df = json_normalize(chat_export['messages'])

In [None]:
# Keep only the columns used by the analysis below, then discard every row
# whose sender ('from') is missing.
analysis_columns = ['text', 'from', 'media_type', 'type', 'id', 'date']
df = df.loc[:, analysis_columns].dropna(subset=['from'])

In [None]:
# Count of non-null 'type' values per sender, largest count first.
(
    df.loc[:, ['type', 'from']]
    .groupby(['from'])
    .count()
    .sort_values(['type'], ascending=False)
)

In [None]:
# Count of non-null 'id' values for each distinct 'media_type'.
(
    df.loc[:, ['media_type', 'id']]
    .groupby('media_type', as_index=False)
    .count()
)

In [None]:
import plotly.express as px


def count_media_by_sender(messages, media_type):
    """Count the messages of one media type per sender.

    Parameters
    ----------
    messages : pandas.DataFrame
        Frame with at least the 'media_type', 'from' and 'id' columns.
    media_type : str
        Value of the 'media_type' column to keep (e.g. 'voice_message').

    Returns
    -------
    pandas.DataFrame
        Columns 'from' and 'id', where 'id' holds the number of matching
        messages of that sender; rows are sorted with the largest count first.
    """
    return (
        messages.loc[messages['media_type'] == media_type, ['from', 'id']]
        .groupby(['from'], as_index=False)
        .agg('count')
        .sort_values(['id'], ascending=False)
    )


def show_share_pie(counts, title):
    """Show a donut chart of the per-sender counts and return the figure.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with a 'from' column (slice labels) and an 'id' column (slice sizes).
    title : str
        Chart title.
    """
    fig = px.pie(counts, hole=.5, values='id', names='from', title=title)
    fig.update_traces(textposition='inside', textinfo='value+label+percent')
    fig.show()
    return fig


voice_df = count_media_by_sender(df, 'voice_message')
fig = show_share_pie(voice_df, 'Voice messages per person')

sticker_df = count_media_by_sender(df, 'sticker')
fig = show_share_pie(sticker_df, 'Stickers sent')

In [None]:
def get_emojis_in_message(row):
    """Return all emojis of one message concatenated into a single string.

    Parameters
    ----------
    row : object with a ``text`` attribute
        One row of the messages frame (as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    str or None
        The emojis in order of appearance ('' when the text contains none),
        or None when the text is not a plain string.
    """
    message = row.text
    # Telegram may save some messages as json
    if not isinstance(message, str):
        return None
    # emoji.emoji_list() matches whole emoji sequences. Testing single code
    # points against EMOJI_DATA would split or drop multi-code-point emojis
    # (ZWJ sequences, flags, keycaps, variation selectors).
    return "".join(match["emoji"] for match in emoji.emoji_list(message))

def get_words_count(row):
    """Return the number of words in one message.

    A word is a maximal run of ``\\w`` characters (letters, digits, underscore).

    Parameters
    ----------
    row : object with a ``text`` attribute
        One row of the messages frame (as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    int or None
        The word count, or None when the text is not a plain string.
    """
    message = row.text
    # Telegram may save some messages as json
    if not isinstance(message, str):
        return None
    # Raw string: "\w" inside a normal literal is an invalid escape sequence.
    return len(re.findall(r"\w+", message))

# Derive one new column per extractor by applying it row-wise to the 'text' column.
for column_name, extractor in (
    ("emojis", get_emojis_in_message),
    ("word_count", get_words_count),
):
    df[column_name] = df[["text"]].apply(extractor, axis=1)

In [None]:
people = df['from'].unique()

for name in people:
    # All messages sent by this person
    user_df = df.loc[df["from"].eq(name)]
    # Total number of words over those messages, and how many messages there are
    total_words = numpy.sum(user_df['word_count'])
    message_count = user_df.shape[0]
    print('stats for ', name)
    print(name,' sent  ', int(total_words), ' words, average ', total_words/message_count, ' per message')

In [None]:
import plotly.express as px

total_emojis_list = list(df.emojis)

# Tally individual emojis over all messages. Entries that are not strings
# (messages without plain text) are skipped, and messages without emojis
# contribute nothing, so no placeholder rows have to be removed afterwards.
# emoji.emoji_list() keeps multi-code-point emojis together.
emoji_counter = Counter(
    match['emoji']
    for message_emojis in total_emojis_list
    if isinstance(message_emojis, str)
    for match in emoji.emoji_list(message_emojis)
)

# (emoji, count) pairs, most frequent first
emoji_dict = emoji_counter.most_common()

emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])

# Only the 60 most frequent emojis are drawn to keep the chart readable
fig = px.pie(emoji_df.head(60), hole=.5, values='count', names='emoji',
             title='Emoji Distribution')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Join every plain-string message into one text; non-string entries are skipped
text_df = df.text.dropna()
text = " ".join(review for review in text_df if isinstance(review, str))
# Count words as runs of word characters (the same definition get_words_count
# uses); len(text) would report the number of characters instead.
print("There are {} words in all the messages.".format(len(re.findall(r"\w+", text))))

# Default English stop words plus frequent Russian filler words
stopwords = set(STOPWORDS)
stopwords.update(["Я","Ну","и","это","не","мне","но","А","ты","как","так","что","меня","то","нет","на","в","там","у","с","Да"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
# Display the generated image:
pyplot.figure( figsize=(10,5))
pyplot.imshow(wordcloud, interpolation='bilinear')
pyplot.axis("off")
pyplot.show()

In [None]:
# Parse the timestamps and use them as the index so the frame can be resampled by day
df["datetime"] = pd.to_datetime(df['date'])
df.index = df['datetime']
# Sum only the numeric word_count column: since pandas 2.0 reductions no longer
# drop non-numeric columns silently, so summing the whole frame fails on the
# datetime and text columns.
date_df = df[["word_count"]].resample("D").sum().reset_index()
fig = px.line(date_df, x="datetime", y="word_count", title='Number of words shared as time moves on.')
fig.update_xaxes(nticks=30)
fig.show()

In [None]:
# Series.plot draws on the current matplotlib axes when a figure is already open,
# so each chart gets its own axes; otherwise the second chart would be drawn over
# the first one.
top_days_fig, top_days_ax = pyplot.subplots()
(
    df["word_count"]
    .resample("D")
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(ax=top_days_ax, title="Top 10 days by words sent")
)

df["hour"] = df.datetime.dt.hour
hourly_fig, hourly_ax = pyplot.subplots()
# One bar per hour of the day that occurs in the data
df.groupby("hour")["word_count"].sum().plot.barh(ax=hourly_ax, title="Words sent by hour of day")
pyplot.show()

In [None]:

def dayofweek(i):
    """Return the English weekday name at index ``i`` (0 is Monday, 6 is Sunday)."""
    weekday_names = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    return weekday_names[i]

# One row per message: its word count, the name of its weekday and a constant 1
# that sums up to the number of messages.
day_df = pd.DataFrame(df["word_count"])
day_df['day_of_date'] = df['datetime'].dt.weekday
day_df['day_of_date'] = day_df["day_of_date"].apply(dayofweek)
day_df["messagecount"] = 1

# groupby sorts its keys, which puts the day names in alphabetical order.
# Reindex to calendar order so the polar axis runs Monday to Sunday; weekdays
# without any message are kept with a count of 0.
weekday_order = [dayofweek(weekday_number) for weekday_number in range(7)]
day = (
    day_df.groupby("day_of_date")
    .sum()
    .reindex(weekday_order, fill_value=0)
    .rename_axis("day_of_date")
    .reset_index()
)

fig = px.line_polar(day, r='messagecount', theta='day_of_date', line_close=True)
fig.update_traces(fill='toself')
fig.show()