In [3]:
import ipywidgets as widgets
import altair as alt
import pandas as pd
import json
import copy
import glob
import math
import re

In [4]:
pd.set_option('display.max_rows', None)

In [5]:
# just to show files structure, we should keep it same

! tree . --filelimit 10

[01;34m.[00m
├── 0_make_initial_preparation.py
├── README.md
├── [01;34mdata[00m
│   ├── [01;34mdialogs[00m [417 entries exceeds filelimit, not opening dir]
│   └── [01;34mdialogs_list[00m [417 entries exceeds filelimit, not opening dir]
├── messages_per_hours.ipynb
├── requirements.txt
├── [01;34mtelegram-data-collection[00m
│   ├── 0_download_dialogs_list.py
│   ├── 1_download_dialogs_data.py
│   ├── README.md
│   ├── [01;34mconfig[00m
│   │   ├── config.json
│   │   └── config_example.json
│   ├── requirements.txt
│   ├── tmp.session
│   └── [01;34mutils[00m
│       ├── [01;34m__pycache__[00m
│       │   └── utils.cpython-37.pyc
│       └── utils.py
└── [01;34mutils[00m
    └── data_transformation.py

8 directories, 14 files


## REVIEW:
### 1. pls move data preparetion with related comments in this file
### 2. save all results (example: basic_info.json) in the related "result" subfolders
### 3. basic_info.json should be renamed to the "dialogs_statistics.json". this name should be variable
### 4. move all global/config variables in the section below

In [6]:
DIALOGS_FOLDER = "data/dialogs"
RESULT_FOLDER = "result"

In [7]:
# Initialization

# Your dialogs must be under data/prepared_dialogs/
dialogs = glob.glob(DIALOGS_FOLDER+'/*.csv')

basic_info = {}

user = {
    'msg_count': 1,
    'msg_len': 0,
    'total_word_num': 0,
    'reply_avg_time': [],
    'msg_per_hour': {x: 0 for x in range(24)},
    'reply_per_hour': {x: [] for x in range(24)},
    'reply_time_word_length': {}
}

In [8]:
# Extracting basic information from all dialogs

for dialog_csv_path in dialogs:
    data = pd.read_csv(dialog_csv_path)
    dialog = re.findall(r'\d\d+', dialog_csv_path)[0]

    if not basic_info.get(dialog):
        basic_info.setdefault(dialog, {})

    # Gathering data for each user
    for i in data.index:
        sender = str(data['from_id'][i])
        msg_length = len(str(data['message'][i]))

        if not basic_info[dialog].get(sender):
            basic_info[dialog].setdefault(sender, copy.deepcopy(user))

        basic_info[dialog][sender]['msg_count'] += 1
        basic_info[dialog][sender]['msg_len'] += msg_length
        basic_info[dialog][sender]['msg_per_hour'][int(data['date'][i][11:13])] += 1
        basic_info[dialog][sender]['total_word_num'] += len(str(data['message'][i]).split())

        if reply_time := data['reply_time'][i]:
            basic_info[dialog][sender]['reply_avg_time'].append(reply_time)
            basic_info[dialog][sender]['reply_per_hour'][int(data['date'][i][11:13])].append(reply_time)

        reply_time = int(data['reply_time'][i])
        reply_frequency = basic_info[dialog][sender]['reply_time_word_length']
        if not reply_frequency.get(reply_time):
            reply_frequency.setdefault(reply_time, [msg_length])
        else:
            reply_frequency[reply_time].append(msg_length)

    # Calculate average values
    for sender, value in basic_info[dialog].items():
        basic_info[dialog][sender]['msg_len'] /= data['id'].count()
        reply_time = basic_info[dialog][sender]['reply_avg_time']
        basic_info[dialog][sender]['reply_avg_time'] = sum(reply_time) / len(reply_time)

        # Calculating avg reply_time per hour
        reply_per_hour = basic_info[dialog][sender]['reply_per_hour']
        for k, v in reply_per_hour.items():
            if reply_time := reply_per_hour[k]:
                reply_per_hour[k] = sum(reply_time) / len(reply_time)
            else:
                reply_per_hour[k] = -1

        # Calculating avg msg length per reply_time
        reply_time_word_length = basic_info[dialog][sender]['reply_time_word_length']
        for k, v in reply_time_word_length.items():
            reply_time_word_length[k] = sum(v) / len(v)

KeyError: 'reply_time'

In [4]:
# Dumping gathered data to data/basic_info.json

with open('data/basic_info.json', 'w', encoding='utf-8') as f:
    json.dump(basic_info, f, ensure_ascii=False, indent=4)

In [5]:
# Selecting dialog id to plot data

dialogs = list(basic_info.keys())
dialog_id = widgets.Dropdown(
    options=dialogs,
    value=dialogs[0],
    description='Dialog:',
    disabled=False,
)
display(dialog_id)


IndexError: list index out of range

In [None]:
# Selecting user id to plot data

users = list(basic_info[dialog_id.value].keys())
user_id = widgets.Dropdown(
    options=users,
    value=users[0],
    description='User:',
    disabled=False,
)
display(user_id)


In [None]:
# Reply time on y-axis, daytime hours on x-axis, radius of circles corespondes to the message number
# at that hour.

plotting_data = pd.DataFrame({'Hour': range(24),
                              'ReplyTime': list(basic_info[dialog_id.value][user_id.value]['reply_per_hour'].values()),
                              'MessageNumber': list(basic_info[dialog_id.value][user_id.value]['msg_per_hour'].values())}).sort_values(by='Hour')

alt.Chart(plotting_data).mark_circle(size=60).encode(
    x='Hour',
    y='ReplyTime',
    size='MessageNumber:Q',
    tooltip=['Hour', 'ReplyTime', 'MessageNumber']
).properties(
    width=800,
    height=300
)

In [None]:
# Plot: Reply time sec on x-axis, radius of mark_circle avg message length

# Create DataFrame for plotting
reply_time_word_length = basic_info[dialog_id.value][user_id.value]['reply_time_word_length']
plotting_data = pd.DataFrame({'ReplyTime': list(reply_time_word_length.keys()),
                              'MessageLength': list(reply_time_word_length.values())}).sort_values(by='ReplyTime')

alt.Chart(plotting_data).mark_circle().encode(
    alt.X('ReplyTime:Q',),
    size='MessageLength:Q'
).properties(
    width=800,
    height=300
)
