In [None]:
import pandas as pd
import datetime as dt
import json
import copy
import glob
import re
import os

import ipywidgets as widgets
import altair as alt

from utils.text_data_transformation import url_to_domain, word_to_num, delete_special_characters
from utils.dialog_manipulation import add_subdialogs_ids, add_reply_time

In [None]:
# Folder layout used by this notebook (every path keeps its trailing slash):
#   DIALOGS_FOLDER  - raw dialog csv files that get prepared
#   PREPARED_FOLDER - prepared copies of those csv files
#   RESULT_FOLDER   - generated statistics
DIALOGS_FOLDER = 'data/dialogs/'
PREPARED_FOLDER = 'data/prepared_dialogs/'
RESULT_FOLDER = 'data/stats/'

In [None]:
# Initialisation
# Note: Your dialogs must be under data/prepared_dialogs/

# Prepared dialog files that exist at the moment this cell is executed.
dialogs = glob.glob(PREPARED_FOLDER + '/*.csv')
# Keys whose collected per-message values are replaced by their mean
# once a dialog has been processed.
AVERAGE_STATS = {'word_count', 'msg_len', 'reply_time'}
# Gathered statistics: {dialog name: {user id: copy of user_data}}.
user_stats = {}
# Empty accumulator for one bucket. The counter has to start at 0: every
# processed message adds 1 to it, so a start value of 1 makes each bucket
# report one message too many (and empty hours report one message).
basic_data = {'msg_count': 0,
              'word_count': [],
              'msg_len': [],
              'reply_time': []}
# Per-user template: overall totals, totals per hour of the day, and for
# every week day both the day totals and the totals per hour of that day.
user_data = {
    'basic_data': copy.deepcopy(basic_data),
    'hours_data': {i: copy.deepcopy(basic_data) for i in range(24)},
    'weeks_data': {day: {'basic_data': copy.deepcopy(basic_data),
                         'hours_data': {i: copy.deepcopy(basic_data) for i in range(24)}
                         }
                   for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                               'Friday', 'Saturday', 'Sunday']
                   }
}


def prepare_messages(data: pd.DataFrame) -> None:
    """
    Cleans every value of the "message" column of ``data`` in place.

    A message is split on whitespace and each word is rewritten: words for
    which ``url_to_domain(word, check=True)`` is truthy are replaced by
    ``url_to_domain(word)``, all other words go through ``word_to_num`` and
    then ``delete_special_characters``. The words are joined back with single
    spaces and any remaining run of whitespace is collapsed to one space.

    Missing messages (NaN / None) become an empty string instead of the
    text "nan".

    :param data: dialog table with a "message" column; modified in place.
    """
    def clean(message) -> str:
        # Cleans a single message value.
        if pd.isna(message):
            return ''
        out_msg = []
        for word in str(message).split():
            if url_to_domain(word, check=True):
                out_msg.append(url_to_domain(word))
            else:
                out_msg.append(delete_special_characters(word_to_num(word)))
        return re.sub(r'\s\s+', ' ', ' '.join(out_msg))

    # Replacing the column in one step is positional, so rows sharing an index
    # label each keep their own cleaned text (a label-based write per row
    # would overwrite all of them), and it avoids one DataFrame write per row.
    data['message'] = [clean(message) for message in data['message']]


def prepare_dialog(dialog_file: str, prep_folder: str) -> None:
    """
    Reads a raw dialog csv and saves a prepared copy of it.

    The copy is written into ``prep_folder`` under the same file name as
    ``dialog_file``. Before saving, the messages are cleaned with
    ``prepare_messages`` and the table is passed to ``add_reply_time`` and
    ``add_subdialogs_ids``.

    :param dialog_file: path of the raw csv file.
    :param prep_folder: folder for the prepared copy, with or without a
        trailing path separator.
    """
    # basename/join cope with any path separator and with a folder that has
    # no trailing slash, unlike splitting on '/' and concatenating strings.
    file_name = os.path.basename(dialog_file)
    data = pd.read_csv(dialog_file)

    prepare_messages(data)
    # NOTE(review): the return values are not used, so both helpers presumably
    # modify ``data`` in place - confirm in utils.dialog_manipulation.
    add_reply_time(data)
    add_subdialogs_ids(data)

    # NOTE(review): the DataFrame index is written too, so the prepared file
    # gets an extra unnamed first column; kept as is because other readers of
    # these files may rely on it.
    data.to_csv(os.path.join(prep_folder, file_name))


def get_stats(row: pd.Series) -> dict:
    """
    Gathers the statistics of a single message, i.e. one row
    @ DataFrame.

    :param row: row with the "message" and "reply_btw_sender_time" values.
    :return: dict with the same keys as the ``basic_data`` template:
        "msg_count" is 1, "word_count" and "msg_len" are one-element lists
        with the number of whitespace-separated words and the number of
        characters of the message, "reply_time" is a list holding the row's
        reply time, or an empty list when that value is missing or falsy.
    """
    message = row['message']
    # An empty message is read back from csv as NaN; treat it as empty text,
    # otherwise str() turns it into the 3-character word "nan".
    text = '' if pd.isna(message) else str(message)

    stats = {'msg_count': 1,
             'word_count': [len(text.split())],
             'msg_len': [len(text)],
             'reply_time': []}
    # NaN is truthy, so it has to be excluded explicitly; a single NaN would
    # turn the averaged reply time into NaN.
    if pd.notna(reply_time := row['reply_btw_sender_time']) and reply_time:
        stats['reply_time'].append(reply_time)
    return stats


def add_basic_stats(dialog_name: str, user_name: str, stats: dict) -> None:
    """
    Merges the statistics of one message into the overall
    ("basic_data") totals of the given user: every value of
    ``stats`` is added with ``+=`` to the entry stored under
    the same key.
    """
    for stat_name in stats:
        totals = user_stats[dialog_name][user_name]['basic_data']
        totals[stat_name] += stats[stat_name]


def add_hour_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Merges the statistics of one message into the user's
    bucket for the hour of the day the message belongs to.
    The hour is read from characters 11-12 of ``row['date']``;
    every value of ``stats`` is added with ``+=`` to the entry
    stored under the same key.
    """
    hourly_buckets = user_stats[dialog_name][user_name]['hours_data']
    bucket = hourly_buckets[int(row['date'][11:13])]
    for stat_name in stats:
        bucket[stat_name] += stats[stat_name]


def add_week_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Merges the statistics of one message into the user's
    week day data: both into the totals of the week day and
    into that day's bucket for the hour of the message.
    The week day name comes from the first 10 characters of
    ``row['date']`` (parsed as year-month-day), the hour from
    characters 11-12.
    """
    parsed_day = dt.datetime.strptime(row['date'][:10], '%Y-%m-%d')
    weekday = parsed_day.strftime("%A")
    hour_of_day = int(row['date'][11:13])
    day_bucket = user_stats[dialog_name][user_name]['weeks_data'][weekday]
    for stat_name in stats:
        amount = stats[stat_name]
        day_bucket['basic_data'][stat_name] += amount
        day_bucket['hours_data'][hour_of_day][stat_name] += amount


def calculate_average(dialog_name: str) -> None:
    """
    Walks through all nested dictionaries of a dialog and
    replaces, in place, every value stored under one of the
    AVERAGE_STATS keys: a non-empty value becomes its mean,
    an empty one becomes -1.
    """
    # Explicit stack of dictionaries that still have to be visited.
    pending = [user_stats[dialog_name]]
    while pending:
        node = pending.pop()
        for name in node:
            entry = node[name]
            if isinstance(entry, dict):
                pending.append(entry)
                continue
            if name not in AVERAGE_STATS:
                continue
            # Only existing keys are reassigned, so the dictionary keeps its
            # size while it is being iterated.
            node[name] = sum(entry) / len(entry) if entry else -1

In [None]:
# Checking the path
# The prepared and result folders are written to by the following cells no
# matter whether a raw dialogs folder exists, so they are always created.
# makedirs also creates missing parent folders, and exist_ok avoids an error
# when a folder is already there.
for folder in (PREPARED_FOLDER, RESULT_FOLDER):
    os.makedirs(folder, exist_ok=True)

if not os.path.isdir(DIALOGS_FOLDER):
    # Without raw dialogs the preparation cell has nothing to do.
    print('Raw dialogs folder not found: ', DIALOGS_FOLDER)

In [None]:
# Preparing raw data: every csv of the raw dialogs folder gets a prepared
# copy in the prepared dialogs folder
raw_dialog_files = glob.glob(DIALOGS_FOLDER + '/*.csv')
for dialog in raw_dialog_files:
    prepare_dialog(dialog_file=dialog, prep_folder=PREPARED_FOLDER)
    print('Done preparing file: ', dialog)

In [None]:
# Extracting basic information from all dialogs
# The prepared folder is listed right here instead of reusing `dialogs`:
# that list is built before the preparation cell writes its files, and the
# plotting cells later rebind the name to a list of dialog names.
for dialog_csv_path in sorted(glob.glob(os.path.join(PREPARED_FOLDER, '*.csv'))):
    data = pd.read_csv(dialog_csv_path)
    # Dialog name = file name without folder and extension
    dialog = os.path.splitext(os.path.basename(dialog_csv_path))[0]

    # Start from scratch, so that running this cell again does not try to
    # add new values to statistics that were already averaged.
    user_stats[dialog] = {}

    # Iterating over each message @ dialog
    for index, row in data.iterrows():
        user = str(row['from_id'])
        if user not in user_stats[dialog]:
            user_stats[dialog][user] = copy.deepcopy(user_data)

        # Gathering data
        stats = get_stats(row)
        add_basic_stats(dialog, user, stats)
        add_hour_stats(dialog, user, row, stats)
        add_week_stats(dialog, user, row, stats)

    # Calculating average values for every user of this dialog
    calculate_average(dialog)
    print("Done extracting data from dialog #", dialog, sep='')

In [None]:
# Dumping gathered data to dialog_members_statistics.json in the result folder
stats_json_path = f'{RESULT_FOLDER}dialog_members_statistics.json'
with open(stats_json_path, mode='w', encoding='utf-8') as f:
    json.dump(user_stats, fp=f, ensure_ascii=False)

In [None]:
# Dropdown for choosing the dialog to plot; the first dialog is preselected
dialogs = list(user_stats)
dialog_id = widgets.Dropdown(
    description='Dialog:',
    options=dialogs,
    value=dialogs[0],
    disabled=False,
)
display(dialog_id)

In [None]:
# Dropdown for choosing a user of the selected dialog; the first user is
# preselected
users = list(user_stats[dialog_id.value])
user_id = widgets.Dropdown(
    description='User:',
    options=users,
    value=users[0],
    disabled=False,
)
display(user_id)

In [None]:
# Reply time on the y-axis, hour of the day on the x-axis; the radius of a
# circle corresponds to the number of messages at that hour.

selected_hours = user_stats[dialog_id.value][user_id.value]['hours_data']
plotting_data = pd.DataFrame({
    'Hour': range(24),
    'ReplyTime': [hour['reply_time'] for hour in selected_hours.values()],
    'MessageNumber': [hour['msg_count'] for hour in selected_hours.values()],
}).sort_values(by='Hour')

(
    alt.Chart(plotting_data)
    .mark_circle(size=60)
    .encode(
        x='Hour',
        y='ReplyTime',
        size='MessageNumber:Q',
        tooltip=['Hour', 'ReplyTime', 'MessageNumber'],
    )
    .properties(width=800, height=300)
)