In [1]:
import pandas as pd
import json
import copy
import glob
import math
import re
import os

import ipywidgets as widgets
import altair as alt


from utils.text_data_transformation import url_to_domain, word_to_num, delete_special_characters
from utils.dialog_manipulation import add_subdialogs_ids, add_reply_time

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
DIALOGS_FOLDER = "data/dialogs/"
RESULT_FOLDER = "result/"
PREPARED_FOLDER = 'data/prepared_dialogs/'

In [4]:
if os.path.isdir(DIALOGS_FOLDER):
    if not os.path.isdir(PREPARED_FOLDER):
        os.mkdir(PREPARED_FOLDER)
        
    if not os.path.isdir(RESULT_FOLDER):
        os.mkdir(RESULT_FOLDER)

In [5]:
def prepare_messages(data):
    """
    Makes preparation for the given message.
    :param data: DataFrame
    :return: None
    """
    for i in data.index:
        out_msg = []
        for word in str(data.loc[i, 'message']).split():
            if url_to_domain(word, check=True):
                out_msg.append(url_to_domain(word))
            else:
                word = word_to_num(word)
                out_msg.append(delete_special_characters(word))
        data.loc[i, 'message'] = re.sub(r'\s\s+', ' ', ' '.join(out_msg))



def prepare_dialog(dialog_file: str, prep_folder: str) -> None:
    """
    Reads raw csv data and creates prepared copy
    """
    file_name = dialog_file.split('/')[-1]
    data = pd.read_csv(dialog_file)
    
    prepare_messages(data)

    add_reply_time(data)
    add_subdialogs_ids(data)
    
    data.to_csv(f'{prep_folder}{file_name}')

In [6]:
# for dialog in glob.glob(DIALOGS_FOLDER+'/*.csv'):
for dialog in ['data/dialogs/test.csv']:
    prepare_dialog(dialog, PREPARED_FOLDER)

KeyError
-1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['reply_time'][i] += time_diff
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['subdialog_id'][i] = subdialog_count


In [11]:
# Initialization

# Your dialogs must be under data/prepared_dialogs/
dialogs = glob.glob(PREPARED_FOLDER+'/*.csv')

basic_info = {}

user = {
    'msg_count': 1,
    'msg_len': 0,
    'total_word_num': 0,
    'reply_avg_time': [],
    'msg_per_hour': {x: 0 for x in range(24)},
    'reply_per_hour': {x: [] for x in range(24)},
    'reply_time_word_length': {}
}

In [13]:
# Extracting basic information from all dialogs

for dialog_csv_path in dialogs:
    data = pd.read_csv(dialog_csv_path)
    
    #TODO: FIX THIS, never transform file name in this way, we need id data
    dialog = re.findall(r'\d\d+', dialog_csv_path)[0]

    if not basic_info.get(dialog):
        basic_info.setdefault(dialog, {})

    # Gathering data for each user
    for i in data.index:
        sender = str(data['from_id'][i])
        msg_length = len(str(data['message'][i]))

        if not basic_info[dialog].get(sender):
            basic_info[dialog].setdefault(sender, copy.deepcopy(user))

        basic_info[dialog][sender]['msg_count'] += 1
        basic_info[dialog][sender]['msg_len'] += msg_length
        basic_info[dialog][sender]['msg_per_hour'][int(data['date'][i][11:13])] += 1
        basic_info[dialog][sender]['total_word_num'] += len(str(data['message'][i]).split())

        if reply_time := data['reply_time'][i]:
            basic_info[dialog][sender]['reply_avg_time'].append(reply_time)
            basic_info[dialog][sender]['reply_per_hour'][int(data['date'][i][11:13])].append(reply_time)

        reply_time = int(data['reply_time'][i])
        reply_frequency = basic_info[dialog][sender]['reply_time_word_length']
        if not reply_frequency.get(reply_time):
            reply_frequency.setdefault(reply_time, [msg_length])
        else:
            reply_frequency[reply_time].append(msg_length)

    # Calculate average values
    for sender, value in basic_info[dialog].items():
        basic_info[dialog][sender]['msg_len'] /= data['id'].count()
        reply_time = basic_info[dialog][sender]['reply_avg_time']
        basic_info[dialog][sender]['reply_avg_time'] = sum(reply_time) / len(reply_time)

        # Calculating avg reply_time per hour
        reply_per_hour = basic_info[dialog][sender]['reply_per_hour']
        for k, v in reply_per_hour.items():
            if reply_time := reply_per_hour[k]:
                reply_per_hour[k] = sum(reply_time) / len(reply_time)
            else:
                reply_per_hour[k] = -1

        # Calculating avg msg length per reply_time
        reply_time_word_length = basic_info[dialog][sender]['reply_time_word_length']
        for k, v in reply_time_word_length.items():
            reply_time_word_length[k] = sum(v) / len(v)

AttributeError: 'float' object has no attribute 'append'

In [None]:
# Dumping gathered data to data/basic_info.json

with open(f'{RESULT_FOLDER}dialogs_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(basic_info, f, ensure_ascii=False, indent=4)

In [None]:
# Selecting dialog id to plot data

dialogs = list(basic_info.keys())
dialog_id = widgets.Dropdown(
    options=dialogs,
    value=dialogs[0],
    description='Dialog:',
    disabled=False,
)
display(dialog_id)


In [None]:
# Selecting user id to plot data

users = list(basic_info[dialog_id.value].keys())
user_id = widgets.Dropdown(
    options=users,
    value=users[0],
    description='User:',
    disabled=False,
)
display(user_id)


In [None]:
# Reply time on y-axis, daytime hours on x-axis, radius of circles corespondes to the message number
# at that hour.

plotting_data = pd.DataFrame({'Hour': range(24),
                              'ReplyTime': list(basic_info[dialog_id.value][user_id.value]['reply_per_hour'].values()),
                              'MessageNumber': list(basic_info[dialog_id.value][user_id.value]['msg_per_hour'].values())}).sort_values(by='Hour')

alt.Chart(plotting_data).mark_circle(size=60).encode(
    x='Hour',
    y='ReplyTime',
    size='MessageNumber:Q',
    tooltip=['Hour', 'ReplyTime', 'MessageNumber']
).properties(
    width=800,
    height=300
)

In [None]:
# Plot: Reply time sec on x-axis, radius of mark_circle avg message length

# Create DataFrame for plotting
reply_time_word_length = basic_info[dialog_id.value][user_id.value]['reply_time_word_length']
plotting_data = pd.DataFrame({'ReplyTime': list(reply_time_word_length.keys()),
                              'MessageLength': list(reply_time_word_length.values())}).sort_values(by='ReplyTime')

alt.Chart(plotting_data).mark_circle().encode(
    alt.X('ReplyTime:Q',),
    size='MessageLength:Q'
).properties(
    width=800,
    height=300
)
