In [1]:
import pandas as pd
import datetime as dt
import json
import copy
import glob
import re
import os

import gender_guesser.detector as gender

import ipywidgets as widgets
import altair as alt

from utils.text_data_transformation import url_to_domain, word_to_num, delete_special_characters, transform_raw_data
from utils.dialog_manipulation import add_subdialogs_ids, add_reply_time, detect_data_language, \
    get_user_step_msgs, if_name_in_dict

In [2]:
# Set up
# Folder with the raw dialog CSV files that the "Preparing raw data" cell reads.
DIALOGS_FOLDER = "data/new_type_dialogs/"
# Folder the aggregated statistics JSON is written to.
RESULT_FOLDER = "data/stats/"
# Folder the prepared copies of the dialog CSV files are written to and later read from.
PREPARED_FOLDER = 'data/new_type_dialogs_prepared/'
# Folder with the per-dialog "<dialog id>.json" metadata files.
# Unlike the folders above it has no trailing slash; it is only combined via os.path.join.
META_FOLDER = 'data/new_type_dialogs_meta2'
# CSV holding the messages of all dialogs (read with the "from_id" and "dialog ID" columns).
GENERAL_DIALOGS_DF = 'data/processed_dialog_files2/general_df.csv'

In [None]:
# Initialisation
# Note: Your prepared dialogs must be under PREPARED_FOLDER

dialogs = glob.glob(PREPARED_FOLDER + '/*.csv')
# Keys whose per-message value lists calculate_average replaces with their mean.
AVERAGE_STATS = {'word_count', 'msg_len', 'reply_time'}
# dialog name -> user id -> statistics tree shaped like `user_data`.
user_stats = {}
# Template of a single statistics bucket.
# msg_count starts at 0: every message adds 1 to it, so starting at 1 made each
# counter one too high and made buckets without any message report one message.
basic_data = {'msg_count': 0,
              'word_count': [],
              'msg_len': [],
              'reply_time': []}
# Template of everything gathered per user: overall totals, one bucket per hour
# of the day, and for every weekday its totals plus one bucket per hour.
user_data = {
    'basic_data': copy.deepcopy(basic_data),
    'hours_data': {i: copy.deepcopy(basic_data) for i in range(24)},
    'weeks_data': {day: {'basic_data': copy.deepcopy(basic_data),
                         'hours_data': {i: copy.deepcopy(basic_data) for i in range(24)}
                         }
                   for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                               'Friday', 'Saturday', 'Sunday']
                   }
}


def prepare_messages(data: pd.DataFrame) -> None:
    """
    Rewrites the "message" column of ``data`` in place, row by row.

    Each message is split on whitespace. A word for which
    ``url_to_domain(word, check=True)`` is truthy is replaced by
    ``url_to_domain(word)``; any other word goes through ``word_to_num`` and
    then ``delete_special_characters``. The words are joined with single
    spaces and runs of whitespace left in the result are collapsed.
    """
    for row_label, record in data.iterrows():
        cleaned_words = []
        for token in str(record['message']).split():
            if url_to_domain(token, check=True):
                cleaned_words.append(url_to_domain(token))
                continue
            cleaned_words.append(delete_special_characters(word_to_num(token)))
        joined = ' '.join(cleaned_words)
        data.loc[row_label, 'message'] = re.sub(r'\s\s+', ' ', joined)


def prepare_dialog(dialog_file: str, prep_folder: str) -> None:
    """
    Reads a raw dialog csv and writes a prepared copy of it.

    The copy has its messages normalised by ``prepare_messages`` and is then
    passed through ``add_reply_time`` and ``add_subdialogs_ids`` before being
    saved under the same file name inside ``prep_folder``.

    :param dialog_file: path of the raw csv file.
    :param prep_folder: folder the prepared copy is written to, with or
        without a trailing path separator.
    """
    # basename/join instead of split('/') and string concatenation: the latter
    # break on paths using the platform separator and on a folder passed
    # without a trailing slash.
    file_name = os.path.basename(dialog_file)
    data = pd.read_csv(dialog_file)

    prepare_messages(data)
    add_reply_time(data)
    add_subdialogs_ids(data)

    data.to_csv(os.path.join(prep_folder, file_name))


def get_stats(row: pd.Series) -> dict:
    """
    Gathers the statistics of a single message row.

    :param row: one row of a prepared dialog; its "message" and
        "reply_btw_sender_time" fields are read.
    :return: a copy of ``basic_data`` with ``msg_count`` set to 1, the word
        count and character length of the message appended, and the reply
        time appended when one is present.
    """
    stats = copy.deepcopy(basic_data)
    stats['msg_count'] = 1
    message = str(row['message'])
    stats['word_count'].append(len(message.split()))
    stats['msg_len'].append(len(message))
    # An empty csv cell is read back as NaN, and NaN is truthy: without the
    # notna check it would be appended and turn the later average into NaN.
    reply_time = row['reply_btw_sender_time']
    if pd.notna(reply_time) and reply_time:
        stats['reply_time'].append(reply_time)
    return stats


def add_basic_stats(dialog_name: str, user_name: str, stats: dict) -> None:
    """
    Accumulates one message's statistics into the user's overall totals.

    Every entry of ``stats`` (message count, word count, message length,
    reply time) is added onto the entry with the same key in the user's
    'basic_data' bucket.
    """
    for field, amount in stats.items():
        totals = user_stats[dialog_name][user_name]['basic_data']
        totals[field] += amount


def add_hour_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Accumulates one message's statistics into the user's per-hour bucket.

    The hour is taken from characters 11-12 of the row's "date" string and
    selects the bucket in 'hours_data' that every entry of ``stats``
    (message count, word count, message length, reply time) is added onto.
    """
    per_hour = user_stats[dialog_name][user_name]['hours_data']
    hour_bucket = per_hour[int(row['date'][11:13])]
    for field, amount in stats.items():
        hour_bucket[field] += amount


def add_week_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Accumulates one message's statistics into the user's weekday buckets.

    The weekday name comes from the first ten characters of the row's "date"
    string (parsed as %Y-%m-%d) and the hour from characters 11-12. Every
    entry of ``stats`` (message count, word count, message length, reply
    time) is added both to that weekday's totals and to that weekday's
    bucket for the hour.
    """
    timestamp = row['date']
    parsed_day = dt.datetime.strptime(timestamp[:10], '%Y-%m-%d')
    weekday = parsed_day.strftime("%A")
    hour_of_day = int(timestamp[11:13])
    weekday_stats = user_stats[dialog_name][user_name]['weeks_data'][weekday]
    for field, amount in stats.items():
        weekday_stats['basic_data'][field] += amount
        weekday_stats['hours_data'][hour_of_day][field] += amount


def calculate_average(dialog_name: str) -> None:
    """
    Replaces, in place, every collected value list of a dialog by its mean.

    The statistics tree of ``dialog_name`` is walked recursively. For each
    key listed in ``AVERAGE_STATS`` a non-empty value becomes
    ``sum(value) / len(value)`` and an empty one becomes -1; all other
    non-dict entries are left as they are.
    """

    def _average_in_place(node):
        for name, content in node.items():
            if isinstance(content, dict):
                _average_in_place(content)
                continue
            if name not in AVERAGE_STATS:
                continue
            # -1 marks "no data" for this statistic
            node[name] = sum(content) / len(content) if content else -1

    _average_in_place(user_stats[dialog_name])

In [None]:
# Checking the path: the output folders are only created when the raw
# dialogs folder is present.
if os.path.isdir(DIALOGS_FOLDER):
    for output_folder in (PREPARED_FOLDER, RESULT_FOLDER):
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)

In [None]:
# Preparing raw data: every csv in DIALOGS_FOLDER gets a prepared copy in PREPARED_FOLDER
raw_dialog_pattern = DIALOGS_FOLDER + '/*.csv'
for dialog in glob.glob(raw_dialog_pattern):
    prepare_dialog(dialog, PREPARED_FOLDER)
    print('Done preparing file: ', dialog)

In [None]:
# Extracting basic information from all dialogs
# The prepared files are listed here rather than taken from a list built in an
# earlier cell: that list is created before the preparation cell has written
# anything, so on a fresh "Run All" it would be empty.
for dialog_csv_path in glob.glob(os.path.join(PREPARED_FOLDER, '*.csv')):
    data = pd.read_csv(dialog_csv_path)
    # Dialog name = file name without folder and extension
    dialog = os.path.splitext(os.path.basename(dialog_csv_path))[0]

    # Always start from an empty entry: calculate_average turns the collected
    # lists into numbers, so accumulating onto the result of a previous run of
    # this cell would fail with a TypeError.
    user_stats[dialog] = {}

    # Iterating over each message @ dialog
    for index, row in data.iterrows():
        user = str(row['from_id'])
        if user not in user_stats[dialog]:
            user_stats[dialog][user] = copy.deepcopy(user_data)

        # Gathering data
        stats = get_stats(row)
        add_basic_stats(dialog, user, stats)
        add_hour_stats(dialog, user, row, stats)
        add_week_stats(dialog, user, row, stats)

    # Calculating average values for every user of this dialog
    calculate_average(dialog)
    print("Done extracting data from dialog #", dialog, sep='')

In [None]:
# Dumping gathered data to dialog_members_statistics.json inside RESULT_FOLDER
with open(f'{RESULT_FOLDER}dialog_members_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(user_stats, f, ensure_ascii=False)

In [None]:
# Selecting dialog id to plot data (the first dialog is preselected)
dialogs = list(user_stats)
dialog_dropdown_options = dict(
    options=dialogs,
    value=dialogs[0],
    description='Dialog:',
    disabled=False,
)
dialog_id = widgets.Dropdown(**dialog_dropdown_options)
display(dialog_id)

In [None]:
# Selecting user id to plot data (the first user of the chosen dialog is preselected)
users = list(user_stats[dialog_id.value])
user_dropdown_options = dict(
    options=users,
    value=users[0],
    description='User:',
    disabled=False,
)
user_id = widgets.Dropdown(**user_dropdown_options)
display(user_id)

In [None]:
# Reply time on the y-axis, hour of the day on the x-axis; the radius of each
# circle corresponds to the number of messages at that hour.

hourly_stats = user_stats[dialog_id.value][user_id.value]['hours_data']
plotting_data = pd.DataFrame({
    'Hour': range(24),
    'ReplyTime': [bucket['reply_time'] for bucket in hourly_stats.values()],
    'MessageNumber': [bucket['msg_count'] for bucket in hourly_stats.values()],
}).sort_values(by='Hour')

hourly_chart = alt.Chart(plotting_data).mark_circle(size=60).encode(
    x='Hour',
    y='ReplyTime',
    size='MessageNumber:Q',
    tooltip=['Hour', 'ReplyTime', 'MessageNumber'],
)
hourly_chart.properties(width=800, height=300)

In [3]:
def get_user_gender(username, name_dicts):
    """
    Guesses a gender from the first word of a name.

    The lower-cased first word is looked up in the name tables registered for
    its detected language. When that gives no answer and the language is
    'en', gender_guesser is asked as a fallback.

    :param username: the name to analyse; only its first whitespace-separated
        word is used.
    :param name_dicts: maps a language code to a sequence of name tables whose
        first entry holds female names and whose later entries hold male
        names. Each table needs a 'name' column.
    :return: 'female', 'male', or '' when the gender could not be determined.
    """
    user_gender = ''

    # An empty or whitespace-only name has no first word to analyse; indexing
    # the split result would raise IndexError.
    name_parts = username.strip().split() if username else []
    if not name_parts:
        return user_gender

    first_name = name_parts[0].lower()
    print('first_name', first_name)

    new_df = pd.DataFrame({'message': [first_name]})
    word_lang = detect_data_language(new_df, 'one_word')

    # .get(): a detected language without name tables means "no match",
    # not a KeyError.
    for num_df, names_df in enumerate(name_dicts.get(word_lang, [])):
        if not names_df.loc[names_df['name'].str.lower() == first_name].empty or\
                if_name_in_dict(first_name, names_df):
            # Position 0 is the female table, anything after it is male
            if num_df == 0:
                user_gender = 'female'
            else:
                user_gender = 'male'
            print(f"{first_name} gender is {user_gender}\n")
            break

    if user_gender == '' and word_lang == 'en':
        gender_detector = gender.Detector()
        guess = gender_detector.get_gender(first_name.capitalize())
        # gender_guesser can also answer 'mostly_male', 'mostly_female',
        # 'andy' or 'unknown'. Fold these into the three values this function
        # returns so callers comparing against '' keep working.
        user_gender = {
            'male': 'male',
            'mostly_male': 'male',
            'female': 'female',
            'mostly_female': 'female',
        }.get(guess, '')
        print(f"{first_name} gender is {user_gender}\n")

    return user_gender


In [4]:
import pymorphy2
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Shared pymorphy2 analyzer; get_gender_by_verb uses it to tag words of Ukrainian dialogs.
# NOTE(review): it is created without a `lang` argument although it tags Ukrainian
# words - confirm that the default dictionary is the intended one.
morph = pymorphy2.MorphAnalyzer()


def get_gender_by_verb(user_id, dialog_id, user_general_df):
    """
    Guesses a user's gender from the verb forms used in their messages.

    Words ending in 'ла' that are tagged as verbs count as female evidence.
    Tagged verbs ending in 'в'/'к'/'с' (Ukrainian) or 'л'/'к'/'с' (Russian)
    count as male evidence. The side with more hits wins.

    :param user_id: id of the user whose messages are analysed.
    :param dialog_id: id of the dialog the messages are taken from.
    :param user_general_df: passed through to ``get_user_step_msgs``.
    :return: 'male', 'female', or '' for English dialogs, a tie, or no
        evidence.
    """
    lang = detect_data_language('', 'df_loc', GENERAL_DIALOGS_DF, dialog_id, user_id)

    user_gender = ''
    if lang == 'en':
        return user_gender

    female_gender, male_gender = 0, 0
    dialog_step_msgs = get_user_step_msgs(GENERAL_DIALOGS_DF, dialog_id, user_id, 100, user_general_df)

    # The natasha pipeline is expensive to build and does not depend on the
    # word, so it is created once per call instead of once per word.
    segmenter = morph_tagger = None
    if lang == 'ru':
        segmenter = Segmenter()
        morph_tagger = NewsMorphTagger(NewsEmbedding())

    for msg in dialog_step_msgs:
        msg = transform_raw_data(msg, lang, '', '', 'without_lemma')
        for word in msg.split():
            word = word.strip()
            print(word)
            if len(word) <= 2:
                continue

            if lang == 'ua':
                if word[-2:] == 'ла' and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    female_gender += 1

                elif word[-1] in ('в', 'к', 'с') and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    male_gender += 1

            elif lang == 'ru':
                # Only these endings can be counted below, so every other word
                # is skipped before the costly tagging step.
                if not word.endswith(('ла', 'л', 'к', 'с')):
                    continue

                doc = Doc(word)
                doc.segment(segmenter)
                if not doc.tokens:
                    # Nothing to tag; indexing tokens[0] would raise IndexError
                    continue
                doc.tag_morph(morph_tagger)

                if word[-2:] == 'ла' and doc.tokens[0].pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    female_gender += 1

                elif word[-1] in ('л', 'к', 'с') and doc.tokens[0].pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    male_gender += 1

    if male_gender > female_gender:
        user_gender = 'male'
    elif male_gender < female_gender:
        user_gender = 'female'

    return user_gender

In [None]:
!pip install -U pymorphy2-dicts-uk
!pip install -U pymorphy2-dicts-ru

In [9]:
female_ukr_names = pd.read_csv(os.path.join('dicts', 'female_ukrainian_names.csv'))
male_ukr_names = pd.read_csv(os.path.join('dicts', 'male_ukrainian_names.csv'))
female_ru_names = pd.read_csv(os.path.join('dicts', 'female_russian_names.csv'))
male_ru_names = pd.read_csv(os.path.join('dicts', 'male_russian_names.csv'))

female_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'female_ru_ukr_trans_names.csv'))
male_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'male_ru_ukr_trans_names.csv'))

# Language code -> [female names, male names]; get_user_gender relies on this order.
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "en": [female_ru_ukr_trans_names, male_ru_ukr_trans_names],
    "ua": [female_ukr_names, male_ukr_names]
}

members_statistics_df = pd.read_csv('data/members_statistics.csv')
for new_column in ('first_name', 'last_name', 'username', 'gender'):
    members_statistics_df[new_column] = ''

# Does not depend on the member, so it is read once rather than once per row.
general_dialogs_df = pd.read_csv(GENERAL_DIALOGS_DF)

# For every member: copy the names from the dialog metadata and try to derive
# a gender from the first name, then the last name, then from verb forms.
for index, row in members_statistics_df.iterrows():
    user_id = row.user_id

    user_general_df = general_dialogs_df.loc[general_dialogs_df['from_id'] == user_id]
    if user_general_df.empty:
        # No message of this user: there is no dialog id to look the metadata up with
        print(f'{user_id} has no messages in {GENERAL_DIALOGS_DF}')
        continue

    # The dialog of the user's first message decides which metadata file is read
    dialog_id = str(user_general_df['dialog ID'].iloc[0])
    try:
        with open(os.path.join(META_FOLDER, dialog_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        print(f'{dialog_id} not found in {META_FOLDER}')
        continue

    user_gender = ''
    for user in meta_dialog_data['users']:
        if user['user_id'] != user_id:
            continue

        # DataFrame.set_value no longer exists in current pandas; .at is the
        # label-based scalar setter that replaces it.
        if user['first_name'] is not None:
            members_statistics_df.at[index, 'first_name'] = user['first_name']
            user_gender = get_user_gender(user['first_name'], name_dicts)

        if user['last_name'] is not None:
            members_statistics_df.at[index, 'last_name'] = user['last_name']
            if user_gender == '':
                user_gender = get_user_gender(user['last_name'], name_dicts)

        if user['username'] is not None:
            members_statistics_df.at[index, 'username'] = user['username']
            if user_gender == '':
                if user['username'][-3:] == 'bot':
                    print(f'{user["username"]} is bot and we do not analyse it to get gender')
                    continue

                user_gender = get_gender_by_verb(user['user_id'], int(dialog_id), user_general_df)
                print(f'Final gender is {user_gender}')

        if user_gender != '':
            members_statistics_df.at[index, 'gender'] = user_gender

        break

# Identity columns first, every other column after them in its existing order
cols = ['user_id', 'first_name', "last_name", "username", "gender"]

rest_cols = [col for col in members_statistics_df.columns if col not in cols]

cols = cols + rest_cols
members_statistics_df = members_statistics_df[cols]
members_statistics_df



first_name назар
назар gender is male

first_name діма
діма gender is male

first_name solomiya
solomiya gender is female



Unnamed: 0,user_id,first_name,last_name,username,gender,weekday,sleep_bounds,my_dialog_active_minutes_in_date_range,friend_dialog_active_minutes_in_date_range
0,347963763,Назар Поночевний,,NazarPonochevnyi,male,Monday,23:30-07:30,482,450
1,138918380,Діма,Лопушанський,dmytrolopushanskyy,male,Monday,22:00-07:30,0,0
2,386414449,Solomiya,Lenio,sol4ik,female,Saturday,22:00-07:30,0,0


In [None]:
import pymorphy2

# Scratch experiment: print the part of speech and the grammatical gender that
# pymorphy2 assigns to a few past-tense verb forms.
morph = pymorphy2.MorphAnalyzer()

ukr_words = [
    'зробив', 'робив', 'бачив', 'сказав', 'розклав', 'був', 'чув',
    'приніс', 'доніс', 'з\'їв', 'всунув', 'розм\'як',
]

# Russian counterparts of the words above; not used by the loop below.
ru_words = [
    'сделал', 'делал', 'видел', 'сказал', 'расклал', 'был', 'слышал',
    'принес', 'донес', 'съел', 'всунул', 'размяк',
]

for word in ukr_words:
    print('\n' + word)
    first_tag = morph.tag(word)[0]
    print(first_tag.POS)
    print(first_tag.gender)


In [None]:
female_ukr_names = pd.read_csv(os.path.join('dicts', 'female_ukrainian_names.csv'))
male_ukr_names = pd.read_csv(os.path.join('dicts', 'male_ukrainian_names.csv'))
female_ru_names = pd.read_csv(os.path.join('dicts', 'female_russian_names.csv'))
male_ru_names = pd.read_csv(os.path.join('dicts', 'male_russian_names.csv'))

female_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'female_ru_ukr_trans_names.csv'))
male_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'male_ru_ukr_trans_names.csv'))

# Language code -> [female names, male names]; get_user_gender relies on this order.
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "en": [female_ru_ukr_trans_names, male_ru_ukr_trans_names],
    "ua": [female_ukr_names, male_ukr_names]
}

# Needed for the third argument of get_gender_by_verb; read once for all dialogs.
general_dialogs_df = pd.read_csv(GENERAL_DIALOGS_DF)

# Walk over every prepared dialog and print the gender guessed for each of its users.
for filename in os.listdir(PREPARED_FOLDER):
    print("\n\n\nfilename", filename)
    # Only files whose second character is a digit are processed; the slice
    # keeps one-character names from raising IndexError.
    if not filename[1:2].isdigit():
        continue

    dialog_data = pd.read_csv(os.path.join(PREPARED_FOLDER, filename))
    if dialog_data.empty or 'channel' in str(dialog_data['to_id'].iloc[0]):
        continue

    dialog_id = filename[:-4]
    try:
        with open(os.path.join(META_FOLDER, dialog_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        print(f'{dialog_id} not found in {META_FOLDER}')
        continue

    for user in meta_dialog_data['users']:
        # Reset per user: otherwise a user without names inherits the gender
        # found for the previous user and the verb analysis is skipped.
        user_gender = ''

        if user['first_name'] is not None:
            user_gender = get_user_gender(user['first_name'], name_dicts)

        if user['last_name'] is not None and user_gender == '':
            user_gender = get_user_gender(user['last_name'], name_dicts)

        if user_gender == '':
            if user['username'] is not None:
                if user['username'][-3:] == 'bot':
                    print(f'{user["username"]} is bot and we do not analyse it to get gender')
                    continue

            # get_gender_by_verb requires the user's rows of the general
            # dialogs table as its third argument; calling it with two
            # arguments raises TypeError.
            user_general_df = general_dialogs_df.loc[general_dialogs_df['from_id'] == user['user_id']]
            user_gender = get_gender_by_verb(user['user_id'], int(dialog_id), user_general_df)
            print(f'Final gender is {user_gender}')


In [None]:
# Scratch experiment: print the part of speech and the grammatical gender that
# pymorphy2 assigns to a few past-tense verb forms.
# NOTE(review): this cell is an exact duplicate of an earlier cell in this
# notebook - consider removing one of them.
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

ukr_words = ['зробив', 'робив', 'бачив',
             'сказав', 'розклав',
             'був', 'чув',
             'приніс', 'доніс',
             'з\'їв', 'всунув', 'розм\'як']

# Russian counterparts of the words above; not used by the loop below.
ru_words = ['сделал', 'делал',
            'видел',
             'сказал', 'расклал',
             'был', 'слышал',
             'принес', 'донес',
             'съел', 'всунул', 'размяк']

# Only the first tag returned for each word is inspected.
for word in ukr_words:
    print('\n' + word)
    print(morph.tag(word)[0].POS)
    print(morph.tag(word)[0].gender)
