In [1]:
import copy
import glob
import json
import logging
import os
import re

import pandas as pd

import gender_guesser.detector as gender

import ipywidgets as widgets
import altair as alt

from utils.text_data_transformation import url_to_domain, word_to_num, delete_special_characters, transform_raw_data
from utils.dialog_manipulation import add_subdialogs_ids, add_reply_time, detect_data_language, \
    get_user_step_msgs, if_name_in_dict

In [2]:
# Set up: filesystem locations used by the cells below.
# NOTE(review): DIALOGS_FOLDER and RESULT_FOLDER are not referenced by any cell
# visible in this notebook - confirm they are still needed.
DIALOGS_FOLDER = "data/new_type_dialogs/"
RESULT_FOLDER = "data/stats/"
# Folder whose *.csv files are globbed into `dialogs` and listed by the
# per-dialog gender cell.
PREPARED_FOLDER = 'data/new_type_dialogs_prepared/'
# Folder holding one `<dialog_id>.json` metadata file per dialog; always
# combined through os.path.join, so the missing trailing slash is harmless.
META_FOLDER = 'data/new_type_dialogs_meta2'
# CSV read for its 'from_id' and 'dialog ID' columns and passed to the
# language-detection / message-sampling helpers.
GENERAL_DIALOGS_DF = 'data/processed_dialog_files2/general_df.csv'

In [None]:
# Initialisation
# Note: the prepared dialog CSVs are looked up under PREPARED_FOLDER.

dialogs = glob.glob(f'{PREPARED_FOLDER}/*.csv')
AVERAGE_STATS = {'word_count', 'msg_len', 'reply_time'}
user_stats = {}

# Template for one bucket of message statistics; it is always deep-copied so
# the lists are never shared between buckets.
basic_data = dict(msg_count=1, word_count=[], msg_len=[], reply_time=[])

# Per-user layout: an overall bucket, one bucket per hour of the day, and the
# same pair again for every weekday.
user_data = {
    'basic_data': copy.deepcopy(basic_data),
    'hours_data': {hour: copy.deepcopy(basic_data) for hour in range(24)},
    'weeks_data': {
        weekday: {
            'basic_data': copy.deepcopy(basic_data),
            'hours_data': {hour: copy.deepcopy(basic_data) for hour in range(24)},
        }
        for weekday in ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                        'Friday', 'Saturday', 'Sunday')
    },
}


def prepare_messages(data: pd.DataFrame) -> None:
    """
    Normalise every entry of the "message" column of ``data`` in place.

    Each message is split on whitespace. A token for which
    ``url_to_domain(token, check=True)`` is truthy is replaced by
    ``url_to_domain(token)``; every other token is passed through
    ``word_to_num`` and then ``delete_special_characters``. The tokens are
    re-joined with single spaces and runs of whitespace are collapsed.
    """
    def _normalise(token):
        if url_to_domain(token, check=True):
            return url_to_domain(token)
        return delete_special_characters(word_to_num(token))

    for row_label, record in data.iterrows():
        rebuilt = ' '.join(_normalise(token) for token in str(record['message']).split())
        data.loc[row_label, 'message'] = re.sub(r'\s\s+', ' ', rebuilt)


def prepare_dialog(dialog_file: str, prep_folder: str) -> None:
    """
    Read a raw dialog CSV and write a prepared copy into ``prep_folder``.

    The frame is passed, in order, through ``prepare_messages``,
    ``add_reply_time`` and ``add_subdialogs_ids`` and then saved under the
    same file name as the input.

    :param dialog_file: path of the raw dialog CSV.
    :param prep_folder: destination folder, with or without a trailing
        separator.
    """
    # basename/join instead of split('/') and string concatenation: works with
    # either path separator and with a folder that lacks a trailing slash.
    file_name = os.path.basename(dialog_file)
    data = pd.read_csv(dialog_file)

    prepare_messages(data)
    add_reply_time(data)
    add_subdialogs_ids(data)

    # The index is still written as the first column, as before.
    data.to_csv(os.path.join(prep_folder, file_name))


def get_stats(row: pd.Series) -> dict:
    """
    Build the statistics bucket for a single message row.

    :param row: one row of a dialog frame; its 'message' and
        'reply_btw_sender_time' entries are read.
    :return: a fresh deep copy of ``basic_data`` with ``msg_count`` set to 1,
        the word count and character length of the message appended, and the
        reply time appended only when it is present and non-zero.
    """
    message = str(row['message'])

    stats = copy.deepcopy(basic_data)
    stats['msg_count'] = 1
    stats['word_count'].append(len(message.split()))
    stats['msg_len'].append(len(message))

    reply_time = row['reply_btw_sender_time']
    # NaN is truthy, so a bare truthiness test would record missing reply
    # times; zero/empty values are still skipped, as before.
    if pd.notna(reply_time) and reply_time:
        stats['reply_time'].append(reply_time)
    return stats

# Dialogs CSV that the path-check cell loads into `df`; the
# add_stats_data(df, DATAFRAME_PATH) call later writes back to the same file.
DATAFRAME_PATH = "data/processed_dialog_files/general_dialogs_sentiment.csv"
# Destination CSV of the add_sleep_data(df, USER_DATAFRAME_PATH) call.
USER_DATAFRAME_PATH = 'data/processed_dialog_files/user_stats.csv'
# Remove pandas' row limit so displayed frames are rendered in full.
pd.set_option('display.max_rows', None)

In [None]:
# Checking the path

if os.path.isfile(DATAFRAME_PATH):
    # Load the frame and normalise the dialog id column name in one step.
    df = pd.read_csv(DATAFRAME_PATH).rename(columns={'dialog ID': 'dialog_id'})
else:
    logging.error(f'No Dataframe associated with {DATAFRAME_PATH}')

In [None]:
# Initialisation

def add_sleep_data(data: pd.DataFrame, user_df_path, save=True) -> pd.DataFrame:
    """
    Build the per-user sleep data frame from the dialog messages.

    :param data: dialog frame handed to ``add_sleep_bounds``.
    :param user_df_path: CSV path the result is written to when ``save`` is
        true.
    :param save: write the result to ``user_df_path`` (without the index).
    :return: the new dataframe built from the ``add_sleep_bounds`` result.
    """
    gdf = pd.DataFrame(add_sleep_bounds(data))
    if save:
        gdf.to_csv(user_df_path, index=False)
    # The documented contract is to return the new frame; it used to be dropped.
    return gdf

def add_stats_data(data: pd.DataFrame, df_path, save=True) -> pd.DataFrame:
    """
    Attach the sub-dialog mean statistics to ``data``.

    The columns 'words_num_mean', 'reply_time_mean' and 'message_number_mean'
    are copied from the ``add_subdialogs_stats`` result onto ``data`` itself
    (the frame is modified in place).

    :param data: dialog frame to extend.
    :param df_path: CSV path ``data`` is written to when ``save`` is true.
    :param save: write the extended frame to ``df_path`` (without the index).
    :return: ``data``, i.e. the same object with the three columns added.
    """
    adf = add_subdialogs_stats(data)
    for column in ('words_num_mean', 'reply_time_mean', 'message_number_mean'):
        data[column] = adf[column]
    if save:
        data.to_csv(df_path, index=False)
    # The documented contract is to return the frame; it used to be dropped.
    return data

In [None]:
# Aggregating data
# Adds the three sub-dialog mean columns to `df` and, because `save` defaults
# to True, writes the frame to DATAFRAME_PATH - the same file `df` was read
# from, so the source CSV is overwritten.
add_stats_data(df, DATAFRAME_PATH)

In [None]:
# User stats
# Builds the sleep data frame from `df` and, with the default save=True,
# writes it to USER_DATAFRAME_PATH.
add_sleep_data(df, USER_DATAFRAME_PATH)

In [3]:
def get_user_gender(username, name_dicts):
    """
    Guess a gender from the first word of a display name.

    The first whitespace-separated word is lower-cased, its language is
    detected with ``detect_data_language`` and the word is looked up in the
    name dictionaries registered for that language. For English, when the
    dictionaries give no answer, ``gender_guesser`` is consulted.

    :param username: name string (first name, last name, ...).
    :param name_dicts: mapping ``language -> [female_names_df, male_names_df]``;
        every frame has a 'name' column.
    :return: 'female' / 'male' from the dictionaries, whatever
        ``gender_guesser`` returns other than 'unknown' for English names, or
        '' when nothing could be determined.
    """
    name_parts = username.strip().split()
    if not name_parts:
        # A blank name has no first word; indexing [0] used to raise IndexError.
        return ''

    first_name = name_parts[0].lower()
    print('first_name', first_name)

    word_lang = detect_data_language(pd.DataFrame({'message': [first_name]}), 'one_word')

    user_gender = ''
    # A language without registered dictionaries is skipped instead of raising
    # KeyError. Index 0 is the female dictionary, any later one counts as male.
    for num_df, names_df in enumerate(name_dicts.get(word_lang, [])):
        listed = (names_df['name'].str.lower() == first_name).any()
        if listed or if_name_in_dict(first_name, names_df):
            user_gender = 'female' if num_df == 0 else 'male'
            print(f"{first_name} gender is {user_gender}\n")
            break

    if user_gender == '' and word_lang == 'en':
        # NOTE(review): only 'unknown' is mapped to ''; any other label the
        # detector returns is passed through unchanged - confirm that
        # downstream code expects that.
        gender_detector = gender.Detector()
        user_gender = gender_detector.get_gender(first_name.capitalize())
        if user_gender == 'unknown':
            user_gender = ''
        print(f"{first_name} gender is {user_gender}\n")

    return user_gender


In [4]:
import pymorphy2
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Shared pymorphy2 analyzer; get_gender_by_verb uses it to POS-tag words in its
# 'ua' branch.
# NOTE(review): it is built with pymorphy2's default language while a later
# cell installs pymorphy2-dicts-uk - confirm whether lang='uk' was intended.
morph = pymorphy2.MorphAnalyzer()


def get_gender_by_verb(user_id, dialog_id, user_general_df):
    """
    Guess a user's gender from the endings of the verbs they use.

    The language is detected with ``detect_data_language``; English is not
    analysed. Messages obtained from ``get_user_step_msgs`` are cleaned with
    ``transform_raw_data`` and split into words. A word longer than two
    characters that ends in 'ла' and is tagged as a verb counts as a feminine
    vote; one ending in 'в'/'к'/'с' (Ukrainian) or 'л'/'к'/'с' (Russian) counts
    as a masculine vote.

    :param user_id: id of the user whose messages are analysed.
    :param dialog_id: id of the dialog the messages are taken from.
    :param user_general_df: the user's slice of the general dialogs frame,
        forwarded to ``get_user_step_msgs``.
    :return: 'male' or 'female' for a strict majority of votes, '' for
        English, for a tie, or when no vote was cast.
    """
    lang = detect_data_language('', 'df_loc', GENERAL_DIALOGS_DF, dialog_id, user_id)
    if lang == 'en':
        return ''

    # The natasha models are loop-invariant: build them once per call instead
    # of once per word.
    ru_segmenter = ru_tagger = None
    if lang == 'ru':
        ru_segmenter = Segmenter()
        ru_tagger = NewsMorphTagger(NewsEmbedding())

    female_gender, male_gender = 0, 0
    dialog_step_msgs = get_user_step_msgs(GENERAL_DIALOGS_DF, dialog_id, user_id, 100, user_general_df)

    for msg in dialog_step_msgs:
        msg = transform_raw_data(msg, lang, '', '', 'without_lemma')
        for word in msg.split():
            word = word.strip()
            print(word)
            if len(word) <= 2:
                continue

            if lang == 'ua':
                if word[-2:] == 'ла' and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    female_gender += 1

                elif word[-1] in ('в', 'к', 'с') and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    male_gender += 1

            elif lang == 'ru':
                feminine = word[-2:] == 'ла'
                masculine = not feminine and word[-1] in ('л', 'к', 'с')
                if not (feminine or masculine):
                    # No relevant ending: the tagger result could not matter.
                    continue

                doc = Doc(word)
                doc.segment(ru_segmenter)
                doc.tag_morph(ru_tagger)
                # Guard against a segmentation that yields no tokens.
                if not doc.tokens or doc.tokens[0].pos not in ('VERB', 'AUX'):
                    continue

                print('ru word', word)
                if feminine:
                    female_gender += 1
                else:
                    male_gender += 1

    if male_gender > female_gender:
        return 'male'
    if male_gender < female_gender:
        return 'female'
    return ''

In [None]:
!pip install -U pymorphy2-dicts-uk
!pip install -U pymorphy2-dicts-ru

In [9]:
# Name dictionaries used by get_user_gender for the name-based lookup.
female_ukr_names = pd.read_csv(os.path.join('dicts', 'female_ukrainian_names.csv'))
male_ukr_names = pd.read_csv(os.path.join('dicts', 'male_ukrainian_names.csv'))
female_ru_names = pd.read_csv(os.path.join('dicts', 'female_russian_names.csv'))
male_ru_names = pd.read_csv(os.path.join('dicts', 'male_russian_names.csv'))

female_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'female_ru_ukr_trans_names.csv'))
male_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'male_ru_ukr_trans_names.csv'))

# Per language: [female names, male names] - get_user_gender relies on this order.
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "en": [female_ru_ukr_trans_names, male_ru_ukr_trans_names],
    "ua": [female_ukr_names, male_ukr_names]
}

# One row per member; the four profile columns start empty and are filled below.
members_statistics_df = pd.read_csv('data/members_statistics.csv').assign(
    first_name='', last_name='', username='', gender='')

# Loop-invariant: read the general dialogs frame once, not once per member.
general_dialogs_df = pd.read_csv(GENERAL_DIALOGS_DF)

for index, row in members_statistics_df.iterrows():
    user_id = row.user_id

    user_general_df = general_dialogs_df.loc[general_dialogs_df['from_id'] == user_id]
    if user_general_df.empty:
        # No message from this member: there is no dialog to look metadata up in.
        print(f'{user_id} has no messages in {GENERAL_DIALOGS_DF}')
        continue

    # The member's first message decides which dialog's metadata file is used.
    dialog_id = str(user_general_df['dialog ID'].iloc[0])
    try:
        with open(os.path.join(META_FOLDER, dialog_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        print(f'{dialog_id} not found in {META_FOLDER}')
        continue

    # First metadata entry that belongs to this member, if any.
    user = next((u for u in meta_dialog_data['users'] if u['user_id'] == user_id), None)
    if user is None:
        continue

    user_gender = ''
    # DataFrame.set_value no longer exists in current pandas; .at is the
    # scalar setter.
    if user['first_name'] is not None:
        members_statistics_df.at[index, 'first_name'] = user['first_name']
        user_gender = get_user_gender(user['first_name'], name_dicts)

    if user['last_name'] is not None:
        members_statistics_df.at[index, 'last_name'] = user['last_name']
        if user_gender == '':
            user_gender = get_user_gender(user['last_name'], name_dicts)

    if user['username'] is not None:
        members_statistics_df.at[index, 'username'] = user['username']
        if user_gender == '':
            if user['username'][-3:] == 'bot':
                print(f'{user["username"]} is bot and we do not analyse it to get gender')
                continue

            # NOTE(review): the verb-based fallback only runs for members that
            # have a username - confirm this nesting is intended.
            user_gender = get_gender_by_verb(user['user_id'], int(dialog_id), user_general_df)
            print(f'Final gender is {user_gender}')

    if user_gender != '':
        members_statistics_df.at[index, 'gender'] = user_gender

# Put the identity columns first, keep the remaining columns in their order.
cols = ['user_id', 'first_name', "last_name", "username", "gender"]

rest_cols = [col for col in members_statistics_df.columns if col not in cols]

cols = cols + rest_cols
members_statistics_df = members_statistics_df[cols]
members_statistics_df



first_name назар
назар gender is male

first_name діма
діма gender is male

first_name solomiya
solomiya gender is female



Unnamed: 0,user_id,first_name,last_name,username,gender,weekday,sleep_bounds,my_dialog_active_minutes_in_date_range,friend_dialog_active_minutes_in_date_range
0,347963763,Назар Поночевний,,NazarPonochevnyi,male,Monday,23:30-07:30,482,450
1,138918380,Діма,Лопушанський,dmytrolopushanskyy,male,Monday,22:00-07:30,0,0
2,386414449,Solomiya,Lenio,sol4ik,female,Saturday,22:00-07:30,0,0


In [None]:
import pymorphy2

# Scratch check: print the part of speech and gender that the first pymorphy2
# tag assigns to a handful of past-tense verbs.
morph = pymorphy2.MorphAnalyzer()

ukr_words = [
    'зробив', 'робив', 'бачив',
    'сказав', 'розклав',
    'був', 'чув',
    'приніс', 'доніс',
    'з\'їв', 'всунув', 'розм\'як',
]

ru_words = [
    'сделал', 'делал',
    'видел',
    'сказал', 'расклал',
    'был', 'слышал',
    'принес', 'донес',
    'съел', 'всунул', 'размяк',
]

# Only the Ukrainian list is exercised here.
for word in ukr_words:
    print('\n' + word, morph.tag(word)[0].POS, morph.tag(word)[0].gender, sep='\n')


In [None]:
# Per-dialog pass: for every prepared dialog file, print the gender guessed for
# each user listed in the dialog's metadata.
female_ukr_names = pd.read_csv(os.path.join('dicts', 'female_ukrainian_names.csv'))
male_ukr_names = pd.read_csv(os.path.join('dicts', 'male_ukrainian_names.csv'))
female_ru_names = pd.read_csv(os.path.join('dicts', 'female_russian_names.csv'))
male_ru_names = pd.read_csv(os.path.join('dicts', 'male_russian_names.csv'))

female_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'female_ru_ukr_trans_names.csv'))
male_ru_ukr_trans_names = pd.read_csv(os.path.join('dicts', 'male_ru_ukr_trans_names.csv'))

# Per language: [female names, male names] - get_user_gender relies on this order.
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "en": [female_ru_ukr_trans_names, male_ru_ukr_trans_names],
    "ua": [female_ukr_names, male_ukr_names]
}

# get_gender_by_verb takes the user's slice of the general dialogs frame as its
# third argument, so the frame is loaded once up front.
general_dialogs_df = pd.read_csv(GENERAL_DIALOGS_DF)

for filename in os.listdir(PREPARED_FOLDER):
    print("\n\n\nfilename", filename)
    # The second character must be a digit; slicing keeps one-character names
    # from raising IndexError.
    if not filename[1:2].isdigit():
        continue

    dialog_data = pd.read_csv(os.path.join(PREPARED_FOLDER, filename))
    if dialog_data.empty or 'channel' in str(dialog_data['to_id'].iloc[0]):
        continue

    # File names are '<dialog_id>.csv'.
    dialog_id = filename[:-4]
    try:
        with open(os.path.join(META_FOLDER, dialog_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        print(f'{dialog_id} not found in {META_FOLDER}')
        continue

    for user in meta_dialog_data['users']:
        # Reset per user so a previous user's result cannot leak into this one.
        user_gender = ''

        if user['first_name'] is not None:
            user_gender = get_user_gender(user['first_name'], name_dicts)

        if user['last_name'] is not None and user_gender == '':
            user_gender = get_user_gender(user['last_name'], name_dicts)

        if user_gender == '':
            if user['username'] is not None and user['username'][-3:] == 'bot':
                print(f'{user["username"]} is bot and we do not analyse it to get gender')
                continue

            user_general_df = general_dialogs_df.loc[general_dialogs_df['from_id'] == user['user_id']]
            user_gender = get_gender_by_verb(user['user_id'], int(dialog_id), user_general_df)
            print(f'Final gender is {user_gender}')


In [None]:
import pymorphy2

# Scratch check (this cell repeats an earlier cell of the notebook verbatim):
# print the part of speech and gender that the first pymorphy2 tag assigns to a
# handful of past-tense verbs.
morph = pymorphy2.MorphAnalyzer()

ukr_words = [
    'зробив', 'робив', 'бачив',
    'сказав', 'розклав',
    'був', 'чув',
    'приніс', 'доніс',
    'з\'їв', 'всунув', 'розм\'як',
]

ru_words = [
    'сделал', 'делал',
    'видел',
    'сказал', 'расклал',
    'был', 'слышал',
    'принес', 'донес',
    'съел', 'всунул', 'размяк',
]

# Only the Ukrainian list is exercised here.
for word in ukr_words:
    print('\n' + word, morph.tag(word)[0].POS, morph.tag(word)[0].gender, sep='\n')
