In [1]:
import json
import pandas as pd
import os
import logging

import gender_guesser.detector as gender
from config import PATH_TO_DIALOGS_META, PATH_TO_SAVE_PROCESSED_FILES, PATH_TO_SAVE_GENERAL_DF, \
    USER_PATH_TO_SAVE_GENERAL_DF

from utils.text_data_transformation import transform_raw_data
from utils.dialog_manipulation import detect_data_language, \
    get_user_step_msgs, if_name_in_dict, add_sleep_bounds, add_subdialogs_stats

In [2]:
# Set up

# pd.set_option('display.max_rows', None)

In [3]:
# Checking the path

if not os.path.isfile(PATH_TO_SAVE_GENERAL_DF):
    # Fail fast: every later cell needs `df`, so merely logging here would only
    # defer the failure to a much less informative NameError further down.
    logging.error(f'No Dataframe associated with {PATH_TO_SAVE_GENERAL_DF}')
    raise FileNotFoundError(f'No Dataframe associated with {PATH_TO_SAVE_GENERAL_DF}')

# General dialogs dataframe consumed by the aggregation cells below.
df = pd.read_csv(PATH_TO_SAVE_GENERAL_DF)
# df = df.rename(columns={'dialog ID': 'dialog_id'})

In [4]:
# Initialisation
def add_sleep_data(data: pd.DataFrame, user_df_path, save=True):
    """
    Build the sleep-bounds dataframe for the given dialogs data.

    The result of ``add_sleep_bounds(data)`` is wrapped in a new DataFrame and,
    when ``save`` is true, written to ``user_df_path`` as CSV without the index.

    :param data: dialogs dataframe handed to ``add_sleep_bounds``
    :param user_df_path: CSV path the result is written to when ``save`` is true
    :param save: whether to persist the result to ``user_df_path``
    :return: the newly built dataframe
    """
    gdf = pd.DataFrame(add_sleep_bounds(data))
    if save:
        gdf.to_csv(user_df_path, index=False)
    # Return the frame as the docstring always promised; previously the result
    # was silently discarded whenever ``save`` was False.
    return gdf

def add_stats_data(data: pd.DataFrame, df_path, save=True):
    """
    Attach the mean subdialog statistics to ``data``.

    Copies the ``words_num_mean``, ``reply_time_mean`` and
    ``message_number_mean`` columns produced by ``add_subdialogs_stats(data)``
    onto ``data`` and, when ``save`` is true, writes ``data`` to ``df_path`` as
    CSV without the index.

    Note that ``data`` is modified in place; the returned object is that same
    dataframe, not a copy.

    :param data: dialogs dataframe to enrich
    :param df_path: CSV path ``data`` is written to when ``save`` is true
    :param save: whether to persist the enriched dataframe to ``df_path``
    :return: ``data`` with the three mean columns added
    """
    adf = add_subdialogs_stats(data)
    for stat_col in ('words_num_mean', 'reply_time_mean', 'message_number_mean'):
        data[stat_col] = adf[stat_col]
    if save:
        data.to_csv(df_path, index=False)
    # Return the enriched frame so the documented contract ("return new
    # dataframe") holds and the function is usable with save=False.
    return data


In [5]:
# Aggregating data: add the mean subdialog statistics columns to `df` and write
# the result back to PATH_TO_SAVE_GENERAL_DF (save defaults to True).
# Requires `df` from the path-check cell above.
add_stats_data(df, PATH_TO_SAVE_GENERAL_DF)

In [6]:
# User stats: build the dataframe returned by add_sleep_bounds for `df` and save
# it to USER_PATH_TO_SAVE_GENERAL_DF (save defaults to True). That file is read
# back later in this notebook as `members_statistics_df`.
add_sleep_data(df, USER_PATH_TO_SAVE_GENERAL_DF)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sleep_end'] = data.apply(lambda x: datetime.datetime.strptime(x["date"][:19], time_format).hour, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sleep_start'] = data.apply(lambda x: (datetime.datetime.strptime(x["date"][:19], time_format) - datetime.timedelta(seconds=x['reply_btw_sender_time'])).hour, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [7]:
_GENDER_DETECTOR = None  # lazily built gender_guesser detector shared by all lookups


def _get_gender_detector():
    """Return one shared ``gender.Detector`` instead of building a new one per lookup."""
    global _GENDER_DETECTOR
    if _GENDER_DETECTOR is None:
        _GENDER_DETECTOR = gender.Detector()
    return _GENDER_DETECTOR


def get_user_gender(username, name_dicts):
    """
    Guess a user's gender from the first word of ``username``.

    The first whitespace-separated token is lower-cased, its language is
    detected with ``detect_data_language`` and the token is looked up in the
    name dictionaries registered for that language. For English tokens that are
    not found, ``gender_guesser`` is used as a fallback.

    :param username: name string to analyse (only its first word is used)
    :param name_dicts: mapping ``language -> [female_names_df, male_names_df]``;
        each dataframe must have a ``name`` column
    :return: ``'female'``, ``'male'`` or ``''`` when the gender is undetermined
    """
    user_gender = ''

    name_parts = username.strip().split()
    if not name_parts:
        # Empty or whitespace-only name: nothing to analyse (indexing [0] would raise).
        return user_gender

    first_name = name_parts[0].lower()
    print('first_name', first_name)

    new_df = pd.DataFrame({'message': [first_name]})
    word_lang = detect_data_language(new_df, 'one_word')

    # .get(): a detected language without registered dictionaries simply skips
    # the dictionary lookup instead of raising KeyError.
    for num_df, names_df in enumerate(name_dicts.get(word_lang, [])):
        exact_match = not names_df.loc[names_df['name'].str.lower() == first_name].empty
        if exact_match or if_name_in_dict(first_name, names_df):
            # By convention the female dictionary comes first, the male one second.
            user_gender = 'female' if num_df == 0 else 'male'
            print(f"{first_name} gender is {user_gender}\n")
            break

    if user_gender == '' and word_lang == 'en':
        guessed = _get_gender_detector().get_gender(first_name.capitalize())
        # gender_guesser may also answer 'unknown', 'andy', 'mostly_male' or
        # 'mostly_female'. Collapse the "mostly" answers onto the plain label
        # and treat everything else as undetermined so that only
        # 'male' / 'female' / '' ever reach the gender column.
        user_gender = {
            'male': 'male',
            'mostly_male': 'male',
            'female': 'female',
            'mostly_female': 'female',
        }.get(guessed, '')
        print(f"{first_name} gender is {user_gender}\n")

    return user_gender


In [8]:
import pymorphy2
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Shared pymorphy2 analyzer; get_gender_by_verb uses it for the 'ua' branch only.
# NOTE(review): it is created with default arguments although it only tags
# Ukrainian words — presumably it should be MorphAnalyzer(lang='uk') (the
# pymorphy2-dicts-uk package is installed in a later cell); confirm.
morph = pymorphy2.MorphAnalyzer()


def get_gender_by_verb(user_id, dialog_id, user_general_df):
    """
    Guess a user's gender from the verb endings used in their messages.

    The dialog language is detected first; English dialogs are skipped. For
    Ukrainian ('ua') and Russian ('ru') dialogs every word longer than two
    characters in the user's sampled messages is inspected: words tagged as
    verbs that end in ``'ла'`` count as female evidence, verbs ending in one of
    the male suffixes count as male evidence. The label with more evidence wins.

    :param user_id: id of the user whose messages are analysed
    :param dialog_id: id of the dialog the messages are taken from
    :param user_general_df: rows of the general dataframe belonging to the user,
        forwarded to ``get_user_step_msgs``
    :return: ``'male'``, ``'female'`` or ``''`` (English dialog, a tie, or no evidence)
    """
    lang = detect_data_language('', 'df_loc', PATH_TO_SAVE_GENERAL_DF, dialog_id, user_id)

    if lang == 'en':
        return ''

    female_gender, male_gender = 0, 0
    dialog_step_msgs = get_user_step_msgs(PATH_TO_SAVE_GENERAL_DF, dialog_id, user_id, 100, user_general_df)

    # natasha pipeline objects are built lazily on the first Russian word and
    # then reused. Constructing NewsEmbedding / NewsMorphTagger for every single
    # word (as before) repeated that setup once per word.
    segmenter = None
    morph_tagger = None

    for msg in dialog_step_msgs:
        msg = transform_raw_data(msg, lang, '', '', 'without_lemma')
        for word in msg.split():
            word = word.strip()
            print(word)
            if len(word) <= 2:
                continue

            if lang == 'ua':
                # Uses the module-level `morph` analyzer.
                if word[-2:] == 'ла' and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    female_gender += 1

                elif word[-1] in ('в', 'к', 'с') and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    male_gender += 1

            elif lang == 'ru':
                if morph_tagger is None:
                    segmenter = Segmenter()
                    morph_tagger = NewsMorphTagger(NewsEmbedding())

                doc = Doc(word)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)

                if not doc.tokens:
                    # Nothing was tokenised, so there is no tag to inspect.
                    continue
                word_pos = doc.tokens[0].pos

                if word[-2:] == 'ла' and word_pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    female_gender += 1

                elif word[-1] in ('л', 'к', 'с') and word_pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    male_gender += 1

    if male_gender > female_gender:
        return 'male'
    if male_gender < female_gender:
        return 'female'
    # Tie (including no evidence at all): gender stays undetermined.
    return ''

In [9]:
!pip install -U pymorphy2-dicts-uk
!pip install -U pymorphy2-dicts-ru

Requirement already up-to-date: pymorphy2-dicts-uk in d:\python\envs\venv_telegram_analysis\lib\site-packages (2.4.1.1.1460299261)
Requirement already up-to-date: pymorphy2-dicts-ru in d:\python\envs\venv_telegram_analysis\lib\site-packages (2.4.404381.4453942)


In [10]:
def _read_names_csv(file_name):
    """Load one name dictionary CSV from the local ``dicts`` directory."""
    return pd.read_csv(os.path.join('dicts', file_name))


# Native-script dictionaries: Ukrainian, then Russian.
female_ukr_names = _read_names_csv('female_ukrainian_names.csv')
male_ukr_names = _read_names_csv('male_ukrainian_names.csv')
female_ru_names = _read_names_csv('female_russian_names.csv')
male_ru_names = _read_names_csv('male_russian_names.csv')

# Dictionaries from the *_ru_ukr_trans_names files.
female_ru_ukr_trans_names = _read_names_csv('female_ru_ukr_trans_names.csv')
male_ru_ukr_trans_names = _read_names_csv('male_ru_ukr_trans_names.csv')

In [11]:
# Load the per-user statistics and reset the identity/gender columns to empty
# strings; they are filled in by the enrichment loop below.
members_statistics_df = pd.read_csv(USER_PATH_TO_SAVE_GENERAL_DF).assign(
    first_name='',
    last_name='',
    username='',
    gender='',
)

In [12]:
# Loop-invariant inputs, prepared once instead of once per user. Nothing in this
# cell writes to PATH_TO_SAVE_GENERAL_DF, so a single read is sufficient.
general_dialogs_df = pd.read_csv(PATH_TO_SAVE_GENERAL_DF)
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "en": [female_ru_ukr_trans_names, male_ru_ukr_trans_names],
    "ua": [female_ukr_names, male_ukr_names]
}
total_users = len(members_statistics_df)

# For every user: look up first name / last name / username in the metadata of
# one of their dialogs and try to derive a gender, first from the names and, as
# a last resort, from the verb forms used in their messages.
for index, row in members_statistics_df.iterrows():
    user_id = row.user_id
    # Report against the row count; index[-1] was one less than the real total.
    print(f'\n\n\n============{index + 1} users from {total_users} succeeded')

    user_general_df = general_dialogs_df.loc[general_dialogs_df['from_id'] == user_id]
    if user_general_df.empty:
        # No messages from this user: there is no dialog to take metadata from.
        print(f'\n\n\n{user_id} has no messages in {PATH_TO_SAVE_GENERAL_DF}')
        continue

    # The first dialog the user appears in supplies the metadata file.
    dialog_id = str(user_general_df['dialog ID'].iloc[0])

    try:
        with open(os.path.join(PATH_TO_DIALOGS_META, dialog_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        print(f'\n\n\n{dialog_id} not found in {PATH_TO_DIALOGS_META}')
        continue

    user_gender = ''
    for user in meta_dialog_data['users']:
        if user['user_id'] != user_id:
            continue

        if user['first_name'] is not None:
            members_statistics_df.at[index, 'first_name'] = user['first_name']
            user_gender = get_user_gender(user['first_name'], name_dicts)

        if user['last_name'] is not None:
            members_statistics_df.at[index, 'last_name'] = user['last_name']
            if user_gender == '':
                user_gender = get_user_gender(user['last_name'], name_dicts)

        if user['username'] is not None:
            members_statistics_df.at[index, 'username'] = user['username']
            if user_gender == '':
                if user['username'][-3:] == 'bot':
                    print(f'{user["username"]} is bot and we do not analyse it to get gender')
                    # The matching user is already found; stop scanning the
                    # remaining users (the former `continue` kept iterating).
                    break

                # NOTE(review): the verb-based fallback only runs for users that
                # have a username — confirm this restriction is intended.
                user_gender = get_gender_by_verb(user['user_id'], int(dialog_id), user_general_df)
                print(f'Final gender is {user_gender}')

        if user_gender != '':
            members_statistics_df.at[index, 'gender'] = user_gender

        break

# Put the identity columns first, keep every other column in its existing order.
cols = ['user_id', 'first_name', "last_name", "username", "gender"]

rest_cols = [col for col in members_statistics_df.columns if col not in cols]

cols = cols + rest_cols
members_statistics_df = members_statistics_df[cols]
members_statistics_df.to_csv(USER_PATH_TO_SAVE_GENERAL_DF, index=False)
members_statistics_df




first_name ivan
data.index[-1] 0
n_msgs_to_analyse 150
ivan gender is male




first_name yurii
data.index[-1] 0
n_msgs_to_analyse 150
yurii gender is male




first_name vitaliia
data.index[-1] 0
n_msgs_to_analyse 150
vitaliia gender is 

first_name ioffe
data.index[-1] 0
n_msgs_to_analyse 150
ioffe gender is 

вообще
технический
сейлз
представляет
Final gender is 



first_name azim
data.index[-1] 0
n_msgs_to_analyse 150
azim gender is male




first_name max.d
data.index[-1] 0
n_msgs_to_analyse 150
max.d gender is 

Final gender is 



first_name quartermaster
data.index[-1] 0
n_msgs_to_analyse 150
quartermaster gender is 

Final gender is 



first_name sawyer
data.index[-1] 0
n_msgs_to_analyse 150
sawyer gender is male




first_name eugene
data.index[-1] 0
n_msgs_to_analyse 150
eugene gender is male




first_name hex
data.index[-1] 0
n_msgs_to_analyse 150
hex gender is 

нашлись
книжки
ищу
Final gender is 



first_name eugene
data.index[-1] 0
n_msgs_to_analyse 150
eugene gen

  if n_row % msgs_step == 0:
  result = method(y)


KeyboardInterrupt: 