In [1]:
import pandas as pd
import datetime as dt
import json
import copy
import glob
import re
import os

import gender_guesser.detector as gender

import ipywidgets as widgets
import altair as alt

from utils.text_data_transformation import url_to_domain, word_to_num, delete_special_characters, transform_raw_data
from utils.dialog_manipulation import add_subdialogs_ids, add_reply_time, detect_data_language, if_name_in_ukr_dict, \
    get_user_step_msgs

In [2]:
# Set up: every path used by the notebook, relative to the working directory.
# PREPARED_FOLDER and RESULT_FOLDER are concatenated directly with file names
# further down, so their trailing "/" must stay.
DIALOGS_FOLDER = 'data/new_type_dialogs/'            # raw dialog CSV files
PREPARED_FOLDER = 'data/new_type_dialogs_prepared/'  # prepared copies of the raw CSV files
RESULT_FOLDER = 'data/stats/'                        # aggregated statistics (JSON)
META_FOLDER = 'data/new_type_dialogs_meta'           # base name of the metadata folder (no trailing "/")
GENERAL_DIALOGS_DF = 'data/processed_dialog_files2/general_df.csv'  # CSV passed to the language/verb helpers

In [None]:
# Initialisation
# Note: prepared dialogs are read from PREPARED_FOLDER. `dialogs` is a snapshot
# taken when this cell runs; files prepared afterwards are not part of it.

dialogs = glob.glob(PREPARED_FOLDER + '/*.csv')
# Keys whose per-message samples calculate_average() collapses into one mean value.
AVERAGE_STATS = {'word_count', 'msg_len', 'reply_time'}
# dialog name -> user id -> statistics tree shaped like `user_data`.
user_stats = {}
# Empty accumulator for one bucket. The counter starts at 0 because every message
# contributes its own `msg_count` of 1 (see get_stats); starting at 1 over-counted
# each bucket by one and made hours without any message report one message.
basic_data = {'msg_count': 0,
              'word_count': [],
              'msg_len': [],
              'reply_time': []}
# Template of the per-user statistics: overall totals, totals per hour of the day
# (0-23) and, for each weekday name, totals plus the same per-hour breakdown.
user_data = {
    'basic_data': copy.deepcopy(basic_data),
    'hours_data': {hour: copy.deepcopy(basic_data) for hour in range(24)},
    'weeks_data': {day: {'basic_data': copy.deepcopy(basic_data),
                         'hours_data': {hour: copy.deepcopy(basic_data) for hour in range(24)}
                         }
                   for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                               'Friday', 'Saturday', 'Sunday']
                   }
}


def prepare_messages(data: pd.DataFrame) -> None:
    """
    Rewrites every value of the "message" column in place.

    Each whitespace-separated token is replaced by ``url_to_domain(token)`` when
    ``url_to_domain(token, check=True)`` is truthy, otherwise by
    ``delete_special_characters(word_to_num(token))``. The results are joined
    with single spaces and any run of two or more whitespace characters is
    replaced with one space.
    """
    def clean_token(token):
        # The URL check decides which of the two transformations applies.
        if url_to_domain(token, check=True):
            return url_to_domain(token)
        return delete_special_characters(word_to_num(token))

    for row_label, record in data.iterrows():
        tokens = str(record['message']).split()
        joined = ' '.join([clean_token(token) for token in tokens])
        data.loc[row_label, 'message'] = re.sub(r'\s\s+', ' ', joined)


def prepare_dialog(dialog_file: str, prep_folder: str) -> None:
    """
    Reads one raw dialog CSV and writes its prepared copy.

    The frame is passed through prepare_messages, add_reply_time and
    add_subdialogs_ids (their return values are ignored, so they are expected
    to modify the frame in place) and is then saved under the same file name
    inside ``prep_folder``.

    :param dialog_file: path of the raw CSV file.
    :param prep_folder: folder that receives the prepared file; it may be given
        with or without a trailing path separator.
    """
    # basename/join instead of split('/') and string concatenation: the output
    # path is correct for any separator style and for folders without a
    # trailing "/".
    file_name = os.path.basename(dialog_file)
    data = pd.read_csv(dialog_file)

    prepare_messages(data)
    add_reply_time(data)
    add_subdialogs_ids(data)

    data.to_csv(os.path.join(prep_folder, file_name))


def get_stats(row: pd.Series) -> dict:
    """
    Builds the statistics contributed by a single message.

    :param row: one row of a prepared dialog; the "message" and
        "reply_btw_sender_time" entries are read.
    :return: a fresh copy of ``basic_data`` with ``msg_count`` set to 1 and one
        sample appended to ``word_count`` and ``msg_len``. ``reply_time`` gets a
        sample only when the reply time is present and non-zero.
    """
    stats = copy.deepcopy(basic_data)
    stats['msg_count'] = 1
    message = str(row['message'])
    stats['word_count'].append(len(message.split()))
    stats['msg_len'].append(len(message))

    reply_time = row['reply_btw_sender_time']
    # Missing values are checked first: NaN is truthy, so it used to be appended
    # and turned the averages into NaN, and bool(pd.NA) raises.
    if not pd.isna(reply_time) and reply_time:
        stats['reply_time'].append(reply_time)
    return stats


def add_basic_stats(dialog_name: str, user_name: str, stats: dict) -> None:
    """
    Folds one message's statistics into the user's overall totals.

    Every entry of ``stats`` (message count, word count, message length,
    reply time) is added with ``+=`` to the entry of the same name in
    ``user_stats[dialog_name][user_name]['basic_data']``.
    """
    for metric in stats:
        totals = user_stats[dialog_name][user_name]['basic_data']
        totals[metric] += stats[metric]


def add_hour_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Folds one message's statistics into the bucket of the hour it was sent in.

    The hour is taken from characters 11-12 of ``row['date']``; every entry of
    ``stats`` is added with ``+=`` to the matching entry of that hour's bucket
    in ``user_stats[dialog_name][user_name]['hours_data']``.
    """
    hourly_buckets = user_stats[dialog_name][user_name]['hours_data']
    bucket = hourly_buckets[int(row['date'][11:13])]
    for metric in stats:
        bucket[metric] += stats[metric]


def add_week_stats(dialog_name: str, user_name: str, row: pd.DataFrame, stats: dict) -> None:
    """
    Folds one message's statistics into the buckets of its weekday.

    The weekday name comes from the first 10 characters of ``row['date']``
    (``YYYY-MM-DD``) and the hour from characters 11-12. Every entry of
    ``stats`` is added with ``+=`` both to that weekday's totals and to that
    weekday's bucket for the hour.
    """
    timestamp = row['date']
    day_name = dt.datetime.strptime(timestamp[:10], '%Y-%m-%d').strftime("%A")
    hour_of_day = int(row['date'][11:13])
    day_buckets = user_stats[dialog_name][user_name]['weeks_data'][day_name]
    for metric in stats:
        amount = stats[metric]
        day_buckets['basic_data'][metric] += amount
        day_buckets['hours_data'][hour_of_day][metric] += amount


def calculate_average(dialog_name: str) -> None:
    """
    Replaces the collected samples of one dialog by their averages, in place.

    The statistics tree ``user_stats[dialog_name]`` is walked recursively. For
    every key listed in ``AVERAGE_STATS`` whose value is still a list of
    samples, the list is replaced by its arithmetic mean, or by -1 when the
    list is empty. Values that are no longer lists (already averaged) are left
    untouched, so calling the function twice for the same dialog is harmless;
    previously the second call raised TypeError or turned a real 0.0 average
    into -1.
    """
    def crawl_for_average_values(node: dict) -> None:
        # Only values of existing keys are reassigned, so the dict size does
        # not change while it is being iterated.
        for key, value in node.items():
            if isinstance(value, dict):
                crawl_for_average_values(value)
            elif key in AVERAGE_STATS and isinstance(value, list):
                node[key] = sum(value) / len(value) if value else -1

    crawl_for_average_values(user_stats[dialog_name])

In [None]:
# Checking the path
# The output folders are created only when the raw dialogs folder is present.
if os.path.isdir(DIALOGS_FOLDER):
    for output_folder in (PREPARED_FOLDER, RESULT_FOLDER):
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)

In [None]:
# Preparing raw data: every CSV file of DIALOGS_FOLDER gets a prepared copy
# in PREPARED_FOLDER.
raw_dialog_pattern = DIALOGS_FOLDER + '/*.csv'
for dialog in glob.glob(raw_dialog_pattern):
    prepare_dialog(dialog_file=dialog, prep_folder=PREPARED_FOLDER)
    print('Done preparing file: ', dialog)

In [None]:
# Extracting basic information from all dialogs.
# The prepared files are listed here rather than taken from `dialogs`: that name
# holds a snapshot made by the initialisation cell (before the preparation cell
# has written anything on a fresh run) and is later reassigned to the list of
# dialog names by the dropdown cell.
for dialog_csv_path in glob.glob(os.path.join(PREPARED_FOLDER, '*.csv')):
    data = pd.read_csv(dialog_csv_path)
    # Dialog name = file name without folder and extension.
    dialog = os.path.splitext(os.path.basename(dialog_csv_path))[0]

    # Always start from empty statistics: after calculate_average() the sample
    # lists are plain numbers, so re-running this cell on top of old results
    # would fail when a list is added to a number.
    user_stats[dialog] = {}

    # Iterating over each message @ dialog
    for index, row in data.iterrows():
        user = str(row['from_id'])
        if user not in user_stats[dialog]:
            user_stats[dialog][user] = copy.deepcopy(user_data)

        # Gathering data
        stats = get_stats(row)
        add_basic_stats(dialog, user, stats)
        add_hour_stats(dialog, user, row, stats)
        add_week_stats(dialog, user, row, stats)

    # Calculating average values for every user of this dialog
    calculate_average(dialog)
    print("Done extracting data from dialog #", dialog, sep='')

In [None]:
# Dumping gathered data to dialog_members_statistics.json inside RESULT_FOLDER
stats_json_path = f'{RESULT_FOLDER}dialog_members_statistics.json'
with open(stats_json_path, mode='w', encoding='utf-8') as f:
    json.dump(user_stats, f, ensure_ascii=False)

In [None]:
# Dropdown for choosing the dialog whose data is plotted.
# The first dialog of `user_stats` is preselected.
dialogs = [*user_stats.keys()]
dialog_id = widgets.Dropdown(
    description='Dialog:',
    options=dialogs,
    value=dialogs[0],
    disabled=False,
)
display(dialog_id)

In [None]:
# Dropdown for choosing the user (of the dialog selected above) whose data is
# plotted. The first user of that dialog is preselected.
users = [*user_stats[dialog_id.value].keys()]
user_id = widgets.Dropdown(
    description='User:',
    options=users,
    value=users[0],
    disabled=False,
)
display(user_id)

In [None]:
# Hourly activity of the selected user: hour of the day on the x-axis, reply time
# on the y-axis; the size of each circle corresponds to the number of messages
# in that hour.

selected_hours = user_stats[dialog_id.value][user_id.value]['hours_data']
plotting_data = pd.DataFrame({
    'Hour': range(24),
    'ReplyTime': [bucket['reply_time'] for bucket in selected_hours.values()],
    'MessageNumber': [bucket['msg_count'] for bucket in selected_hours.values()],
}).sort_values(by='Hour')

(
    alt.Chart(plotting_data)
    .mark_circle(size=60)
    .encode(
        x='Hour',
        y='ReplyTime',
        size='MessageNumber:Q',
        tooltip=['Hour', 'ReplyTime', 'MessageNumber'],
    )
    .properties(width=800, height=300)
)

In [3]:
def get_user_gender(username, name_dicts):
    """
    Guesses a gender from the first word of a user's name.

    The language of the lower-cased first word is detected with
    detect_data_language. For 'ru' and 'ua' the word is looked up in
    ``name_dicts[language]``; for 'en' gender_guesser is asked.

    :param username: name as stored in the dialog metadata. ``None``, empty and
        whitespace-only names are accepted and yield ''.
    :param name_dicts: maps 'ru' / 'ua' to a list of DataFrames with a "name"
        column. Position 0 is treated as the female dictionary, every other
        position as male.
    :return: 'female' or 'male' for a dictionary hit, whatever gender_guesser
        reports for English names (its 'unknown' is mapped to ''), '' otherwise.
    """
    user_gender = ''

    name_parts = (username or '').strip().split()
    if not name_parts:
        # Nothing to analyse; indexing the first word would raise IndexError.
        return user_gender

    first_name = name_parts[0].lower()
    print('first_name', first_name)

    new_df = pd.DataFrame({'message': [first_name]})
    word_lang = detect_data_language(new_df, 'one_word')

    if word_lang in ('ru', 'ua'):
        for num_df, names_df in enumerate(name_dicts[word_lang]):
            exact_match = (names_df['name'].str.lower() == first_name).any()
            if exact_match or if_name_in_ukr_dict(first_name, names_df):
                # The position of the dictionary in the list encodes the gender.
                user_gender = 'female' if num_df == 0 else 'male'
                print(f"{first_name} found in dict - it is {user_gender}\n")
                break

    elif word_lang == 'en':
        gender_detector = gender.Detector()
        user_gender = gender_detector.get_gender(first_name.capitalize())
        # NOTE(review): gender_guesser can presumably also return values such as
        # 'mostly_female' or 'andy'; they are passed through unchanged - confirm
        # that callers expect that.
        if user_gender == 'unknown':
            user_gender = ''
        else:
            # Reported only for real hits; the message used to be printed with
            # an empty gender for names the detector does not know.
            print(f"{first_name} found in dict - it is {user_gender}\n")

    return user_gender


In [4]:
import pymorphy2
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Module-level pymorphy2 analyzer; get_gender_by_verb uses it to tag words of
# Ukrainian ('ua') dialogs.
# NOTE(review): it is created with default settings although it is applied to
# Ukrainian words - presumably `lang='uk'` (pymorphy2-dicts-uk) was intended; confirm.
morph = pymorphy2.MorphAnalyzer()


def get_gender_by_verb(user_id, dialog_id):
    """
    Guesses a user's gender from the endings of the verbs in their messages.

    The dialog language is detected first. For 'en' the function returns ''
    immediately. Otherwise the messages returned by get_user_step_msgs (with a
    step of 100) are scanned word by word; words of one or two characters are
    skipped:

    * 'ua': a word tagged VERB/GRND by ``morph`` that ends with "ла" marks
      female; one that ends with "в", "к" or "с" marks male.
    * 'ru': a word tagged VERB/AUX by natasha that ends with "ла" marks female;
      one that ends with "л", "к" or "с" marks male.

    The scan never stops early, so the last matching word decides.

    :param user_id: id of the user whose messages are analysed.
    :param dialog_id: id of the dialog the messages are taken from.
    :return: 'female', 'male' or '' when nothing matched.
    """
    lang = detect_data_language('', 'df_loc', GENERAL_DIALOGS_DF, dialog_id, user_id)

    user_gender = ''
    if lang == 'en':
        # The ending heuristic below only covers 'ua' and 'ru'.
        return user_gender

    # TODO: get_user_gender reports non-names as found with an empty gender, e.g.
    #       "findermusic" (515120928.csv) and "h_bot29.06" (1340446585.csv).

    # The natasha models are created lazily and only once per call. Building the
    # embedding and the tagger for every single word repeated the same work for
    # each word of each message.
    segmenter = None
    morph_tagger = None

    dialog_step_msgs = get_user_step_msgs(GENERAL_DIALOGS_DF, dialog_id, user_id, 100)
    for msg in dialog_step_msgs:
        msg = transform_raw_data(msg, lang, '', '', 'without_lemma')
        for word in msg.split():
            print(word)
            if len(word) <= 2:
                continue

            if lang == 'ua':
                if word[-2:] == 'ла' and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    user_gender = 'female'

                elif word[-1] in ('в', 'к', 'с') and morph.tag(word)[0].POS in ('VERB', 'GRND'):
                    print('ukr word', word)
                    user_gender = 'male'

            elif lang == 'ru':
                if morph_tagger is None:
                    segmenter = Segmenter()
                    morph_tagger = NewsMorphTagger(NewsEmbedding())

                doc = Doc(word)
                doc.segment(segmenter)
                if not doc.tokens:
                    # Nothing was segmented, so there is no tag to inspect.
                    continue
                doc.tag_morph(morph_tagger)
                pos = doc.tokens[0].pos

                if word[-2:] == 'ла' and pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    user_gender = 'female'

                if word[-1] in ('л', 'к', 'с') and pos in ('VERB', 'AUX'):
                    print('ru word', word)
                    user_gender = 'male'

    return user_gender


# !pip install -U pymorphy2-dicts-uk



# Name dictionaries (CSV files in the "dicts" folder), one per language and gender.
female_ukr_names, male_ukr_names, female_ru_names, male_ru_names = [
    pd.read_csv(os.path.join('dicts', csv_name))
    for csv_name in ('female_ukrainian_names.csv',
                     'male_ukrainian_names.csv',
                     'female_russian_names.csv',
                     'male_russian_names.csv')
]

# Language code -> [female names, male names]. get_user_gender derives the gender
# from the position in the list, so the order matters. Built once because it
# does not depend on the file being processed.
name_dicts = {
    "ru": [female_ru_names, male_ru_names],
    "ua": [female_ukr_names, male_ukr_names]
}
# The metadata files are read from the folder named META_FOLDER + '2'.
meta_folder = META_FOLDER + '2'

# NOTE(review): the detected genders are only printed, never stored - confirm
# whether they should be collected.
for filename in sorted(os.listdir(PREPARED_FOLDER)):
    print("\n\n\nfilename", filename)
    # The second character is inspected (presumably because ids may start with
    # "-"); names shorter than two characters are skipped instead of raising.
    if len(filename) < 2 or not filename[1].isdigit():
        continue

    dialog_data = pd.read_csv(os.path.join(PREPARED_FOLDER, filename))
    # Files without rows have no first "to_id" value to inspect.
    if dialog_data.empty or 'channel' in str(dialog_data['to_id'].iloc[0]):
        continue

    # Named `chat_id` so that the dropdown widget `dialog_id` defined earlier in
    # the notebook is not overwritten by a string.
    chat_id = filename[:-4]
    try:
        with open(os.path.join(meta_folder, chat_id + '.json'), 'r', encoding='utf-8') as f:
            meta_dialog_data = json.load(f)
    except FileNotFoundError:
        # Report the folder that was really searched.
        print(f'{chat_id} not found in {meta_folder}')
        continue

    for user in meta_dialog_data['users']:
        # Reset for every user: otherwise a user without a first name inherits
        # the gender detected for the previous user.
        user_gender = ''

        if (user['first_name'] or '').strip():
            user_gender = get_user_gender(user['first_name'], name_dicts)

        if user_gender == '' and (user['last_name'] or '').strip():
            user_gender = get_user_gender(user['last_name'], name_dicts)

        # Fall back to the verb-ending heuristic when the names gave no answer.
        if user_gender == '':
            user_gender = get_gender_by_verb(user['user_id'], int(chat_id))
            print(f'Final gender is {user_gender}')





filename -1001006273516.csv



filename -1001019025346.csv



filename -1001039626561.csv



filename -1001042449782.csv



filename -1001044917656.csv



filename -1001057263640.csv



filename -1001084476048.csv



filename -1001095341243.csv



filename -1001112677952.csv



filename -1001113226920.csv



filename -1001117020066.csv



filename -1001117906346.csv



filename -1001119756543.csv



filename -1001119802548.csv



filename -1001120222237.csv



filename -1001126387620.csv



filename -1001132693664.csv



filename -1001135553470.csv



filename -1001138587070.csv



filename -1001140957884.csv



filename -1001143384902.csv



filename -1001143471470.csv



filename -1001150353283.csv



filename -1001163276781.csv



filename -1001163904855.csv



filename -1001168495971.csv



filename -1001173653750.csv



filename -1001178543893.csv



filename -1001181002461.csv



filename -1001199589364.csv



filename -1001207975494.csv



filename -1001213968975.csv



filen

  if n_row % msgs_step == 0:


In [8]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

from IPython.display import display
lemmatizer = nltk.WordNetLemmatizer()

# Sample text: word tokenizing and part-of-speech tagging
document = (
    'Напевно можна і не зізвонюватися так як все одно зараз проект не вибиримо точно. Бо ми не знаємо'
    ' головного параметру по складності проектів які нам подобаються. Поки просто виділили найцікавіші'
    ' нам та кожен вибирає один або пару той що хотів би робити'
)
# document = 'The little brown dog barked at the black cat'
# The whole document is tokenized as one unit, hence a list with a single token list.
tokens = [word_tokenize(document)]
print("tokens", tokens)
postag = pos_tag(tokens[0])

# Chunk rules: NBAR covers noun groups and verb groups, NP wraps NBAR chunks
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        {<RB.?>*<VB.?>*<JJ>*<VB.?>+<VB>?} # Verbs and Verb Phrases

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...

"""
# Chunking
cp = nltk.RegexpParser(grammar)

# Parsing the tagged tokens yields a tree
tree = cp.parse(postag)

def leaves(tree):
    """Yields the leaves of every subtree of ``tree`` that is labelled 'NP'."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    yield from (node.leaves() for node in tree.subtrees(filter=is_noun_phrase))

def get_word_postag(word):
    """
    Maps the part-of-speech tag of a single word to a WordNet category.

    The word is tagged on its own with ``pos_tag``. Tags starting with 'J' map
    to ``wordnet.ADJ``, tags starting with 'V' to ``wordnet.VERB`` and every
    other tag (nouns included) to ``wordnet.NOUN``.
    """
    # Tag once: the tagger used to be called again for every branch that was tested.
    tag = pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    return wordnet.NOUN

def normalise(word):
    """Lower-cases a word and lemmatizes it using the part of speech from get_word_postag."""
    lowered = word.lower()
    return lemmatizer.lemmatize(lowered, get_word_postag(lowered))

def get_terms(tree):
    """Yields, for every chunk returned by ``leaves(tree)``, the list of its normalised words."""
    yield from ([normalise(token) for token, _tag in chunk] for chunk in leaves(tree))

terms = get_terms(tree)

# One feature per chunk: its normalised words separated by single spaces.
features = [' '.join(term).strip() for term in terms]

features


tokens [['Напевно', 'можна', 'і', 'не', 'зізвонюватися', 'так', 'як', 'все', 'одно', 'зараз', 'проект', 'не', 'вибиримо', 'точно', '.', 'Бо', 'ми', 'не', 'знаємо', 'головного', 'параметру', 'по', 'складності', 'проектів', 'які', 'нам', 'подобаються', '.', 'Поки', 'просто', 'виділили', 'найцікавіші', 'нам', 'та', 'кожен', 'вибирає', 'один', 'або', 'пару', 'той', 'що', 'хотів', 'би', 'робити']]


['напевно можна і не зізвонюватися так як все одно зараз проект не вибиримо точно',
 'бо',
 'ми не знаємо головного параметру по складності проектів які нам подобаються',
 'поки',
 'просто виділили найцікавіші нам та кожен вибирає один або пару той що хотів би робити']

In [8]:
import pymorphy2
# Experiment: print the part of speech and the gender that pymorphy2 assigns to
# a few past-tense verbs.
# NOTE(review): the analyzer is created with default settings although the words
# below are Ukrainian - presumably `lang='uk'` was intended; confirm.
morph = pymorphy2.MorphAnalyzer()

ukr_words = [
    'зробив', 'робив', 'бачив',
    'сказав', 'розклав',
    'був', 'чув',
    'приніс', 'доніс',
    "з'їв", 'всунув', "розм'як",
]

# Russian counterparts; defined for reference, the loop below does not use them.
ru_words = [
    'сделал', 'делал',
    'видел',
    'сказал', 'расклал',
    'был', 'слышал',
    'принес', 'донес',
    'съел', 'всунул', 'размяк',
]

for word in ukr_words:
    print('\n' + word)
    first_tag = morph.tag(word)[0]
    print(first_tag.POS)
    print(first_tag.gender)



зробив
GRND
None

робив
GRND
None

бачив
GRND
None

сказав
GRND
None

розклав
NOUN
femn

був
None
None

чув
None
None

приніс
NOUN
masc

доніс
None
None

з'їв
GRND
None

всунув
GRND
None

розм'як
NOUN
masc
