In [34]:
%matplotlib inline
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
import plotly.graph_objects as go

from pprint import pprint
from utils.date import if_in_date_range
from plotly.subplots import make_subplots
from matplotlib.pyplot import figure
from utils.date import get_day_and_hour, get_week_day_from_number

from config import PATH_TO_SAVE_PROCESSED_FILES, PATH_TO_SAVE_GENERAL_DF, PATH_TO_PREPARED_DIALOGS, \
    USER_PATH_TO_SAVE_GENERAL_DF

### 1) Change global variables to yours
#### you can get your USER_ID from downloaded csv files
#### Be sure that USER_ID exists in your general dialogs dataframe !!!

In [35]:
USER_ID = 511986933

# time difference UTC (0 time zone) and your time zone
TIME_DIFFERENCE = 3
RESULT_NAME_STATISTICS = "dialogs_data2"

In [36]:
with open(os.path.join(PATH_TO_SAVE_PROCESSED_FILES, "all_dialogs_info.json"), "r", encoding="utf-8") as f:
    dialogs_info = json.load(f)

general_dialogs_df = pd.read_csv(PATH_TO_SAVE_GENERAL_DF)

## Plot of daily activity

In [37]:
def get_hours_dict():
    plot_data = {}
    for hour in range(24):
        if hour < 10:
            hour = '0' + str(hour)
            plot_data[hour] = 0
        else:
            plot_data[str(hour)] = 0
    return plot_data


def get_local_time(msg_time):
    our_local_time = int(msg_time) + TIME_DIFFERENCE
    if our_local_time > 23:
        our_local_time %= 24

    if our_local_time < 10:
        our_local_time = "0" + str(our_local_time)

    else:
        our_local_time = str(our_local_time)

    return our_local_time


def get_user_daily_activity(dialogs_info, dialog_type, user_id_get_msg, start_date, end_date,
                            func_type = ''):
    plot_data = get_hours_dict()

    # go through all dialogs on special dialog type in dialogs_info dict
    # friend - name of dialog
    for dialog_id, friend in dialogs_info[dialog_type].items():

        dialog_data = general_dialogs_df.loc[general_dialogs_df['dialog ID'] == int(dialog_id)]
        if dialog_data.empty:
            continue

        for index, row in dialog_data.iterrows():
            if user_id_get_msg != "all":
                if func_type == 'get_user_received_messages':
                    if int(row.from_id) == user_id_get_msg:
                        continue

                elif func_type == 'get_user_sent_messages':
                    if int(row.to_id) == user_id_get_msg:
                        continue

            dialog_datetime = str(row.date)[:-6]
            flag_in_range = if_in_date_range(dialog_datetime, start_date, end_date)
            if flag_in_range == 'Dialog after end_date':
                continue

            elif not flag_in_range:
                break

            elif flag_in_range:
                our_local_time = get_local_time(str(row.date)[11:13])
                plot_data[our_local_time] += 1

    lists = sorted(plot_data.items())
    x, y = zip(*lists)

    return x, y, lists


In [38]:
fig_week_sent_msgs = make_subplots(rows=7, cols=1,
               subplot_titles=("Monday", "Tuesday", "Wednesday", "Thursday",
                               "Friday", "Saturday", "Sunday"),
               shared_yaxes=True,
               shared_xaxes=True,
               x_title="Hours",
               y_title="Number of messages",
               horizontal_spacing=0.005,
                )

fig_week_received_msgs = make_subplots(rows=7, cols=1,
               subplot_titles=("Monday", "Tuesday", "Wednesday", "Thursday",
                               "Friday", "Saturday", "Sunday"),
               shared_yaxes=True,
               shared_xaxes=True,
               x_title="Hours",
               y_title="Number of messages",
               horizontal_spacing=0.005,
                )


fig_week_sent_msgs.update_layout(title_text=f"Weekly sent messages of {USER_ID}", height=1050, showlegend=False)
fig_week_received_msgs.update_layout(title_text=f"Weekly received messages of {USER_ID}", height=1050, showlegend=False)

### Change START_WEEK_YEAR, START_WEEK_MONTH and START_WEEK_DAY variables to yours to get activity per day during this week range
#### Be sure that this week date is in your general dialogs dataframe from 0_data_preprocessing.py
#### example 11 means 10-11 time range of chatting

In [39]:
START_WEEK_YEAR = 2020
START_WEEK_MONTH = 8
START_WEEK_DAY = 28

if START_WEEK_MONTH in [1, 3, 5, 7, 8, 10, 12]:
    days_in_month = 31

elif START_WEEK_MONTH == 2:
    days_in_month = 28

else:
    days_in_month = 30

for i in range(7):
    week_day = START_WEEK_DAY + i
    next_day = START_WEEK_DAY + i + 1
    day_month = START_WEEK_MONTH
    next_day_month = START_WEEK_MONTH

    if week_day >= days_in_month + 1:
        week_day = week_day % (days_in_month + 1) + 1
        day_month = START_WEEK_MONTH + 1

    if next_day >= days_in_month + 1:
        next_day = next_day % (days_in_month + 1) + 1
        next_day_month = START_WEEK_MONTH + 1

    start_date = datetime.datetime(START_WEEK_YEAR, day_month,
                                    week_day, 0, 0, 0)

    end_date = datetime.datetime(START_WEEK_YEAR, next_day_month,
                                    next_day, 0, 0, 0)

    hours_data, n_msgs, lists = get_user_daily_activity(dialogs_info, "Private dialog", USER_ID, start_date, end_date,
                                                        'get_user_sent_messages')

    day_of_week = i + 1

    fig_week_sent_msgs.add_scatter(x=hours_data,
                        y=n_msgs,
                        name=get_week_day_from_number(day_of_week),
                        row=day_of_week, col=1, line=dict(color="#228B22")
                       )

    hours_data, n_msgs, lists = get_user_daily_activity(dialogs_info, "Private dialog", USER_ID, start_date, end_date,
                                                        'get_user_received_messages')

    fig_week_received_msgs.add_scatter(x=hours_data,
                        y=n_msgs,
                        name=get_week_day_from_number(day_of_week),
                        row=day_of_week, col=1, line=dict(color="#ff0000")
                       )


fig_week_sent_msgs.show()
fig_week_received_msgs.show()

## Collect general statistics for user during start_date - end_date

In [40]:
def get_statistics_msgs(df, user_id_get_msg, dialog_type, statistics_type,
                        start_date, end_date):
    """

    :param df: your dataframe
    :param user_id_get_msg: int, user id
    :param dialog_type: "Channel" or "Private dialog" or "Group"
    :param statistics_type: "received" or "sent", to analyse received or sent messages
    :param start_date: datetime type, from what time start to analyse msgs
    :param end_date: datetime type, to what time to analyse msgs
    :return: added dataframe with statistics in range (start_date, end_date) and
    average_n_msgs, average_n_words
    """
    dialogs_places_dict = {}

    # go through all dialogs on special dialog type in dialogs_info dict
    # friend - name of dialog
    msgs_lst, words_lst = [], []
    for dialog_id, friend in dialogs_info[dialog_type].items():
        dialog_data = general_dialogs_df.loc[general_dialogs_df['dialog ID'] == int(dialog_id)]
        if dialog_data.empty:
            msgs_lst.append(0)
            words_lst.append(0)
            dialogs_places_dict[friend] = 0
            continue

        # if it is Group, so to get number of received messages to you in it
        # it is sent to_id like PeerChannel(channel_id=1387547322), it is not your id
        # so we change it
        if dialog_type == "Group" and statistics_type == "received":
            user_id_get_msg = dialog_data["to_id"][dialog_data.index[0]]

        n_msgs_in_date_range, n_words_in_date_range = 0, 0
        dialogs_places_dict[friend] = 0

        for index, row in dialog_data.iterrows():
            dialog_datetime = row.date[:-6]
            flag_in_range = if_in_date_range(dialog_datetime, start_date, end_date)
            if flag_in_range == 'Dialog after end_date':
                continue

            if not flag_in_range:
                break

            if statistics_type == "received":
                if row.from_id == user_id_get_msg:
                    continue
                    
            elif statistics_type == "sent":
                if row.from_id != user_id_get_msg:
                    continue

            if not pd.isnull(row.message):
                n_words_in_date_range += len(row.message.split())
            dialogs_places_dict[friend] = dialogs_places_dict.get(friend, 0) + 1

        words_lst.append(n_words_in_date_range)

    dialogs_places_dict = {k: v for k, v in sorted(dialogs_places_dict.items(), key=lambda item: item[1], reverse=True)}
    msgs_lst = []
    for i in df.index:
        msgs_lst.append(dialogs_places_dict[df["dialog_name"][i]])

    df["n_{}_msgs_in_date_range".format(statistics_type)] = msgs_lst
    df["n_{}_words_in_date_range".format(statistics_type)] = words_lst

    df["place_dialog_by_n_{}_msgs".format(statistics_type)] = [0 for _ in range(len(df.index))]

    print("Top 10 {} by {} messages during {} - {}".format(dialog_type, statistics_type, start_date, end_date))
    for place, friend in enumerate(dialogs_places_dict.items()):
        if place < 10:
            print("{} place is {} with {} messages".format(place + 1, friend[0], friend[1]))
        n_row = df.loc[df['dialog_name'] == friend[0]]["id"]
        df["place_dialog_by_n_{}_msgs".format(statistics_type)][n_row] = place + 1

    print("\n\n")
    df["n_{}_msgs_in_date_range".format(statistics_type)] = df["n_{}_msgs_in_date_range".format(statistics_type)].fillna(0)
    df["n_{}_words_in_date_range".format(statistics_type)] = df["n_{}_words_in_date_range".format(statistics_type)].fillna(0)

    average_n_msgs = int(df["n_{}_msgs_in_date_range".format(statistics_type)].mean(skipna = True))
    average_n_words = int(df["n_{}_words_in_date_range".format(statistics_type)].mean(skipna = True))

    return df, average_n_msgs, average_n_words


### 2) Change start_date and end_date to yours

In [41]:
# from what time start to analyse msgs - to what time to analyse msgs
# set value to dates which are more on 1 day that days you want to be include
# ex. if you set:
# start_date = datetime.datetime(2020, 9, 1, 0, 0, 0)
# end_date = datetime.datetime(2020, 9, 11, 0, 0, 0)
# so real date range to analyse is 2.09.2020 - 10.09.2020
start_date = datetime.datetime(2020, 9, 1, 0, 0, 0)
end_date = datetime.datetime(2020, 9, 11, 0, 0, 0)

# get general statistics
df = pd.DataFrame(dialogs_info["Private dialog"].items(), columns=["dialog_id", "dialog_name"])


## Top people with whom communicate during start_date - end_date

In [42]:
if len(df.index) != 0:
    df["id"] = [i for i in range(df.index[-1] + 1)]

    df, average_n_received_msgs, average_n_received_words = get_statistics_msgs(df, USER_ID, "Private dialog", "received",
                                                                                start_date, end_date)
    df, average_n_sent_msgs, average_n_sent_words = get_statistics_msgs(df, USER_ID, "Private dialog", "sent",
                                                                        start_date, end_date)

    new_row = {
        "dialog_name": "Total average statistics msgs",
        "dialog_id": -1,
        "n_received_msgs_in_date_range": average_n_received_msgs,
        "n_received_words_in_date_range": average_n_received_words,
        "n_sent_msgs_in_date_range": average_n_sent_msgs,
        "n_sent_words_in_date_range": average_n_sent_words,
        "place_dialog_by_n_received_msgs": -1,
        "place_dialog_by_n_sent_msgs": -1
    }

    df = df.append(new_row, ignore_index=True)

    df.to_csv(os.path.join(PATH_TO_SAVE_PROCESSED_FILES, f"private_{RESULT_NAME_STATISTICS}.csv"))
    df

Top 10 Private dialog by received messages during 2020-09-01 00:00:00 - 2020-09-11 00:00:00
1 place is Andrew Kurochkin with 52 messages
2 place is Марк with 49 messages
3 place is Назар Поночевний with 44 messages
4 place is Саша Дерен ЕПЕ with 42 messages
5 place is Бек Андрій Уку with 9 messages
6 place is Real Python RSS with 6 messages
7 place is Костя Лєпєшов with 3 messages
8 place is Обходные стратегии with 1 messages
9 place is Denys Herasymuk with 0 messages
10 place is Микола Карате with 0 messages



Top 10 Private dialog by sent messages during 2020-09-01 00:00:00 - 2020-09-11 00:00:00
1 place is Саша Дерен ЕПЕ with 57 messages
2 place is Andrew Kurochkin with 45 messages
3 place is Марк with 45 messages
4 place is Denys Herasymuk with 30 messages
5 place is Назар Поночевний with 27 messages
6 place is Бек Андрій Уку with 23 messages
7 place is Микола Карате with 3 messages
8 place is Костя Лєпєшов with 2 messages
9 place is Real Python RSS with 0 messages
10 place is Обхо



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Top groups with whom communicate during start_date - end_date

In [43]:
# get general statistics
df_groups = pd.DataFrame(dialogs_info["Group"].items(), columns=["dialog_id", "dialog_name"])
if len(df_groups.index) != 0:
    df_groups["id"] = [i for i in range(df_groups.index[-1] + 1)]

    df_groups, average_n_received_msgs, average_n_received_words = get_statistics_msgs(df_groups, "all", "Group", "received",
                                                                                start_date, end_date)
    df_groups, average_n_sent_msgs, average_n_sent_words = get_statistics_msgs(df_groups, USER_ID, "Group", "sent",
                                                                        start_date, end_date)

    new_row = {
        "dialog_name": "Total average statistics msgs",
        "dialog_id": -1,
        "n_received_msgs_in_date_range": average_n_received_msgs,
        "n_received_words_in_date_range": average_n_received_words,
        "n_sent_msgs_in_date_range": average_n_sent_msgs,
        "n_sent_words_in_date_range": average_n_sent_words,
        "place_dialog_by_n_received_msgs": -1,
        "place_dialog_by_n_sent_msgs": -1
    }

    df = df.append(new_row, ignore_index=True)

    df.to_csv(os.path.join(PATH_TO_SAVE_PROCESSED_FILES, f"groups_{RESULT_NAME_STATISTICS}.csv"), index=False)
    df

Top 10 Group by received messages during 2020-09-01 00:00:00 - 2020-09-11 00:00:00
1 place is BookCrossing - Обмен книгами with 100 messages
2 place is React Kyiv with 100 messages
3 place is Arch Linux RU with 100 messages
4 place is УКУ Рандом | REBOOTED with 100 messages
5 place is APPS@UCU (всі) ❤️ with 100 messages
6 place is CS & BA 2019 💚💜 with 100 messages
7 place is sns_internships with 100 messages
8 place is Linux@ucu with 100 messages
9 place is d2v with 100 messages
10 place is Крильце Надії with 56 messages



Top 10 Group by sent messages during 2020-09-01 00:00:00 - 2020-09-11 00:00:00
1 place is d2v with 42 messages
2 place is Group with 22 messages
3 place is Курси та навчання with 13 messages
4 place is BookCrossing - Обмен книгами with 0 messages
5 place is React Kyiv with 0 messages
6 place is Arch Linux RU with 0 messages
7 place is УКУ Рандом | REBOOTED with 0 messages
8 place is APPS@UCU (всі) ❤️ with 0 messages
9 place is CS & BA 2019 💚💜 with 0 messages
10 plac



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Time spending on conversations during start_date - end_date

#### 3) Change to yours next global variables

In [44]:
# if time between messages is more TIME_BETWEEN_USER_REPLIES sec,
# so the dialog is stopped and
# we should not add this time to general time
TIME_BETWEEN_USER_REPLIES = 300

# WORDS_SPEED_READING per a minute
# Reference: https://irisreading.com/what-is-the-average-reading-speed/
# Due to reading chat messages(a lot of little messages) this speed decrease to 2.5 words per a second
WORDS_SPEED_READING = 150

# WORDS_SPEED_READING per a minute
CHAR_SPEED_WRITING = 160

# time for thinking about what you will be write
TIME_FOR_THINKING = 30


In [45]:
def get_user_active_minutes_per_day(dialogs_info, start_date, end_date, dialog_type, user_id_get_msg):
    """

    :param dialogs_info: dict: first level keys - "Channel", "Private dialog", "Group"
    second level items - "<name_of_dialog>": <id>
    :param start_date: datetime type, from what time start to analyse msgs
    :param end_date: datetime type, to what time to analyse msgs
    :param dialog_type: "Channel" or "Private dialog" or "Group"
    :param user_id_get_msg: int, user id
    :return: lists for a plot of "top_friends" or "daily_activity", depends on
    your input plot_type
    """
    dialog_active_seconds, dialog_active_minutes, previous_message_time = 0, 0, 0

    # go through all dialogs on special dialog type in dialogs_info dict
    # friend - name of dialog
    print(f"Active minutes per day({start_date}) in {dialog_type}")

    for dialog_id, friend in dialogs_info[dialog_type].items():
        start_dialog_time, stop_dialog_time = 0, 0

        dialog_data = general_dialogs_df.loc[general_dialogs_df['dialog ID'] == int(dialog_id)]
        if dialog_data.empty:
            continue

        dialog_active_seconds_before = dialog_active_seconds
        flag_first_my_message = 0

        for index, row in dialog_data.iterrows():
            if pd.isnull(row.message):
                continue

            dialog_datetime = row.date[:-6]
            flag_in_range = if_in_date_range(dialog_datetime, start_date, end_date)
            if flag_in_range == 'Dialog after END_DATE':
                continue

            elif not flag_in_range:
                break

            elif flag_in_range:
                if user_id_get_msg != "all":
                    if row.from_id != user_id_get_msg:
                        dialog_active_seconds += len(row.message.split()) / WORDS_SPEED_READING * 60
                        continue
                
                dialog_datetime = datetime.datetime.strptime(row.date[:-6],
                                                                         "%Y-%m-%d %H:%M:%S")
                if flag_first_my_message == 0:
                    previous_message_time = dialog_datetime
                    flag_first_my_message = 1

                time_before_my_messages = previous_message_time - dialog_datetime

                # if message in user_id_get_msg queue of messages
                if index != dialog_data.index[-1] and dialog_data["from_id"][index + 1] == user_id_get_msg and \
                        time_before_my_messages.total_seconds() < 600:
                    previous_message_time = dialog_datetime
                    if stop_dialog_time == 0:
                        stop_dialog_time = dialog_datetime

                else:
                    TIME_FOR_FIRST_YOUR_MESSAGE = TIME_FOR_THINKING + \
                                                  len(row.message) / CHAR_SPEED_WRITING * 60
                    if stop_dialog_time == 0:
                        # if in user_id_get_msg queue of messages is only 1 message so add
                        # time for:
                        # reading received messages + thinking + writing the message
                        dialog_active_seconds += TIME_FOR_FIRST_YOUR_MESSAGE

                    else:
                        start_dialog_time = dialog_datetime

                        # get time between your query of messages
                        time_since = stop_dialog_time - start_dialog_time

                        # if time between messages is more TIME_BETWEEN_USER_REPLIES sec,
                        # so the dialog is stopped and
                        # we should not add this time to general time
                        if time_since.total_seconds() >= TIME_BETWEEN_USER_REPLIES:
                            time_since = stop_dialog_time - previous_message_time
                            dialog_active_seconds += time_since.total_seconds() + TIME_FOR_FIRST_YOUR_MESSAGE

                        else:
                            # add TIME_FOR_FIRST_YOUR_MESSAGE sec - time for reading + writing first message
                            dialog_active_seconds += time_since.total_seconds() + TIME_FOR_FIRST_YOUR_MESSAGE

                    previous_message_time = dialog_datetime
                    start_dialog_time, stop_dialog_time = 0, 0

        if (dialog_active_seconds - dialog_active_seconds_before) // 60 > 0:
            print("Time spend on conversation: dialog {} {} minutes".format(friend,
                                                                            round((dialog_active_seconds -
                                                                                   dialog_active_seconds_before) / 60, 2)))
    print("\n")
    return round(dialog_active_seconds / 60, 2)


### 4) Change START_WEEK_YEAR, START_WEEK_MONTH and START_WEEK_DAY to yours
#### Be sure to input date of messages which exists in your data, NOT date in the future (add 365 to your START_WEEK_DAY
#### to understand if it exists in your data) !!!

In [46]:
START_WEEK_YEAR = 2020
START_WEEK_MONTH = 9
START_WEEK_DAY = 1

plot_data = {}


week_month_year_data = []

for j in [7, 30, 365]:
    week_day = START_WEEK_DAY
    next_day = START_WEEK_DAY + 1
    day_month = START_WEEK_MONTH
    next_day_month = START_WEEK_MONTH
    day_year = START_WEEK_YEAR
    next_day_year = START_WEEK_YEAR

    for i in range(j):
        if day_month in [1, 3, 5, 7, 8, 10, 12]:
            days_in_month = 31

        elif day_month == 2:
            days_in_month = 28

        else:
            days_in_month = 30

        if i != 0:
            week_day += 1
            next_day += 1

        if week_day >= days_in_month + 1:
            week_day = week_day % (days_in_month + 1) + 1
            day_month += 1

        if next_day >= days_in_month + 1:
            next_day = next_day % (days_in_month + 1) + 1
            next_day_month += 1

        if day_month >= 13:
            day_month = day_month % 13 + 1
            day_year += 1

        if next_day_month >= 13:
            next_day_month = next_day_month % 13 + 1
            next_day_year += 1

        start_date = datetime.datetime(day_year, day_month,
                                        week_day, 0, 0, 0)

        end_date = datetime.datetime(next_day_year, next_day_month,
                                        next_day, 0, 0, 0)

        general_dialog_time = get_user_active_minutes_per_day(dialogs_info, start_date, end_date, "Private dialog", USER_ID) \
                              + get_user_active_minutes_per_day(dialogs_info, start_date, end_date, "Group", USER_ID)
        plot_data[datetime.date(day_year, day_month, week_day)] = general_dialog_time

    lists = sorted(plot_data.items())
    dates, n_minutes_lst = zip(*lists)
    week_month_year_data.append([dates, n_minutes_lst])

Active minutes per day(2020-08-01 00:00:00) in Private dialog


Active minutes per day(2020-08-01 00:00:00) in Group


Active minutes per day(2020-08-02 00:00:00) in Private dialog


Active minutes per day(2020-08-02 00:00:00) in Group


Active minutes per day(2020-08-03 00:00:00) in Private dialog


Active minutes per day(2020-08-03 00:00:00) in Group


Active minutes per day(2020-08-04 00:00:00) in Private dialog
Time spend on conversation: dialog Бек Андрій Уку 5.44 minutes


Active minutes per day(2020-08-04 00:00:00) in Group


Active minutes per day(2020-08-05 00:00:00) in Private dialog
Time spend on conversation: dialog Denys Herasymuk 6.35 minutes


Active minutes per day(2020-08-05 00:00:00) in Group


Active minutes per day(2020-08-06 00:00:00) in Private dialog
Time spend on conversation: dialog Denys Herasymuk 6.6 minutes


Active minutes per day(2020-08-06 00:00:00) in Group


Active minutes per day(2020-08-07 00:00:00) in Private dialog


Active minutes per day(2020-08-0

In [47]:
import plotly.graph_objects as px

plot_titles = ['Weekly active minutes in Telegram',
               'Monthly active minutes in Telegram',
               'Annually active minutes in Telegram']

for n_data, period_data in enumerate(week_month_year_data):
    plot = px.Figure(data=[go.Bar(
        name = 'Data 1',
        x = period_data[0],
        y = period_data[1]
       )])

    plot.update_layout(
        title_text=plot_titles[n_data],
        autosize=False,
        width=800,
        height=800,
        yaxis = dict(
                title='Number of minutes',
                titlefont_size=16,
                tickfont_size=14),
        xaxis = dict(
                title='Date',
                titlefont_size=16,
                tickfont_size=14)
    )

    plot.show()


In [48]:
def convert_to_datetime_column(dialog_data):
    dialog_data['date'] = pd.to_datetime(dialog_data['date']).dt.tz_localize(None)


def get_read_write_time_on_msg(dialog_data, user_id_get_msg, start_date, end_date,
                               column_name_write, column_name_read):
    filtered_dialog_data = dialog_data[(start_date <= dialog_data['date']) & (dialog_data['date'] <= end_date)]

    flag_first_my_message, previous_message_time = 0, ''
    start_dialog_time, stop_dialog_time = 0, 0
    dialog_active_seconds = 0

    for index, row in filtered_dialog_data.iterrows():
        if pd.isnull(row.message):
            continue

        if user_id_get_msg != "all":
            if row.from_id != user_id_get_msg:
                general_dialogs_df.at[index, column_name_read] = len(row.message.split()) / WORDS_SPEED_READING * 60
                general_dialogs_df.at[index, column_name_write] =  0
                dialog_active_seconds += len(row.message.split()) / WORDS_SPEED_READING * 60
                continue

        if flag_first_my_message == 0:
            previous_message_time = row.date
            flag_first_my_message = 1

        time_before_my_messages = previous_message_time - row.date

        TIME_FOR_READING_MESSAGE = len(row.message) / CHAR_SPEED_WRITING * 60
        general_dialogs_df.at[index, column_name_read] = 0
        general_dialogs_df.at[index, column_name_write] = TIME_FOR_READING_MESSAGE

        # if message in user_id_get_msg queue of messages
        if index < dialog_data.index[-1] and dialog_data["from_id"][index + 1] == user_id_get_msg and \
                time_before_my_messages.total_seconds() < TIME_BETWEEN_USER_REPLIES:
            previous_message_time = row.date
            if stop_dialog_time == 0:
                stop_dialog_time = row.date

        else:
            TIME_FOR_FIRST_YOUR_MESSAGE = TIME_FOR_THINKING + \
                                          len(row.message) / CHAR_SPEED_WRITING * 60
            if stop_dialog_time == 0:
                # if in user_id_get_msg queue of messages is only 1 message so add
                # time for:
                # reading received messages + thinking + writing the message
                dialog_active_seconds += TIME_FOR_FIRST_YOUR_MESSAGE

            else:
                start_dialog_time = row.date

                # get time between your query of messages
                time_since = stop_dialog_time - start_dialog_time

                # if time between messages is more TIME_BETWEEN_USER_REPLIES sec,
                # so the dialog is stopped and
                # we should not add this time to general time
                if time_since.total_seconds() >= TIME_BETWEEN_USER_REPLIES:
                    time_since = stop_dialog_time - previous_message_time
                    dialog_active_seconds += time_since.total_seconds() + TIME_FOR_FIRST_YOUR_MESSAGE

                else:
                    # add TIME_FOR_FIRST_YOUR_MESSAGE sec - time for reading + writing first message
                    dialog_active_seconds += time_since.total_seconds() + TIME_FOR_FIRST_YOUR_MESSAGE


            previous_message_time = row.date
            start_dialog_time, stop_dialog_time = 0, 0

    return round(dialog_active_seconds / 60, 2)

In [49]:
members_statistic_df = pd.read_csv(USER_PATH_TO_SAVE_GENERAL_DF)

### 5) Change global variables to yours

In [50]:
user_id_get_msg = USER_ID

# from what time start to analyse msgs - to what time to analyse msgs
# set value to dates which are more on 1 day that days you want to be include
# ex. if you set:
# start_date = datetime.datetime(2020, 9, 1, 0, 0, 0)
# end_date = datetime.datetime(2020, 9, 11, 0, 0, 0)
# so real date range to analyse is 2.09.2020 - 10.09.2020
start_date = datetime.datetime(2020, 8, 1, 0, 0, 0)
end_date = datetime.datetime(2020, 9, 11, 0, 0, 0)

In [51]:
members_statistic_df["my_dialog_active_minutes_in_date_range"] = -1
members_statistic_df["friend_dialog_active_minutes_in_date_range"] = -1
general_dialogs_df["my_time_seconds_read"], general_dialogs_df["my_time_seconds_write"] = -1, -1
general_dialogs_df["friend_time_seconds_read"], general_dialogs_df["friend_time_seconds_write"] = -1, -1

for index, row in members_statistic_df.iterrows():
    dialog_df = general_dialogs_df.loc[general_dialogs_df['dialog ID'] == row.user_id]
    if dialog_df.empty:
        continue

    convert_to_datetime_column(dialog_df)

    my_dialog_active_minutes_in_date_range = get_read_write_time_on_msg(dialog_df, user_id_get_msg,
                                                                                   start_date, end_date,
                                                                        'my_time_seconds_write',
                                                                        'my_time_seconds_read'
                                                                        )
    if dialog_df["from_id"][dialog_df.index[0]] == user_id_get_msg:
        friend_id_get_msg = dialog_df["to_id"][dialog_df.index[0]]
    else:
        friend_id_get_msg = dialog_df["from_id"][dialog_df.index[0]]

    friend_dialog_active_minutes_in_date_range = get_read_write_time_on_msg(dialog_df, friend_id_get_msg,
                                                                                   start_date, end_date,
                                                                            'my_time_seconds_write',
                                                                            'my_time_seconds_read'
                                                                            )
    print(f'My time spent on conversation with {row.first_name} -- {my_dialog_active_minutes_in_date_range} min')
    print(f'Friend\'s time spent on conversation with {row.first_name} -- {friend_dialog_active_minutes_in_date_range} min')
    print('\n')
    members_statistic_df.at[index, "my_dialog_active_minutes_in_date_range"] = my_dialog_active_minutes_in_date_range
    members_statistic_df.at[index, "friend_dialog_active_minutes_in_date_range"] = friend_dialog_active_minutes_in_date_range



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



My time spent on conversation with nan -- 42.95 min
Friend's time spent on conversation with nan -- 7.11 min


My time spent on conversation with nan -- 22.83 min
Friend's time spent on conversation with nan -- 34.41 min


My time spent on conversation with nan -- 0.05 min
Friend's time spent on conversation with nan -- 1.07 min


My time spent on conversation with nan -- 24.25 min
Friend's time spent on conversation with nan -- 3.48 min


My time spent on conversation with nan -- 115.4 min
Friend's time spent on conversation with nan -- 11.63 min


My time spent on conversation with nan -- 2.19 min
Friend's time spent on conversation with nan -- 0.33 min


My time spent on conversation with nan -- 36.37 min
Friend's time spent on conversation with nan -- 36.44 min


My time spent on conversation with nan -- 8.27 min
Friend's time spent on conversation with nan -- 59.59 min


My time spent on conversation with nan -- 39.51 min
Friend's time spent on conversation with nan -- 3.71 min




In [52]:
members_statistic_df.to_csv(USER_PATH_TO_SAVE_GENERAL_DF, index=False)
print(f'Check {USER_PATH_TO_SAVE_GENERAL_DF} to see result statistics')
members_statistic_df

Check data\processed_dialog_files\user_stats.csv to see result statistics


Unnamed: 0,user_id,first_name,last_name,username,gender,get_up_hour,go_bed_hour,my_dialog_active_minutes_in_date_range,friend_dialog_active_minutes_in_date_range
0,4.189576e+07,,,,,14.0,20.0,-1,-1
1,5.241456e+07,,,,,10.0,11.0,-1,-1
2,5.411751e+07,,,,,13.0,14.0,-1,-1
3,5.515643e+07,,,,,2.0,2.0,-1,-1
4,5.641309e+07,,,,,6.0,6.0,-1,-1
...,...,...,...,...,...,...,...,...,...
293,1.266328e+09,,,,,-1.0,-1.0,-1,-1
294,1.293020e+09,,,,,-1.0,-1.0,-1,-1
295,1.348358e+09,,,,,6.0,9.0,-1,-1
296,1.353513e+09,,,,,13.0,14.0,-1,-1
