In [None]:
import pandas as pd
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from textblob import TextBlob
from datetime import datetime, timezone, timedelta

In [None]:
# Load the dialogs CSV into a DataFrame. The code below reads its
# 'preprocessed_message', 'dialog_language', 'from_id', 'dialog ID',
# 'subdialog_id', 'date' and 'message' columns.
data = pd.read_csv("data/processed_dialog_files/general_dialogs_df.csv")



In [None]:
# TODO: Move these functions to utils/ 

# ______________________________________

def get_ua_tonality_dict(dict_path="dicts/tone-dict-ua.csv"):
    """
    Load the Ukrainian tonality dictionary from a comma-separated file.

    Keys are taken from the second column of the file and values
    from the fourth column (both selected by position, not by name).

    Returns a plain python dict.
    """
    frame = pd.read_csv(dict_path)
    words = frame.iloc[:, 1]
    tones = frame.iloc[:, 3]

    return {word: tone for word, tone in zip(words, tones)}

def get_ru_tonality_dict(dict_path="dicts/tone-dict-ru.csv"):
    """
    Load the Russian tonality dictionary from a semicolon-separated file.

    Keys are taken from the first column of the file and values
    from the third column (both selected by position, not by name).

    Returns a plain python dict.
    """
    frame = pd.read_csv(dict_path, sep=";")
    words = frame.iloc[:, 0]
    tones = frame.iloc[:, 2]

    return {word: tone for word, tone in zip(words, tones)}


def get_day_and_hour(date):
    """
    Convert an ISO-formatted date string into its hour and its
    ISO weekday number (Monday is 1, Sunday is 7).

    Returns a dict with the keys 'hour' and 'day'.
    """
    parsed = datetime.fromisoformat(date)
    weekday_number = parsed.isoweekday()

    return dict(hour=parsed.hour, day=weekday_number)


def get_week_day_from_number(week_day_num):
    """
    Translate a weekday number (1 = Monday ... 7 = Sunday)
    into the English name of that day.

    Raises KeyError for any number outside 1-7.
    """
    day_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday")
    names_by_number = dict(enumerate(day_names, start=1))

    return names_by_number[week_day_num]

# __________________________________


# Load both tonality dictionaries once, up front; calculate_msg_sentiment
# reads these module-level names for every "ua" / "ru" message.
tonality_dict_ua = get_ua_tonality_dict()
tonality_dict_ru = get_ru_tonality_dict()


def calculate_msg_sentiment(msg, lang):
    """
    Calculate dictionary-based sentiment of a particular message.

    For "en" the TextBlob polarity of the message (rounded to 4 digits)
    is returned. For "ua" and "ru" the message is split on whitespace,
    every lower-cased token is looked up in the matching tonality
    dictionary, and the average over the tokens that have a non-zero
    score is returned.

    Returns 0 (neutral) when:
      * msg is not a string (e.g. a missing value),
      * lang is not one of "en", "ua", "ru",
      * no token of the message has a non-zero dictionary score.
    """
    if not isinstance(msg, str):
        return 0

    if lang == "en":
        return round(TextBlob(msg).sentiment.polarity, 4)
    elif lang == "ua":
        tonality_dict = tonality_dict_ua
    elif lang == "ru":
        tonality_dict = tonality_dict_ru
    else:
        # No dictionary exists for this language. Treat the message as
        # neutral instead of failing on an unbound `tonality_dict` below.
        return 0

    scores = []
    for token in msg.split():
        token_sentiment = tonality_dict.get(token.lower(), 0)
        # Unknown words and words scored 0 do not take part in the average.
        if token_sentiment:
            scores.append(token_sentiment)

    if not scores:
        return 0

    return sum(scores) / len(scores)



def add_dialog_sentiment(data):
    """
    Score every message of the dataframe and store the result
    in the 'msg_sentiment' column.

    The score of a row is calculate_msg_sentiment applied to its
    'preprocessed_message' and 'dialog_language' values.

    The passed dataframe is modified in place and is also returned.

    Supported languages: eng, ru, ua
    """
    def score_row(row):
        message = row['preprocessed_message']
        language = row['dialog_language']
        return calculate_msg_sentiment(message, language)

    data['msg_sentiment'] = data.apply(score_row, axis=1)

    return data


# Save the dataframe with the sentiment column to a CSV file (uncomment to run):

# result_path="data/processed_dialog_files/general_dialogs_sentiment.csv"
# add_dialog_sentiment(data).to_csv(result_path, index=False)
    

def calculate_avg_subdialog_sentiment(data):
    """
    Calculate average sentiment for each subdialog in a dataframe,
    save results in a dictionary in form of:

    {*USER_ID* : {*DIALOG_ID* : {*SUBDIALOG_ID* : *AVG_SENTIMENT*, ... }, ... }, ... }

    Reads the 'from_id', 'dialog ID', 'subdialog_id' and 'msg_sentiment'
    columns of the dataframe. Every message counts towards the average
    of its (user, dialog, subdialog) group, including messages that
    share the same score. Averages are rounded to 3 digits.

    Return dict
    """
    scores_by_subdialog = {}

    rows = zip(data['from_id'], data['dialog ID'],
               data['subdialog_id'], data['msg_sentiment'])

    for sender_id, dialog_id, subdialog_id, sentiment in rows:
        dialogs = scores_by_subdialog.setdefault(sender_id, {})
        subdialogs = dialogs.setdefault(dialog_id, {})
        # A list (not a set) keeps repeated scores, so messages with an
        # equal sentiment are not collapsed into one before averaging.
        subdialogs.setdefault(subdialog_id, []).append(sentiment)

    avg_sentiment_dict = {}

    for sender_id, dialogs in scores_by_subdialog.items():
        avg_sentiment_dict[sender_id] = {}
        for dialog_id, subdialogs in dialogs.items():
            avg_sentiment_dict[sender_id][dialog_id] = {
                subdialog_id: round(sum(scores) / len(scores), 3)
                for subdialog_id, scores in subdialogs.items()
            }

    return avg_sentiment_dict

        



In [None]:

def calculate_sentiment_by_hour_and_day(data, avg_sentiment_dict):
    """
    Calculate sentiment by day of the week and hour
    for each user.

    Sentiment for a particular message is the average sentiment
    of its dialog and subdialog (data from avg_sentiment_dict).
    Within one user / week day / hour every subdialog is counted
    once, no matter how many of its messages fall into that hour,
    and the value of the hour is the mean over those subdialogs
    rounded to 3 digits. Hours without messages get 0.

    Reads the 'date' (ISO-formatted string), 'from_id', 'dialog ID'
    and 'subdialog_id' columns of the dataframe.

    Return dict with data ready for visualization:

    {*USER_ID* : {*WEEK_DAY (1-7)* : {*HOUR (0-23)* : *SENTIMENT*, ... }, ... }, ... }

    Only week days on which the user wrote at least one message
    are present.
    """
    # {sender: {week day: {hour: {(dialog, subdialog): avg sentiment}}}}
    collected = {}

    rows = zip(data['date'], data['from_id'],
               data['dialog ID'], data['subdialog_id'])

    for date, sender_id, dialog_id, subdialog_id in rows:
        moment = datetime.fromisoformat(date)
        day_of_week = moment.isoweekday()
        sentiment = avg_sentiment_dict[sender_id][dialog_id][subdialog_id]

        days = collected.setdefault(sender_id, {})
        if day_of_week not in days:
            days[day_of_week] = {hr: {} for hr in range(24)}

        # Keyed by subdialog rather than stored in a set of values:
        # two different subdialogs with the same average must both count.
        days[day_of_week][moment.hour][(dialog_id, subdialog_id)] = sentiment

    sentiment_by_time = {}

    for sender_id, days in collected.items():
        sentiment_by_time[sender_id] = {}
        for day_of_week, hours in days.items():
            hour_data = {}
            for hour, per_subdialog in hours.items():
                if per_subdialog:
                    total = sum(per_subdialog.values())
                    hour_data[hour] = round(total / len(per_subdialog), 3)
                else:
                    hour_data[hour] = 0
            sentiment_by_time[sender_id][day_of_week] = hour_data

    return sentiment_by_time

# Add the 'msg_sentiment' column to the dataframe
# (`data` itself is modified and returned)
sentiment_data_for_each_msg = add_dialog_sentiment(data)

# Average sentiment for each subdialog; needs the 'msg_sentiment'
# column added above, so the scored dataframe is passed in
avg_sentiment_data = calculate_avg_subdialog_sentiment(sentiment_data_for_each_msg)

# Sentiment data divided by hours, ready for visualization
sentiment_data_by_hours = calculate_sentiment_by_hour_and_day(data, avg_sentiment_data)


In [None]:



def visualize_user_sentiment(sentiment_data, user_id):
    """
    Visualize sentiment data for a specific user
    in a timeline (linear graph), starting at 00:00 on Monday,
    finishing at 23:59 on Sunday.

    sentiment_data -- {user_id: {week_day (1-7): {hour (0-23): sentiment}}}
    user_id        -- key of the user to plot; KeyError if it is missing

    One subplot is drawn per week day. A week day that is missing from
    the user's data is drawn as a flat line at 0 for all 24 hours.
    """
    user_sent_data = sentiment_data[user_id]

    day_numbers = range(1, 8)
    day_names = tuple(get_week_day_from_number(num) for num in day_numbers)

    fig = make_subplots(rows=1, cols=7,
                        subplot_titles=day_names,
                        shared_yaxes=True,
                        x_title="Hours",
                        y_title="Sentiment",
                        horizontal_spacing=0.005)

    fig.update_layout(title_text=f"Weekly sentiment analysis for {user_id}", height=450)

    for day_of_week, day_name in zip(day_numbers, day_names):

        # Days on which the user never wrote are absent from the data;
        # fall back to an all-zero day instead of raising KeyError.
        hour_data = user_sent_data.get(day_of_week)
        if hour_data is None:
            hour_data = {hr: 0 for hr in range(24)}

        fig.add_scatter(x=list(hour_data.keys()),
                        y=list(hour_data.values()),
                        name=day_name,
                        row=1, col=day_of_week)

    fig.show()
    

    
# Show the weekly sentiment chart for one hard-coded user id.
visualize_user_sentiment(sentiment_data_by_hours, 511986933)

In [None]:
# Not sure whether the code below will ever be used

In [None]:
def is_date_after(check_date, days_ago = 365):
    """
    Check if {check_date} is later than the moment {days_ago} days
    before the current UTC time.

    check_date -- datetime to test; a naive value (without timezone
                  information) is interpreted as UTC
    days_ago   -- size of the look-back window in days

    Only the lower bound is checked: a date in the future also
    returns True.
    """
    if check_date.utcoffset() is None:
        # The cut-off below is timezone-aware, and python raises TypeError
        # when a naive datetime is compared with an aware one.
        check_date = check_date.replace(tzinfo=timezone.utc)

    oldest_allowed = datetime.now(timezone.utc) - timedelta(days=days_ago)

    return check_date > oldest_allowed


def divide_and_prepare_data(data, days_ago = 365):
    """
    Divide messages by User and Day of the week.

    data     -- dataframe with the 'date' (ISO-formatted string),
                'from_id' and 'message' columns
    days_ago -- messages older than this many days are left out
                (checked with is_date_after)

    Returns {user_id: {week_day (1-7): set of message values}}.
    Every user that has at least one recent message gets all seven
    week days, the ones without messages holding an empty set.
    """
    prepped_data = {}
    for index, row in data.iterrows():

        msg_date = datetime.fromisoformat(row['date'])
        if not is_date_after(msg_date, days_ago):
            # Skip just this message. Stopping the whole loop here would
            # drop every later row whenever the rows are not ordered
            # from newest to oldest.
            continue

        sender_id = row['from_id']
        if sender_id not in prepped_data:
            prepped_data[sender_id] = {
                day_num : set() for day_num in range(1, 8)
            }

        msg_week_day = msg_date.isoweekday()
        prepped_data[sender_id][msg_week_day].add(row['message'])

    return prepped_data
    
    
def sentiment_text_analysis(data, sentiment_dict_path="dicts/tone-dict-uk.tsv"):
    """
    Sum up the dictionary sentiment of the words each user wrote,
    separately for every day of the week.

    The messages come from divide_and_prepare_data(data) and the word
    scores from tsv_to_dict(sentiment_dict_path).
    NOTE(review): tsv_to_dict is not defined in this part of the file -
    confirm it is available before running.

    Returns {user_id: {week_day (1-7): summed score}}.

    P.S Words that are not in word_dict and messages that are not
    strings are perceived as Neutral (they add 0 to the score)
    """
    messages_by_user = divide_and_prepare_data(data)
    word_dict = tsv_to_dict(sentiment_dict_path)

    sentiment_result = {}

    for user_id, messages_by_day in messages_by_user.items():

        day_scores = {day_num : 0 for day_num in range(1, 8)}

        for day_number, messages in messages_by_day.items():

            score = 0

            for msg in messages:
                if not isinstance(msg, str):
                    continue

                for token in msg.strip().split():
                    word = token.lower()
                    if word in word_dict:
                        score += word_dict[word]

            day_scores[day_number] = score

        sentiment_result[user_id] = day_scores

    return sentiment_result
    
    
def plot_sentiment_analysis_results(result_data):
    """
    Show one bar chart per user with the sentiment score
    of each day of the week.

    result_data -- {user_id: {week_day_number: score}}; the week day
                   numbers become the bar positions and are labelled
                   "Mon" ... "Sun" in order

    Drawn with plotly (imported as `go` at the top of the file),
    the same library the weekly timeline chart uses.
    """
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    for user_id, sentiment_info in result_data.items():

        positions = list(sentiment_info.keys())
        sentiment_values = list(sentiment_info.values())

        fig = go.Figure(go.Bar(x=positions, y=sentiment_values))
        fig.update_layout(
            title_text=f"Sentiment for user with ID {user_id}",
            yaxis_title='Relative sentiment of messages',
            width=1000,
            height=800,
        )
        # Put the day names on the numeric bar positions.
        fig.update_xaxes(tickmode="array", tickvals=positions, ticktext=days)
        fig.show()


In [None]:
# Score the messages per user and week day, then show one bar chart per user.
result = sentiment_text_analysis(data)
plot_sentiment_analysis_results(result)