In [1]:
import numpy as np
import pandas as pd
import re

import SubtitleProcessing

In [61]:
def prepare_data_for_manual_labelling(subtitle_files, podcast_names, output_file_name, chunk_size, min_chunk_size=10, filter_labels=None, filter_labels_proportions=None, guess_coin=True):
    """Prepare a csv file for manual sentiment labelling.

    Every subtitle file is split into text chunks by
    ``SubtitleProcessing.generate_text_chunks``; each chunk becomes one row
    carrying the podcast title, the chunk start/end time, an automatically
    assigned label and the chunk text. The ``Coin`` and ``Sentiment`` columns
    are left empty for the human labeller (``Coin`` may be pre-filled, see
    ``guess_coin``).

    Parameters
    ----------
    subtitle_files : list
        Paths to all subtitle files to be used.
    podcast_names : list
        Names of the used podcasts. Ordering must match the subtitle_files list.
    output_file_name : str
        Name of the generated output file.
    chunk_size : int
        Word count of the chunks (passed to ``generate_text_chunks``).
    min_chunk_size : int
        Chunks with less words will be discarded (passed to ``generate_text_chunks``).
    filter_labels : list, optional
        Keep text chunks auto labelled with these labels. ``None`` keeps all chunks.
    filter_labels_proportions : list, optional
        Determines how the resulting data will be proportioned based on the auto labels.
        If there is too much data for specific labels this data will be discarded.
        Only used together with ``filter_labels``; ``None`` skips the balancing.
    guess_coin : bool
        Already sets the 'Coin' field based on the Auto_Label

    Raises
    ------
    ValueError
        If fewer podcast names than subtitle files are given.
    """

    if len(podcast_names) < len(subtitle_files):
        raise ValueError("podcast_names must contain one entry per subtitle file")

    columns = ["Podcast_Title", "Start_Time", "End_Time", "Auto_Label", "Text", "Coin", "Sentiment"]
    frames = []

    for subtitle_file, podcast_name in zip(subtitle_files, podcast_names):
        # Get text chunks and their corresponding start and end times
        text_chunks, chunk_start_times, chunk_end_times = SubtitleProcessing.generate_text_chunks(subtitle_file, chunk_size, min_chunk_size)

        # Fill the data in a per-file dataframe
        df_new = pd.DataFrame(columns=columns)

        text_chunks = [" ".join(chunk) for chunk in text_chunks]
        df_new["Text"] = text_chunks

        df_new["Start_Time"] = chunk_start_times
        df_new["End_Time"] = chunk_end_times

        df_new["Podcast_Title"] = podcast_name

        df_new["Auto_Label"] = [SubtitleProcessing.auto_label_text_chunk_default_labels(t) for t in text_chunks]

        # Files that produced no chunks contribute nothing; skipping them keeps
        # empty frames out of the concatenation below.
        if len(df_new) > 0:
            frames.append(df_new)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    # Filter and proportion data (both steps are optional)
    if filter_labels is not None:
        if filter_labels_proportions is not None:
            df = filter_and_balance_by_auto_labels(df, filter_labels, filter_labels_proportions)
        else:
            df = filter_by_auto_labels(df, filter_labels).reset_index(drop=True)

    if guess_coin:
        # Use .loc: chained assignment (df["Coin"][mask] = ...) may write to a
        # temporary copy and leave df unchanged.
        coin_rows = df["Auto_Label"].isin(["BTC", "ETH", "DOGE"])
        df.loc[coin_rows, "Coin"] = df.loc[coin_rows, "Auto_Label"]

    df.to_csv(output_file_name, index=False)


def filter_by_auto_labels(df, filter_labels):
    """Return the rows of ``df`` whose ``Auto_Label`` is one of ``filter_labels``.

    Parameters
    ----------
    df : DataFrame
        Data with an ``Auto_Label`` column.
    filter_labels : list
        Labels to keep.

    Returns
    -------
    DataFrame
        The matching rows, in their original order and with their original index.
    """

    keep_mask = df["Auto_Label"].isin(filter_labels)
    return df[keep_mask]


def filter_and_balance_by_auto_labels(df, filter_labels, filter_labels_proportions, random_state=None):
    """Keep only the given auto labels and down-sample them to target proportions.

    Rows whose ``Auto_Label`` is not in ``filter_labels`` are removed. The
    label that has the least data relative to its target proportion keeps all
    of its rows and fixes the size of the result; for every other label,
    randomly chosen surplus rows are discarded until its row count matches its
    proportion of that size (rounded down). Consequently, if a label with a
    positive proportion has no rows at all, the result is empty.

    Parameters
    ----------
    df : DataFrame
        Data with an ``Auto_Label`` column.
    filter_labels : list
        Keep rows auto labelled with these labels.
    filter_labels_proportions : list
        Target share of each label in the result, in the same order as
        ``filter_labels``. A share of 0 removes the label entirely.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the choice of discarded rows
        reproducible. ``None`` (default) samples non-deterministically.

    Returns
    -------
    DataFrame
        The balanced rows in their original relative order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If fewer proportions than labels are given, a proportion is negative,
        or no proportion is positive.
    """

    labels = list(filter_labels)
    proportions = list(filter_labels_proportions)

    if len(proportions) < len(labels):
        raise ValueError("filter_labels_proportions must contain one entry per label")
    proportions = proportions[:len(labels)]
    if any(share < 0 for share in proportions):
        raise ValueError("filter_labels_proportions must not contain negative values")

    # Keep only the requested labels. Resetting the index makes the positional
    # drop below safe even if the incoming frame has duplicate index values.
    df = df[df["Auto_Label"].isin(labels)].reset_index(drop=True)
    if not labels or df.empty:
        return df

    counts = [int((df["Auto_Label"] == label).sum()) for label in labels]

    # Result size each label could support on its own (rows / target share).
    # The smallest one belongs to the label with the least data relative to
    # its target and therefore bounds the size of the balanced result.
    supported_totals = [count / share for count, share in zip(counts, proportions) if share > 0]
    if not supported_totals:
        raise ValueError("at least one proportion must be positive")
    new_total_rows = min(supported_totals)

    # Discard data of over represented labels
    surplus_index = []
    for label, count, share in zip(labels, counts, proportions):
        # The small epsilon guards against float error such as 2.9999999999999996
        # being truncated to 2 and costing the limiting label one of its rows.
        target_row_count = min(count, int(new_total_rows * share + 1e-9))
        surplus = count - target_row_count
        if surplus > 0:
            label_rows = df[df["Auto_Label"] == label]
            # Pick random rows to drop
            surplus_index.extend(label_rows.sample(n=surplus, random_state=random_state).index)

    return df.drop(surplus_index).reset_index(drop=True)

In [62]:
# Build the labelling csv for the Altcoin Daily episodes listed below.
folder_path = "C:/Users/Tristan/nlp_project/podcast_data/labelling/"
podcast_names = [
    "altcoin_daily_2021-06-01",
    "altcoin_daily_2021-06-03",
    "altcoin_daily_2021-06-04",
    "altcoin_daily_2021-06-05",
    "altcoin_daily_2021-06-06",
    "altcoin_daily_2021-06-07",
    "altcoin_daily_2021-06-08",
    "altcoin_daily_2021-06-09",
    "altcoin_daily_2021-06-10",
    "altcoin_daily_2021-06-11",
]

# One .vtt subtitle file per podcast, in the same order as podcast_names.
input_files = [folder_path + name + ".vtt" for name in podcast_names]

prepare_data_for_manual_labelling(
    input_files,
    podcast_names,
    "sentiment_labels_altcoin_daily_210601_to_210611.csv",
    30,
    filter_labels=["BTC", "ETH", "DOGE", "crypto_space", "None"],
    filter_labels_proportions=[.23, .23, .23, .23, .08],
)