In [1]:
import numpy as np
import pandas as pd
import re

import SubtitleProcessing

In [4]:
def prepare_data_for_manual_labelling(subtitle_files, podcast_names, output_file_name, chunk_size, min_chunk_size = 10):
    """Prepares a csv file for manual sentiment labelling.

    Every subtitle file is split into text chunks via
    ``SubtitleProcessing.generate_text_chunks``. Each chunk becomes one row
    of the output file. The "Coin" and "Sentiment" columns are written empty
    so they can be filled in by hand afterwards.

    Parameters
    ----------
    subtitle_files : list
        Paths to all subtitle files to be used.
    podcast_names : list
        Names of the used podcasts. Ordering must match the subtitle_files list.
        Surplus names beyond the number of subtitle files are ignored.
    output_file_name : str
        Name of the generated output file.
    chunk_size : int
        Word count of the chunks.
    min_chunk_size : int
        Chunks with fewer words will be discarded.

    Raises
    ------
    IndexError
        If fewer podcast names than subtitle files are given.
    """

    columns = ["Podcast_Title", "Start_Time", "End_Time", "Text", "Coin", "Sentiment"]

    # Fail before any file is processed instead of part-way through the loop.
    if len(podcast_names) < len(subtitle_files):
        raise IndexError(
            "podcast_names has %d entries but subtitle_files has %d; "
            "every subtitle file needs a podcast name"
            % (len(podcast_names), len(subtitle_files))
        )

    # Collect one frame per subtitle file and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # a loop copies all previously collected rows on every iteration.
    frames = []
    for subtitle_file, podcast_name in zip(subtitle_files, podcast_names):
        text_chunks, chunk_start_times, chunk_end_times = SubtitleProcessing.generate_text_chunks(subtitle_file, chunk_size, min_chunk_size)

        # Each chunk is a sequence of words; store it as one space-separated string.
        frame = pd.DataFrame({"Text": [" ".join(chunk) for chunk in text_chunks]})
        frame["Start_Time"] = chunk_start_times
        frame["End_Time"] = chunk_end_times
        frame["Podcast_Title"] = podcast_name

        # Files that yield no chunks contribute no rows; skipping them keeps
        # pd.concat free of empty inputs.
        if not frame.empty:
            frames.append(frame)

    if frames:
        # reindex puts the columns into the fixed output order and adds the
        # still-empty "Coin" and "Sentiment" columns.
        df = pd.concat(frames, ignore_index=True).reindex(columns=columns)
    else:
        # pd.concat raises on an empty list; still write a header-only file.
        df = pd.DataFrame(columns=columns)

    df.to_csv(output_file_name, index=False)
            

In [None]:
# Directory that holds the .vtt subtitle files to be labelled.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
folder_path = "C:/Users/Tristan/nlp_project/podcast_data/labelling/"

# Episodes to include, one entry per subtitle file (file name without extension).
# NOTE(review): 2021-06-02 is not in this list — confirm that is intentional.
podcast_names = [
    "altcoin_daily_2021-06-01",
    "altcoin_daily_2021-06-03",
    "altcoin_daily_2021-06-04",
    "altcoin_daily_2021-06-05",
    "altcoin_daily_2021-06-06",
    "altcoin_daily_2021-06-07",
    "altcoin_daily_2021-06-08",
    "altcoin_daily_2021-06-09",
    "altcoin_daily_2021-06-10",
    "altcoin_daily_2021-06-11",
]

# Build the subtitle paths in the same order as podcast_names.
input_files = [folder_path + name + ".vtt" for name in podcast_names]

# Write the labelling sheet; the last argument is chunk_size (30 words per chunk).
prepare_data_for_manual_labelling(
    input_files,
    podcast_names,
    "sentiment_labels_altcoin_daily_210601_to_210611.csv",
    30,
)