# Data Cleaning
This notebook formats the generated data and calculates the statistics of the dataset

In [16]:
import json
import re
import emoji
import glob
import pandas as pd
import nltk
import random
import os

In [None]:
# This function is taken from: https://github.com/cltl-students/ICF-activities-classifier

def clean_conversations_format(conversations):
    
    new_conversations = []
    regex = " {2,}" # remove multiple space
    regex_2 = "\n {1,}\n" # remove irregular \n
    regex_3 = "\n {1,}"
    regex_4 = "[$/<>\"+\\\)(}{]" # remove the simble
    regex_5 = "\n{2,}" # change \n\n to \n and use \n as separator
    regex_6 = "\t{1,}"
    for c in conversations:
        multiple_s = re.findall(regex, c)
        c = re.sub(regex," ", c) # change to one space
        c = re.sub(regex_2, "\n\n",c) #
        c = re.sub(regex_3," ", c) # change to one space
        c = re.sub(regex_4, "", c)
        c = re.sub(regex_5, "\n",c)
        c = re.sub(regex_6, "", c)
        emoji.replace_emoji(c)
        c = emoji.replace_emoji(c)

        new_conversations.append(c)

    return new_conversations


In [18]:
# Remove role labels and gibberish text

def remove_role_and_gibberish_text(new_conversations):
    '''
    This function removes role labels and gibberish text from the conversations.
    '''
    
    list_con_utterances = [] 
    count_utterances = 0

    for con in new_conversations:
        utterances = con.split("\n")
        remove_head_u = []
        skip_conversation = False

        for u in utterances:
            # Check if any sentence length exceeds 20
            if len(u.split()) >= 20:
                skip_conversation = True
                break

            # Check if any word in the sentence exceeds 20 characters
            words = u.split()
            if any(len(word) >= 20 for word in words):
                skip_conversation = True
                break

            # Remove role labels
            if u.startswith("F: "):
                u = u.replace("F: ", "")
            elif u.startswith("F:"):
                u = u.replace("F:", "")
            elif u.startswith("P: "):
                u = u.replace("P: ", "")
            elif u.startswith("P:"):
                u = u.replace("P:", "")
            elif u.startswith("Friend: "):
                u = u.replace("Friend: ", "")
            elif u.startswith("Patient:"):
                u = u.replace("Patient:", "")
            elif u.startswith(" "):
                u = u.replace(" ", "")

            if len(u) > 1:
                remove_head_u.append(u)
        
        if skip_conversation:
            continue

        list_con_utterances.append(remove_head_u)
        count_utterances += len(remove_head_u)
        list_con_utterances.append(["-"*20])

    list_u_sep = []
    for l in list_con_utterances:
        for s in l:
            list_u_sep.append(s)
    
    return list_u_sep, count_utterances


In [19]:
if not os.path.exists('./response_data/clean_conversations'):
    os.makedirs('./response_data/clean_conversations')

dir = './response_data/'
count_all_utterances = 0 
n_file = 0

# Get all the files in the raw_conversations folder
files = glob.glob(f"{dir}raw_conversations/*.json")

# Initiate a dictionary to store the number of conversations under each category
file_conversation_counts = {}

# Iterate through all the files
for f in files:
    n_file += 1  # count the number of files
    with open(f) as infile:
        data = json.load(infile)

    # Create a dict to store clean text
    cleandata = dict()
    for key, value in data.items():
        category = key
        conversations = value

        # Clean the conversations
        new_conversations = clean_conversations_format(conversations)  # list of strings
        
        # Remove roles from the conversations
        list_u_sep, count_utterances = remove_role_and_gibberish_text(new_conversations)
        
        cleandata[category] = new_conversations

        # wirte to tsv file
        df = pd.DataFrame({'col':list_u_sep})
        df.to_csv(f"{dir}clean_conversations/{category}_clean.tsv",sep="\t",index=False,header=["utterances"])
        
        # write to json file
        with open(f"{dir}clean_conversations/{category}_clean.json", "w", encoding="utf-8") as jfile:
            json.dump(cleandata, jfile, ensure_ascii=False, indent=4)
        num_c = len(new_conversations)


        # Count the number of conversations under each category
        count_conversation = 1
        count_utterence = 1
        for utterance in list_u_sep:
            if utterance == "--------------------":
                count_conversation += 1
                continue
            count_utterence += 1

        # Add the count result to the dictionary
        file_conversation_counts[category] = (count_conversation,count_utterence)


# Print results
for category, (count_conversation, count_utterance) in file_conversation_counts.items():
    print(f"Category: {category}, Conversations: {count_conversation}, Utterances: {count_utterance}")

print()

# print(f"finished {n_file} files, total: {sum(file_conversation_counts.values())} conversations, {sum(file_conversation_counts.values())} utterances, ")


print(f"finished {n_file} files, total: {sum([count[0] for count in file_conversation_counts.values()])} conversations, {sum([count[1] for count in file_conversation_counts.values()])} utterances")


Category: mobility, Conversations: 241, Utterances: 2049
Category: domestic life, Conversations: 219, Utterances: 1921
Category: self-care, Conversations: 335, Utterances: 2944

finished 3 files, total: 795 conversations, 6914 utterances


In [20]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
count_token = 0
files_tokens = glob.glob("./response_data/clean_conversations/*.tsv")
for f_token in files_tokens:
    df_token = pd.read_csv(f"{f_token}", sep="\t", encoding="utf-8", on_bad_lines='skip')
    for u in df_token["utterances"]:
        tokens = tokenizer.tokenize(u)
        count_t = len(tokens)
        count_token += count_t
print("totoal tokens of all conversations are", count_token)

totoal tokens of all conversations are 77072


In [21]:
def count_conversations(path_input):
    count = 0
    with open(path_input, "r") as f:
        
        lines = f.readlines()
        # print(lines[:6])
        for line in lines:
            if line.startswith('-'):
                count += 1

        count_lines = len(lines)
    print("total lines:", count_lines)
    print("total conversations :", count)

In [22]:
mobility = "./response_data/raw_conversations/mobility.txt"
selfcare = "./response_data/raw_conversations/self-care.txt"
domestic = "./response_data/raw_conversations/domestic life.txt"

In [23]:
count_conversations(mobility)

total lines: 2765
total conversations : 280


In [24]:
count_conversations(selfcare)

total lines: 4312
total conversations : 420


In [25]:
count_conversations(domestic)

total lines: 2413
total conversations : 240
