In [15]:
import pandas as pd 
import re

tweets = pd.read_csv("all_tweets.csv")

for i in range(len(tweets['text'])):
    tweets.loc[i, 'text'] = re.sub(r'@\S+', '@user', tweets['text'][i])

tweets.to_csv('masked_all_tweets.csv', index=False)


In [16]:
import json
import csv

def jsonl_to_csv(jsonl_file, csv_file):
    with open(jsonl_file, 'r', encoding='utf-8') as infile, open(csv_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["speaker", "text"])
        
        for i, line in enumerate(infile):
            dialogue = json.loads(line.strip())
            if not dialogue:
                continue
            writer.writerow([f"Dialogue{i}", ""])
            speakers = {}
            j = 0
            for utterance in dialogue['conversation']:
                user = utterance['speaker']
                if user not in speakers:
                    speakers[user] = j
                    j += 1
                
                speaker = f"Speaker_{speakers[user]}"
                text = utterance['utterance']
                writer.writerow([speaker, text])

# Specify the input JSONL file and the output CSV file
jsonl_file = '500conv_global_reddit_with_index.jsonl'
csv_file = '500conv_global_reddit_with_index.csv'

# Convert JSONL to CSV
jsonl_to_csv(jsonl_file, csv_file)


In [4]:
import json
import csv
import spacy

'''
Converst jsonl to csv file. Breaks up sentences in text and join them if not exceeding token limit.
'''
def jsonl_to_csv_with_token_limit(jsonl_file, csv_file, token_limit=100):
    # Load the spaCy model
    nlp = spacy.load('en_core_web_sm')
    
    def split_into_sentences(text):
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
        segments = []
        cur_segment = ""
        cur_len = 0
        for sentence in sentences:
            k = len(nlp(sentence)) 
            if k + cur_len <= token_limit:
                cur_segment += sentence
                cur_len += k
            else:
                cur_len = k
                segments.append(cur_segment)
                cur_segment = sentence
        

        if cur_segment:
            # cur_segment = cur_segment[:-4]
            segments.append(cur_segment)

        return segments

    num_of_rows = 0

    with open(jsonl_file, 'r', encoding='utf-8') as infile, open(csv_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["speaker", "text"])
        
        for i, line in enumerate(infile):
            dialogue = json.loads(line.strip())
            if not dialogue:
                continue
            writer.writerow([f"Dialogue{i}", ""])
            speakers = {}
            j = 0
            for utterance in dialogue['conversation']:
                user = utterance['speaker']
                if user not in speakers:
                    speakers[user] = j
                    j += 1
                
                speaker = f"Speaker_{speakers[user]}"
                text = utterance['utterance']
                segments = split_into_sentences(text)
                for segment in segments:
                    num_of_rows += 1
                    writer.writerow([speaker, segment])
                
    print("Number of rows:", num_of_rows)

# Specify the input JSONL file and the output CSV file
jsonl_file = 'qc_with_index.jsonl'
csv_file = 'qc_with_token_limit2.csv'

# Convert JSONL to CSV with token limit
jsonl_to_csv_with_token_limit(jsonl_file, csv_file)


Number of rows: 833


In [12]:
import pandas as pd
import spacy

qc = pd.read_csv("qc_with_index.csv")
nlp = spacy.load('en_core_web_sm')
TOKEN_LIMIT = 100

def is_lengthy(text):
    if not text:
        return 0
    try:
        length = len(nlp(text))
        if length > TOKEN_LIMIT:
            return 1
        return 0
    except:
        print(text)
qc['text'] = qc['text'].fillna(" ")
qc['is_lengthy'] = qc['text'].map(lambda x: is_lengthy(x))
qc.to_csv("qc_with_is_lengthy.csv", index=False)


In [10]:
qc['text'].fillna(" ")

0                                                       
1             Singapore is ruled by the WEF and not PAP.
2      In the 1980-2000, we were ruled by the PAP. No...
3      Going by OP logic, much of the world is ruled ...
4                                                       
                             ...                        
360                                                     
361    Hydrogen discussions would be more valuable if...
362    This sub seems to have "hydrogen" up and down ...
363    I only have a problem with being mind fucked b...
364    Bro did you forgot to take your pills this mor...
Name: text, Length: 365, dtype: object

In [19]:
import json
import csv
import spacy
import re

'''
Converts the jsonl file into csv file format. It first splits the text into paragarphs from using \n\n, 
then combines paragraphs if not exceeding token limit. 

Inputs: path to jsonl file and output path of csv file
output: csv file stored at the output path
returns: nothing
prints: the number of rows of text in the output csv
'''
def jsonl_to_csv_with_token_limit2(jsonl_file, csv_file, token_limit=100):
    # Load the spaCy model
    nlp = spacy.load('en_core_web_sm')
    
    def split_into_sentences(text):
        # paragraphs = re.split('\\n\\n|\\n>\s*\\n', text)
        paragraphs = text.split("\n\n")
        segments = []
        cur_segment = ""
        cur_len = 0

        for paragraph in paragraphs:
            k = len(nlp(paragraph))
            
            if k + cur_len <= token_limit:
                cur_segment += '\n\n'
                cur_segment += paragraph
                cur_len += k
            else:
                cur_len = k
                if cur_segment:
                    segments.append(cur_segment.strip())
                cur_segment = paragraph            

        if cur_segment:
            segments.append(cur_segment.strip())

        return segments

    num_of_rows = 0

    with open(jsonl_file, 'r', encoding='utf-8') as infile, open(csv_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["speaker", "text"])
        
        for i, line in enumerate(infile):
            dialogue = json.loads(line.strip())
            if not dialogue:
                continue
            writer.writerow([f"Dialogue{i}", ""])
            speakers2id = {}
            nextId = 0
            for utterance in dialogue['conversation']:
                user = utterance['speaker']
                if user not in speakers2id:
                    speakers2id[user] = nextId
                    nextId += 1
                
                speaker = f"Speaker_{speakers2id[user]}"
                text = utterance['utterance']
                segments = split_into_sentences(text)
                for segment in segments:
                    num_of_rows += 1
                    writer.writerow([speaker, segment])
                
    print("Number of rows:", num_of_rows)

# # Specify the input JSONL file and the output CSV file
# jsonl_file = 'shuffled_CandD.jsonl'
# csv_file = 'shuffled_CandD_limit_paragraph.csv'

# # Convert JSONL to CSV with token limit
# jsonl_to_csv_with_token_limit2(jsonl_file, csv_file)


In [20]:
import json
import csv
import spacy


def split_annotated_csv_by_paragraph(input_csv, output_csv, token_limit=100):
    # Load the spaCy model
    nlp = spacy.load('en_core_web_sm')
    
    def split_into_paragraphs(text):
        paragraphs = text.split("\n\n")
        segments = []
        cur_segment = ""
        cur_len = 0

        for paragraph in paragraphs:
            k = len(nlp(paragraph))
            
            if k + cur_len <= token_limit:
                cur_segment += '\n\n'
                cur_segment += paragraph
                cur_len += k
            else:
                cur_len = k
                if cur_segment:
                    segments.append(cur_segment.strip())
                cur_segment = paragraph            

        if cur_segment:
            segments.append(cur_segment.strip())

        return segments

    number_of_rows = 0

    with open(input_csv, 'r', encoding='utf-8') as infile, open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for row in reader:
            # print(row)
            # if 'text' not in row:
            #     writer.writerow(row)
            #     continue
            text = row['text']
            split_texts = split_into_paragraphs(text)
            if len(split_texts) <= 1:
                if len(split_texts) == 1:
                    number_of_rows += 1
                writer.writerow(row)
            else:
                for split_text in split_texts:
                    new_row = {key: (row['speaker'] if key == 'speaker' else '') for key in row}
                    new_row['text'] = split_text
                    number_of_rows += 1
                    writer.writerow(new_row)

    print(f'Number of text rows: {number_of_rows}')
# Specify the input JSONL file and the output CSV file
jsonl_file = 'qc_dialog_annotated_before_split.csv'
csv_file = 'qc_with_token_limit_paragraph_annotated.csv'

# Convert JSONL to CSV with token limit
split_annotated_csv_by_paragraph(jsonl_file, csv_file)

Number of text rows: 865


In [4]:
import json
import spacy
import csv

# create an unique id for each conversation by concatenating filename and index in jsonl file
# given a json dialog object, returns the id in string
def dialog_to_id(dialog):
    return dialog["filename"] + dialog["index"]

nlp = spacy.load('en_core_web_sm')

# splits the text into a single paragraph by \n\n or paragraphs not exceeding token limit
def split_into_paragraphs(text, token_limit=100):
        paragraphs = text.split("\n\n")
        segments = []
        cur_segment = ""
        cur_len = 0

        for paragraph in paragraphs:
            k = len(nlp(paragraph))
            
            if k + cur_len <= token_limit:
                cur_segment += '\n\n'
                cur_segment += paragraph
                cur_len += k
            else:
                cur_len = k
                if cur_segment:
                    segments.append(cur_segment.strip())
                cur_segment = paragraph            

        if cur_segment:
            segments.append(cur_segment.strip())

        return segments


In [16]:

# set_E_ids = set()

# # select top 22 converstations from E, which contains 305 utterances each with 1 paragraph or paragraphs <= 100 tokens
# # add these convo id into the set_E_ids
# with open('qc_with_index.jsonl', 'r', encoding='utf-8') as infile:
#     for i, line in enumerate(infile):
#         dialogue = json.loads(line.strip())
#         if not dialogue:
#             continue
#         dial_id = dialog_to_id(dialogue)
#         set_E_ids.add(dial_id)

# jsonl_file = 'shuffled_CandD.jsonl'
# csv_file = 'shuffled_CandD_limit_paragraph.csv'

# # Convert JSONL to CSV with token limit
# jsonl_to_csv_with_token_limit2(jsonl_file, csv_file)

avg_conv_len = 6547 / 600
max_conv = 3000 / avg_conv_len
max_new_C_conv = max_conv / 6 + 1
max_new_D_conv = max_conv * 5 / 6 + 1
new_C_convo_size = 0
new_D_convo_size = 0
C_utterance_size = 0
D_utterance_size = 0
max_C_utterance_size = 3000 / 6
max_D_utterance_size = 3000 * 5 / 6

########################## FOR REFERENCES: ############################################
## new E will be the top 22 convo of qc_with_index which contains 305 utterances, this will be the new QC dataset
## C: 100 conv
## D: 500 conv


num_of_rows = 0
C_index = set()  # index of convo belonging to C
D_index = set()  # index of convo belonging to D
# get the index of convo in new E that belongs to each dataset
with open('qc_with_index.jsonl', 'r', encoding='utf-8') as infile:
    for i, line in enumerate(infile):
        if i >= 22:  # only selects the top 22 convo for new E
            break
        dialogue = json.loads(line.strip())
        if not dialogue:
            continue
        if 'singapore' in dialogue['filename']:
            C_index.add(i)
        else:
            D_index.add(i)
print("Set C:", C_index)
print("Set D:", D_index)
# split into 2 sets, annotation and QC
# first count the number of utterances in new E that belongs to C and D respectively
# This uses the qc_with_token_limit_paragraph (matches the annotated version) and saves the top 22 convo into qc_top_22_convo.csv
with open('qc_with_token_limit_paragraph.csv', 'r', encoding='utf-8') as infile, open('qc_top_22_convo.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    cur_dialog_num = -1
    for row in reader:
        if row['speaker'] == 'Dialogue22':  # new E only contains the top 22 convo
            break
        text = row['text']
        writer.writerow(row)
        
            
        if not text:
            cur_dialog_num += 1
            if cur_dialog_num in C_index:
                new_C_convo_size += 1
            else:
                new_D_convo_size += 1
        else:
            if cur_dialog_num in C_index:
                C_utterance_size += 1
            else:
                D_utterance_size += 1

# print("Number of utterances in new E:", num_of_rows)
print("Number of convo in E belonging to C:", new_C_convo_size)
print("Number of convo in E belonging to D:", new_D_convo_size)
print("Number of utterances from C:", C_utterance_size)
print("Number of utterances from D:", D_utterance_size)



Set C: {0, 1, 3, 7, 15}
Set D: {2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21}
Number of convo in E belonging to C: 5
Number of convo in E belonging to D: 17
Number of utterances from C: 139
Number of utterances from D: 166


In [11]:

# now pick from shuffled_CandD_without_E until we reach max conv size for each group or max utterance size
# (top 22 of shuffled_CandD makes up new E, so the above file is obtained by deleting top 22 rows of shuffled_CandD)
cur_utterance_size = 305
MAX_UTERRANCE_SIZE = 3000

C_index = set()
D_index = set()
with open('shuffled_CandD_without_E.jsonl', 'r', encoding='utf-8') as infile, open('new_CandD_energy.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["speaker", "text", "emotion", "sentiment"])
    
    for i, line in enumerate(infile):
        if cur_utterance_size >= 3000 or (new_C_convo_size >= max_new_C_conv and new_D_convo_size >= max_new_D_conv):
            break
        dialogue = json.loads(line.strip())
        if not dialogue:
            continue
        
        can_be_added = None
        if 'singapore' in dialogue['filename'] and new_C_convo_size < max_new_C_conv:
            C_index.add(i)
            new_C_convo_size += 1
            can_be_added = 'C'
        elif 'singapore' not in dialogue['filename'] and new_D_convo_size < max_D_utterance_size:
            D_index.add(i)
            new_D_convo_size += 1 
            can_be_added = 'D'
        
        if not can_be_added:
            continue

        writer.writerow([f"Dialogue{i}", ""])
        speakers2id = {}
        nextId = 0
        for utterance in dialogue['conversation']:
            user = utterance['speaker']
            if user not in speakers2id:
                speakers2id[user] = nextId
                nextId += 1
            
            speaker = f"Speaker_{speakers2id[user]}"
            text = utterance['utterance']
            segments = split_into_paragraphs(text)
            for segment in segments:
                cur_utterance_size += 1
                if can_be_added == 'C':
                    C_utterance_size += 1
                else:
                    D_utterance_size += 1
                writer.writerow([speaker, segment, "", ""])
                
print("Number of utterances:", cur_utterance_size)
print("Number of convo belonging to C:", new_C_convo_size)
print("Number of convo belonging to D:", new_D_convo_size)
print("Number of utterances from C:", C_utterance_size)
print("Number of utterances from D:", D_utterance_size)
print("C_index:", sorted(list(C_index)))
print("D_index:", sorted(list(D_index)))

Number of utterances: 3000
Number of convo belonging to C: 42
Number of convo belonging to D: 248
Number of utterances from C: 564
Number of utterances from D: 2436
C_index: [7, 9, 14, 22, 24, 38, 39, 41, 59, 62, 69, 71, 75, 77, 81, 93, 103, 114, 128, 130, 133, 144, 149, 154, 181, 182, 186, 196, 212, 222, 223, 225, 226, 238, 239, 245, 259]
D_index: [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 63, 64, 65, 66, 67, 68, 70, 72, 73, 74, 76, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 129, 131, 132, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 

In [9]:
cur_utterance_size = 0
MAX_UTERRANCE_SIZE = 3000

new_C_convo_size = 0
new_D_convo_size = 0
cur_utterance_size = 0
C_utterance_size = 0
D_utterance_size = 0
E_from_C = 0
E_from_D = 0
with open('shuffled_CandD.jsonl', 'r', encoding='utf-8') as infile, open('new_C.jsonl', 'w', encoding='utf-8') as new_C, open('new_D.jsonl', 'w', encoding='utf-8') as new_D:
    for i, line in enumerate(infile):

        if cur_utterance_size >= 3000 or (new_C_convo_size >= max_new_C_conv and new_D_convo_size >= max_new_D_conv):
            break
        dialogue = json.loads(line.strip())
        if not dialogue:
            continue
        
        add_to = None
        if 'singapore' in dialogue['filename'] and new_C_convo_size < max_new_C_conv:
            # C_index.add(i)
            new_C_convo_size += 1
            new_C.write(line)
            add_to = 'C'
        elif 'singapore' not in dialogue['filename'] and new_D_convo_size < max_D_utterance_size:
            # D_index.add(i)
            new_D_convo_size += 1 
            new_D.write(line)
            add_to = 'D'

        for utterance in dialogue['conversation']:
            text = utterance['utterance']
            segments = split_into_paragraphs(text)
            for segment in segments:
                cur_utterance_size += 1
                if add_to == 'C':
                    if i < 22:
                        E_from_C += 1
                    C_utterance_size += 1
                else:
                    if i < 22:
                        E_from_D += 1
                    D_utterance_size += 1
            
print("C convo size:", new_C_convo_size)
print("D convo size", new_D_convo_size)
print("C utterance size", C_utterance_size)
print("D utterance size", D_utterance_size)

C convo size: 43
D convo size 248
C utterance size 571
D utterance size 2434


In [22]:
import json
import random

# Combine 100 conv (C) and 500 conv (D) and shuffle to get the original pool of qc

# Paths for input and output files
input_file_path = 'drone_reddit.jsonl'
output_file_path = 'shuffled_drone_reddit.jsonl'

# Read lines from the JSONL file
with open(input_file_path, 'r') as file:
    lines = file.readlines()

# Shuffle the lines
random.Random(14).shuffle(lines)

# Write the shuffled lines to a new file
with open(output_file_path, 'w') as file:
    file.writelines(lines)

print(f'Shuffled lines written to {output_file_path}')


Shuffled lines written to shuffled_drone_reddit.jsonl


In [27]:
import pandas as pd

# reverse engineer to get back the non annotated csv from the annotated csv of qc_split_by_paragraph
# this is due to minor discrepencies between the annotated dialg and the jsonl file

df = pd.read_csv('qc_with_token_limit_paragraph_annotated.csv')
df.drop(columns=['remarks','emotion-ly','sentiment-ly','remarks-ly'], inplace=True)
columns_to_clear = ['emotion', 'sentiment']

df[columns_to_clear] = ""

#remove the mistakes of empty rows in previous csv file
# Define the filter condition
condition = ~((df['speaker'].str.contains('speaker', case=False, na=False)) & (df['text'].isna() | df['text'].eq('')))

# Apply the filter to the DataFrame
cleaned_df = df[condition]

cleaned_df.to_csv('qc_with_token_limit_paragraph.csv', index=False)

In [34]:
import pandas as pd

# Read the CSV files into DataFrames
df_E = pd.read_csv('qc_top_22_convo.csv')
df_D = pd.read_csv('new_CandD_energy.csv')

def split_dialogues(df, n_chunks):
    dialogues = df[df['speaker'].str.contains('Dialogue', na=False)]
    dialogue_indices = dialogues.index.tolist()
    chunk_size = len(dialogue_indices) // n_chunks

    chunks = []
    for i in range(n_chunks):
        start_idx = dialogue_indices[i * chunk_size]
        if i == n_chunks - 1:
            end_idx = df.index[-1] + 1
        else:
            end_idx = dialogue_indices[(i + 1) * chunk_size]
        chunks.append(df[start_idx:end_idx])
    
    return chunks

dialogue_counter = 0

def relabel_dialogues(df):
    global dialogue_counter
    for idx, row in df.iterrows():
        if 'Dialogue' in str(row['speaker']):
            df.at[idx, 'speaker'] = f'Dialogue{dialogue_counter}'
            dialogue_counter += 1
    return df

# Split the dialogues in both DataFrames
chunks_E = split_dialogues(df_E, 5)
chunks_D = split_dialogues(df_D, 5)

# Combine the chunks from both DataFrames
combined_chunks = [pd.concat([chunks_E[i], chunks_D[i]], ignore_index=True) for i in range(5)]

# Relabel the dialogues in each chunk
combined_chunks = [relabel_dialogues(chunk) for chunk in combined_chunks]

# Save the datasets to new CSV files
for i, dataset in enumerate(combined_chunks):
    dataset.to_csv(f'energy{i}.csv', index=False)
    print(f'Dataset {i} saved to energy{i}.csv')


Dataset 0 saved to energy0.csv
Dataset 1 saved to energy1.csv
Dataset 2 saved to energy2.csv
Dataset 3 saved to energy3.csv
Dataset 4 saved to energy4.csv


In [23]:
jsonl_to_csv_with_token_limit2('shuffled_drone_reddit.jsonl', 'shuffled_drone_reddit.csv')

Number of rows: 721


In [7]:
import json
with open("data/drone/shuffled_drone_reddit.jsonl", "r") as infile:
    counter = 1
    convo_from_A = 0
    convo_from_B = 0
    utterance_from_A = 0
    utterance_from_B = 0
    for line in infile:
        if counter > 81:
            break
        counter += 1
        dialogue = json.loads(line)
        if "singapore" in dialogue["filename"]:
            convo_from_A += 1
        else:
            convo_from_B += 1

        for utterance in dialogue['conversation']:
            text = utterance['utterance']
            segments = split_into_paragraphs(text)
            if "singapore" in dialogue["filename"]:
                utterance_from_A += len(segments)
            else:
                utterance_from_B += len(segments)

    print("Number of conv from A:", convo_from_A)
    print("Number of convo from B:", convo_from_B)
    print("Number of utterances from A:", utterance_from_A)
    print("Number of utterances from B:", utterance_from_B)

Number of conv from A: 14
Number of convo from B: 64
Number of utterances from A: 166
Number of utterances from B: 336


In [3]:
from datasets import load_dataset

ds = load_dataset("cardiffnlp/tweet_eval", "emotion")
ds['train'].to_csv("data/tweet_eval/emotion/train.csv")
ds['validation'].to_csv("data/tweet_eval/emotion/validation.csv")
ds['test'].to_csv("data/tweet_eval/emotion/test.csv")
ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
ds['train'].to_csv("data/tweet_eval/sentiment/train.csv")
ds['validation'].to_csv("data/tweet_eval/sentiment/validation.csv")
ds['test'].to_csv("data/tweet_eval/sentiment/test.csv")

Creating CSV from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 206.20ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 351.28ba/s]
Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 241.13ba/s]
Downloading data: 100%|██████████| 3.78M/3.78M [00:08<00:00, 431kB/s]
Downloading data: 100%|██████████| 901k/901k [00:01<00:00, 790kB/s]
Downloading data: 100%|██████████| 167k/167k [00:01<00:00, 148kB/s]
Generating train split: 100%|██████████| 45615/45615 [00:00<00:00, 822022.10 examples/s]
Generating test split: 100%|██████████| 12284/12284 [00:00<00:00, 881817.46 examples/s]
Generating validation split: 100%|██████████| 2000/2000 [00:00<00:00, 508338.87 examples/s]
Creating CSV from Arrow format: 100%|██████████| 46/46 [00:00<00:00, 324.08ba/s]
Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 394.35ba/s]
Creating CSV from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 416.87ba/s]


1177979

In [16]:
from datasets import load_dataset

int_to_emotion = ["anger", "joy", "optimism", "sadness"]
emotion_to_int = {"anger": 0, "joy": 1, "optimism": 2, "sadness": 3}
int_to_sentiment = ["negative", "neutral", "positive"]
sentiemnt_to_int = {"negative": 0, "neutral": 1, "positive": 2}
ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")

In [17]:

def process_label(example):
    example['str_label'] = int_to_emotion[example['label']]
    return example

ds = ds.map(process_label, batched=False)
ds = ds.remove_columns(["label"])
ds = ds.rename_column("str_label", "label")
ds['train'].to_csv("data/tweet_eval/sentiment/train.csv")
ds['validation'].to_csv("data/tweet_eval/sentiment/validation.csv")
ds['test'].to_csv("data/tweet_eval/sentiment/test.csv")

Map: 100%|██████████| 45615/45615 [00:00<00:00, 48488.98 examples/s]
Map: 100%|██████████| 12284/12284 [00:00<00:00, 38582.96 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 48011.45 examples/s]
Creating CSV from Arrow format: 100%|██████████| 46/46 [00:00<00:00, 420.31ba/s]
Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 359.01ba/s]
Creating CSV from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 410.74ba/s]


1222366

In [14]:
from datasets import load_dataset

load_dataset('csv', data_files="data/drone/responses/all_tweets_full_responses.csv", split="train")

Dataset({
    features: ['text', 'language', 'emotion_sw', 'sentiment', 'overall_sentiment_sw', 'emotion_sn', 'sentiment.1', 'overall_sentiment_sn', 'emotion_do', 'sentiment.2', 'overall_sentiment_do', 'Inter-annotator Emotion Agreement', 'Inter-annotator Sentiment Agreement', 'Fully Agreed Emotion', 'Fully Agreed Sentiment', 'Unnamed: 15', 'voted_emotion', 'voted_sentiment', 'Manually Labelled emotion:', 'Manually labelled sentiment'],
    num_rows: 2501
})

In [3]:
import csv
data_file = "data/drone/responses/all_tweets_full_responses.csv"
with open(data_file, 'r', newline='') as infile:
    csv_reader = csv.DictReader(infile)
    rows = list(csv_reader)
    print(rows[0])

{'text': 'CAAM will continue to provide its full support to the rapidly growing drone technology in the country. We are happy to join the group and witnessed the first drone delivery demonstration held at Cyberjaya today.  #CAAM  #TeamCAAM  #CAAMalaysia  #UrbanDroneDeliverySanboxProject  [URL]', 'language': 'en', 'emotion_sw': 'happiness', 'sentiment': '', 'overall_sentiment_sw': 'positive', 'emotion_sn': 'other', 'overall_sentiment_sn': 'neutral', 'emotion_do': 'happiness', 'overall_sentiment_do': 'positive', 'Inter-annotator Emotion Agreement': 'TRUE', 'Inter-annotator Sentiment Agreement': 'TRUE', 'Fully Agreed Emotion': 'FALSE', 'Fully Agreed Sentiment': 'FALSE', '': '', 'voted_emotion': 'happiness', 'voted_sentiment': 'positive', 'Manually Labelled emotion:': '', 'Manually labelled sentiment': ''}
