In [None]:
import pandas as pd
from sentiment_analyzer import analyze_sentiment_textblob, analyze_sentiment_vader,analyze_sentiment_flair

In [2]:
data = pd.read_csv('a3_train_final.tsv', sep='\t', header=None)
data.columns = ['annotations', 'comment']
data.head()


Unnamed: 0,annotations,comment
0,1/1,I'll only consume if I know what's inside it....
1,0/-1,It is easier to fool a million people than it...
2,0/0,NATURAL IMMUNITY protected us since evolutio...
3,0/-1,NATURAL IMMUNITY protected us since evolutio...
4,0/0,"Proud to have resisted. Proud of my husband, ..."


In [3]:
import numpy as np
import krippendorff

# Parse original data into a 2D list and convert to floats
data_list = [[float(n) for n in item.split('/')] for item in data.annotations.values]

# Find the maximum number of annotators
max_len = max(len(item) for item in data_list)

# Fill lists shorter than the maximum length with np.nan to match the maximum length
data_filled = [item + [np.nan]*(max_len-len(item)) for item in data_list]

# Convert the data into a numpy array
data_np = np.array(data_filled, dtype=float)

# Calculate Krippendorff's alpha
alpha = krippendorff.alpha(data_np, level_of_measurement='interval')

print("Krippendorff's alpha:", alpha)


Krippendorff's alpha: 0.015974378717868


**Question:** How much consensus is there between annotators of the dataset? Do you think the data is reliable?

In [4]:
def process_annotations(s):
    '''
    Principles of Preprocessing Annotations:
        1. Single Annotator Reliability: Comments annotated by only a single individual are 
            considered unreliable. In such cases, a value of -1 is returned, signifying unreliability.

        2. Consensus on Annotation: When a comment is annotated by multiple individuals and 
            all annotations agree (i.e., the annotation numbers are identical), the comment is 
            deemed reliable. The consensus number, which may be either 1 or 0, is returned to 
            indicate the comment's true sentiment.

        3. Handling Disagreements:
            3.1 Majority Rule: For comments annotated by multiple individuals where annotations 
                do not all agree, if over 50% of the annotations share the same label, that 
                majority label is considered reliable and returned.
            3.2 Lack of Majority: If no single label accounts for more than 50% of the annotations, 
                the comment is deemed unreliable, and -1 is returned.
        
        4. Default Unreliability: In any situation not covered by the above rules, the comment 
            is considered unreliable and assigned a value of -1.

        5. Management of Unreliable Comments: Comments deemed unreliable should either be 
            re-annotated or discarded, as their ambiguity or lack of consensus undermines 
            the integrity of the dataset.
    '''
    # if the comment is annotated by only one person, it is not reliable
    if len(s) == 1:
        return -1
    
    # Split the string by slashes and convert to integers
    numbers = list(map(int, s.split('/')))
    
    # If all numbers are the same, return that number
    if all(n == numbers[0] for n in numbers):
        return numbers[0]
    
    # Calculate the frequency of each number
    freq = {n: numbers.count(n) for n in set(numbers)}
    
    # If there's a number with frequency more than 50%, return it
    most_common_number = max(freq.items(), key=lambda x: x[1])[0]
    if freq[most_common_number] > len(numbers) / 2:
        return most_common_number
    
    # Otherwise, return -1
    return -1

In [5]:
# Compute the sentiment composition
data["sentiment"] = data.iloc[:,0].apply(lambda s: process_annotations(s))
sentiment_composition = data.sentiment.value_counts().to_dict()
sentiment_composition


{1: 20878, 0: 20038, -1: 9152}

To quantify the level of agreement among annotators, we calculated the 'consensus proportion' within our dataset. This metric reflects the percentage of comments for which annotators reached a unanimous or majority agreement on the sentiment classification. A high 'consensus proportion' indicates a strong agreement among the annotators, suggesting that the dataset is reliable for further analysis. Conversely, a low 'consensus proportion' may indicate discrepancies in annotation, highlighting areas where the data might require reannotation or more careful review to ensure reliability.

In [6]:
consensus_proportion = 1 - sentiment_composition[-1] / sum(sentiment_composition.values())
print(f'The consensus proportion of this annotated data set is {consensus_proportion}.')


The consensus proportion of this annotated data set is 0.8172085963090198.


The consensus proportion of this annotated data set is 81.72%, which indicates that this data is reliable to a certain extent, although this ratio is not dramatically high.


In adherence to the principle of retaining as much data as possible for training rather than simply discarding it, we will employ `TextBlob`, `vaderSentiment` and `Flair` to simulate three anonymous annotators further annotating those data deemed unreliable. Subsequently, we will reapply the Principles of Preprocessing Annotations. Should a portion of the results emerge as reliable, we intend to incorporate this subset into our training dataset. Conversely, if some data still render as unreliable upon reevaluation, we will proceed to discard this subset. This strategy aims to maximize the utility and size of our training dataset, enhancing the robustness and accuracy of our analysis.

In [8]:
# Filter the DataFrame to only include rows where the sentiment is equal to -1 and create a copy to avoid SettingWithCopyWarning
filtered_df = data[data["sentiment"] == -1].copy()

# Apply the sentiment analysis function to the 'comment' column of the filtered DataFrame
filtered_df['sentiment_1'] = filtered_df['comment'].apply(analyze_sentiment_textblob)
filtered_df['sentiment_2'] = filtered_df['comment'].apply(analyze_sentiment_vader)
filtered_df['sentiment_3'] = analyze_sentiment_flair(filtered_df['comment'].tolist())

# Concatenate the sentiment result with the 'annotations' column
filtered_df['annotations'] = filtered_df.apply(lambda row: f"{row['annotations']}/{row['sentiment_1']}/{row['sentiment_2']}/{row['sentiment_3']}", axis=1)

# Compute the final sentiment label
filtered_df["sentiment"] = filtered_df['annotations'].apply(process_annotations)

# Update the data DataFrame, replace the data.sentiment where its value is equal to -1 with a second round annotated label
data.loc[filtered_df.index, "sentiment"] = filtered_df["sentiment"]
data.loc[filtered_df.index, "annotations"] = filtered_df["annotations"]

# Recompute the sentiment composition and consensus proportion
sentiment_composition = data.sentiment.value_counts().to_dict()
print(f'The new sentiment composition is {sentiment_composition}')
consensus_proportion = 1 - sentiment_composition[-1] / sum(sentiment_composition.values())
print(f'The consensus proportion of this annotated data set is {consensus_proportion}.')

# Disgard the unreliable data where the sentiment label is equal to -1
new_data = data[data.sentiment != -1].iloc[:,1:]
new_data = new_data[['sentiment', 'comment']]

# Save the new data to a .csv file
# new_data.to_csv('a3_train.csv', index=False, header=False)


The new sentiment composition is {0: 24744, 1: 23766, -1: 1558}
The consensus proportion of this annotated data set is 0.9688823200447392.


In [9]:
test_data = pd.read_csv('a3_test.tsv', sep='\t', header=None)
test_data.shape


(2039, 2)

In [12]:
import numpy as np
import krippendorff

# Parse original data into a 2D list and convert to floats
data_list = [[float(n) for n in item.split('/')] for item in data.annotations.values]

# Find the maximum number of annotators
max_len = max(len(item) for item in data_list)

# Fill lists shorter than the maximum length with np.nan to match the maximum length
data_filled = [item + [np.nan]*(max_len-len(item)) for item in data_list]

# Convert the data into a numpy array
data_np = np.array(data_filled, dtype=float)

# Calculate Krippendorff's alpha
alpha = krippendorff.alpha(data_np, level_of_measurement='interval')

print("Krippendorff's alpha:", alpha)

Krippendorff's alpha: 0.015065998008823689
