Selecting random texts

In [None]:
import pandas as pd

# Load the source CSV (this cell reads a single input file)
file_path_2021 = '/content/hatetoxic_2021.csv'
df_2021 = pd.read_csv(file_path_2021)

# Keep only the cleaned-text column for rows with year == 2020 and month == 1.
# NOTE(review): the file name says "2021" but the filter selects 2020 — confirm this is intended.
january_2020_texts = df_2021.loc[(df_2021['year'] == 2020) & (df_2021['month'] == 1), 'text without punctuation and stopword']

# Convert the filtered texts to a DataFrame
january_2020_df = january_2020_texts.to_frame()

# Randomly select up to 30 texts. `DataFrame.sample` raises ValueError when `n`
# exceeds the number of rows (sampling without replacement), so cap the sample
# size at the number of rows actually available.
sample_size = min(30, len(january_2020_df))
sampled_texts = january_2020_df.sample(n=sample_size, random_state=42)  # `random_state` ensures reproducibility

# Save the randomly selected texts to a new Excel sheet
output_file_path = '/content/january_2020_sample.xlsx'
sampled_texts.to_excel(output_file_path, index=False)

# Report the number of rows that were really written, not the requested 30
print(f"{len(sampled_texts)} random texts from January 2020 have been saved to {output_file_path}")


30 random texts from January 2020 have been saved to /content/january_2020_sample.xlsx


Threshold value optimisation

In [None]:
# Labels handed to `classifier` for every text. The topical labels come first;
# the catch-all "others" is appended last so the overall order is unchanged.
candidate_labels = [
    "inappropriate behaviour",
    "covid-19 outbreak in singapore",
    "singlish phrases",
    "chinese-speaking foreigners in singapore",
    "political polarization in singapore",
    "online harassment",
    "poor hygiene in public toilets",
    "salary dissatisfaction",
    "national service",
    "mask enforcement",
] + ["others"]

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize embedding model for similarity comparison
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Embeddings of the candidate labels (not used further in this cell; kept in
# case later cells rely on the variable)
candidate_label_embeddings = embedding_model.encode(candidate_labels)

# Minimum cosine similarity between predicted-label and true-label embeddings
# for a prediction to count as a semantic match
similarity_threshold = 0.3

# NOTE(review): `df`, `classifier` and `thresholds` are not defined in this
# cell and must already exist in the kernel. `df` needs the columns
# "True Label" and "text without punctuation and stopword".

# The true labels and their embeddings do not depend on the threshold, so they
# are computed once instead of once per threshold.
true_labels = df["True Label"].tolist()
true_embeddings = embedding_model.encode(true_labels)

# The classifier output does not depend on the threshold either: classify every
# text once and reuse the results for each candidate threshold.
classification_results = [
    classifier(text, candidate_labels)
    for text in df["text without punctuation and stopword"]
]

# Start the search from a clean state so re-running the cell never compares
# against a best score left over from a previous run.
best_f1 = 0.0
best_threshold = None

# Loop through each threshold and calculate F1-score based on semantic similarity
for threshold in thresholds:
    # For each text take the first label whose score reaches the threshold,
    # falling back to "others" when none does. Scores are paired with
    # `result["labels"]` rather than indexed into `candidate_labels`.
    # NOTE(review): assumes `classifier` is a Hugging Face zero-shot
    # classification pipeline, which returns `labels` sorted by descending
    # score with `scores` aligned to them — TODO confirm.
    pred_labels = [
        next(
            (
                label
                for label, score in zip(result["labels"], result["scores"])
                if score >= threshold
            ),
            "others",
        )
        for result in classification_results
    ]

    # Convert predicted labels to embeddings for semantic similarity
    pred_embeddings = embedding_model.encode(pred_labels)

    # 1 when the predicted label is semantically close enough to the true label, else 0
    semantic_matches = [
        1 if cosine_similarity([pred_embeddings[i]], [true_embeddings[i]])[0][0] >= similarity_threshold else 0
        for i in range(len(pred_labels))
    ]

    # F1 against an all-positive reference: every row is expected to match, so
    # there are no false positives and the score is driven by the match rate.
    f1 = f1_score([1]*len(true_labels), semantic_matches)

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Best threshold: {best_threshold}, Best F1-score: {best_f1:.4f}")




Best threshold: 0.1, Best F1-score: 0.2857
