NLP PROJECT

In [None]:
# Colab-only step: ask the user to upload the dataset file from their machine.
# The recorded run of this cell saved proverbs_2.csv, which is the file name the
# loading code further down reads.
from google.colab import files
uploaded = files.upload()


Saving proverbs_2.csv to proverbs_2.csv


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Run inference on CUDA when torch reports a usable GPU, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Read the proverb dataset; headers are stripped of surrounding whitespace so
# that later lookups by exact column name succeed
df = pd.read_csv('proverbs_2.csv').rename(columns=str.strip)

# Preview the first rows as a sanity check
print(df.head())

# Define a simple text preprocessing function
def preprocess_text(text):
    """Normalise a raw cell value into lowercase, punctuation-free text.

    Parameters
    ----------
    text : object
        Raw value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN (an empty cell) are treated as missing.

    Returns
    -------
    str
        The lower-cased text with every character other than ``a-z``,
        ``0-9`` and whitespace removed, runs of whitespace collapsed to a
        single space, and no leading/trailing whitespace. Missing input
        yields the empty string.
    """
    # Without this guard a missing cell is stringified into the literal word
    # "nan"/"none" and would later be scored as if it were real proverb text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    # After lower-casing, only a-z, digits and whitespace need to be kept
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowered)
    # split()/join collapses whitespace runs and trims both ends in one step
    return ' '.join(cleaned.split())

# Clean both text columns with preprocess_text, overwriting the raw values
for _text_column in ('Literal English Translation', 'Meaning'):
    df[_text_column] = df[_text_column].apply(preprocess_text)

# No simplification step is applied, so the "simplified" column is just a copy
# of the cleaned literal translation
df['Simplified Literal Translation'] = df['Literal English Translation']

# Name of the pretrained sentiment checkpoint to load
sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the matching tokenizer and classification model, then move the model
# onto the selected device
tokenizer_sentiment = AutoTokenizer.from_pretrained(sentiment_model_name)
model_sentiment = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
model_sentiment = model_sentiment.to(device)

# Function to get sentiment score
def get_sentiment(text):
    """Return a signed sentiment score for ``text``.

    The text is tokenized with ``tokenizer_sentiment`` (truncated to at most
    512 tokens), passed through ``model_sentiment`` with gradient tracking
    disabled, and the logits of the single input are converted to
    probabilities with softmax.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        Probability at index 2 minus probability at index 0, so the value
        lies in [-1, 1]; larger means more positive.
    """
    # Encode the text and place the tensors on the same device as the model
    encoded_input = tokenizer_sentiment(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model_sentiment(**encoded_input).logits[0]
    # Softmax is applied to the logits tensor directly; there is no need to
    # round-trip through numpy and back into a tensor first.
    probabilities = torch.softmax(logits, dim=-1).cpu().tolist()
    # The model outputs scores for [Negative, Neutral, Positive]
    # Compound score: Positive probability minus Negative probability
    return probabilities[2] - probabilities[0]

# Score the literal translation and the meaning with the sentiment model
df['literal_sentiment'] = df['Literal English Translation'].apply(get_sentiment)
df['meaning_sentiment'] = df['Meaning'].apply(get_sentiment)

# When the simplified column holds exactly the same text as the literal
# translation, reuse those scores instead of running the model over identical
# input a second time; otherwise score the simplified text on its own.
if df['Simplified Literal Translation'].equals(df['Literal English Translation']):
    df['simplified_literal_sentiment'] = df['literal_sentiment']
else:
    df['simplified_literal_sentiment'] = df['Simplified Literal Translation'].apply(get_sentiment)

# Print to check results
print(df[['Literal English Translation', 'literal_sentiment', 'Simplified Literal Translation', 'simplified_literal_sentiment', 'Meaning', 'meaning_sentiment']].head())

# Threshold for irony classification
threshold = 0.05

# Gap between the *magnitudes* of the literal and meaning sentiment scores.
# NOTE(review): because both scores are taken in absolute value first, a pair
# with opposite polarity but equal strength (e.g. +0.6 vs -0.6) yields 0 and is
# labelled 'Not Ironic'. The signed alternative would be
# (simplified_literal_sentiment - meaning_sentiment).abs() -- confirm which
# definition is intended.
df['sentiment_diff'] = (df['literal_sentiment'].abs() - df['meaning_sentiment'].abs()).abs()

# Classify irony based on sentiment difference
df['predicted_irony'] = df['sentiment_diff'].apply(lambda x: 'Ironic' if x > threshold else 'Not Ironic')

# Display the results
print(df[['Simplified Literal Translation', 'simplified_literal_sentiment', 'Meaning', 'meaning_sentiment', 'predicted_irony']].head())

# Encode the ground-truth column as 1 (yes) / 0 (no). Values are stripped as
# well as lower-cased so that entries such as "Yes " are not silently lost;
# anything that is still not yes/no becomes NaN.
df['Irony (Yes/No)'] = (
    df['Irony (Yes/No)'].astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})
)

# Drop rows without a usable label. The explicit copy makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df.dropna(subset=['Irony (Yes/No)']).copy()

# If any row was unmapped the column is float (1.0/0.0); cast back to int so
# the labels are always the integers 0 and 1 for the metric code that follows.
df['Irony (Yes/No)'] = df['Irony (Yes/No)'].astype(int)

# Convert 'predicted_irony' to numeric values
df['predicted_irony'] = df['predicted_irony'].map({'Ironic': 1, 'Not Ironic': 0})

# accuracy_score is already imported with the other sklearn metrics at the top
# of this cell
accuracy = accuracy_score(df['Irony (Yes/No)'], df['predicted_irony'])

print(f'Accuracy: {accuracy}')

Using device: cpu
                               Proverb Language  \
0                  Δώσε τόπο στην οργή    Greek   
1  Αγάλι-αγάλι γίνεται η αγουρίδα μέλι    Greek   
2   Είπε ο γάιδαρος τον πετεινό κεφάλα    Greek   
3     Η καλύτερη άμυνα είναι η επίθεση    Greek   
4              Η φτήνια τρώει τον παρά    Greek   

                        Literal English Translation  \
0                               Give place to anger   
1  Slowly, slowly the unripe fruit turns into honey   
2           The donkey called the rooster bigheaded   
3                The best defence is a good offense   
4                          Cheapness eats the money   

                                             Meaning Irony (Yes/No)  
0  Control or suppress your anger and avoid actin...            Yes  
1  Good things take time, and with patience, some...             No  
2  Someone is criticizing another for a flaw that...            Yes  
3  Taking proactive, aggressive action is often t...            

In [None]:
from sklearn.metrics import confusion_matrix

# 'Irony (Yes/No)' is the true label and 'predicted_irony' is the prediction.
# labels=[0, 1] forces a 2x2 matrix even when one class never occurs, so the
# four-way unpacking below cannot fail on a 1x1 result.
conf_matrix = confusion_matrix(df['Irony (Yes/No)'], df['predicted_irony'], labels=[0, 1])
print(conf_matrix)

# Row-major order of the 2x2 matrix: true negatives, false positives,
# false negatives, true positives
tn, fp, fn, tp = conf_matrix.ravel()

# Specificity (true negative rate); undefined (NaN) when there are no actual
# negatives, instead of dividing by zero
specificity = tn / (tn + fp) if (tn + fp) else float('nan')
print(f"Specificity: {specificity}")

[[ 6 36]
 [ 5 49]]
Specificity: 0.14285714285714285


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Use integer views of the labels: if the label column is float (1.0/0.0) the
# report dictionary would be keyed by '1.0' and the report['1'] lookups below
# would raise KeyError.
y_true = df['Irony (Yes/No)'].astype(int)
y_pred = df['predicted_irony'].astype(int)

# Get the classification report as a dictionary; labels=[0, 1] guarantees an
# entry for both classes even if one of them never occurs
report = classification_report(y_true, y_pred, labels=[0, 1], output_dict=True)

# Calculate confusion matrix (forced to 2x2 so the unpacking always works)
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()

# Extract metrics
precision = report['1']['precision']  # Precision for Ironic class
recall = report['1']['recall']  # Recall for Ironic class (Sensitivity)
# Specificity (True Negative Rate); NaN when there are no actual negatives
specificity = tn / (tn + fp) if (tn + fp) else float('nan')
# Overall accuracy, taken from the matrix so it does not depend on which
# summary keys the report dictionary happens to contain
accuracy = (tn + tp) / conf_matrix.sum()

# Print the metrics
# Precision: out of all the proverbs the model predicted as ironic, what proportion was actually ironic?
print(f"Precision: {precision:.4f}")
# Recall/sensitivity: out of all the proverbs that were actually ironic, what proportion did the model correctly identify as ironic?
print(f"Recall (Sensitivity): {recall:.4f}")
# Specificity: out of all the proverbs that were actually not ironic, what proportion did the model correctly identify as not ironic?
print(f"Specificity: {specificity:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Print the overall report for reference
print(classification_report(y_true, y_pred))

Precision: 0.5765
Recall (Sensitivity): 0.9074
Specificity: 0.1429
Accuracy: 0.5729
              precision    recall  f1-score   support

           0       0.55      0.14      0.23        42
           1       0.58      0.91      0.71        54

    accuracy                           0.57        96
   macro avg       0.56      0.53      0.47        96
weighted avg       0.56      0.57      0.50        96

