NLP PROJECT

In [1]:
# Colab-only step: `google.colab.files` is imported here and `files.upload()` is called
# to bring the dataset into the runtime (the recorded output shows proverbs.csv being saved).
# `uploaded` keeps whatever files.upload() returns; no later visible cell reads it.
from google.colab import files
uploaded = files.upload()


Saving proverbs.csv to proverbs.csv


In [2]:
import pandas as pd
import nltk
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [10]:
# Read the proverbs dataset into `df` and show the first rows as a sanity check.
proverbs_csv = 'proverbs.csv'
df = pd.read_csv(proverbs_csv)
print(df.head())


                               Proverb Language  \
0                  Δώσε τόπο στην οργή    Greek   
1  Αγάλι-αγάλι γίνεται η αγουρίδα μέλι    Greek   
2   Είπε ο γάιδαρος τον πετεινό κεφάλα    Greek   
3     Η καλύτερη άμυνα είναι η επίθεση    Greek   
4              Η φτήνια τρώει τον παρά    Greek   

                        Literal English Translation  \
0                               Give place to anger   
1  Slowly, slowly the unripe fruit turns into honey   
2           The donkey called the rooster bigheaded   
3                The best defence is a good offense   
4                          Cheapness eats the money   

                                             Meaning Irony (Yes/No)  
0  Control or suppress your anger and avoid actin...            Yes  
1  Good things take time, and with patience, some...             No  
2  Someone is criticizing another for a flaw that...            Yes  
3  Taking proactive, aggressive action is often t...            Yes  
4  Cheap opt

In [11]:
from nltk.corpus import stopwords
import string
# Download the NLTK data packages this notebook relies on, in a fixed order.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# English stop words as a set, used for membership tests inside preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lowercase ``text``, tokenize it, and drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw English text. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV cell, or None) is treated as
        missing text instead of raising on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when the input
        is missing or nothing survives filtering.
    """
    if not isinstance(text, str):
        return ''

    tokens = nltk.word_tokenize(text.lower())
    # NOTE(review): if the stop-word list contains negations such as "not",
    # removing them may change the sentiment scored later -- confirm intended.
    kept = [
        token
        for token in tokens
        if token not in stop_words
        # `token in string.punctuation` is a substring test, so multi-character
        # punctuation tokens ("...", "``", "''") slipped through; test every
        # character against the punctuation set instead.
        and not all(char in string.punctuation for char in token)
    ]
    return ' '.join(kept)

# Normalize the two English text columns in place with preprocess().
for text_column in ('Literal English Translation', 'Meaning'):
    df[text_column] = df[text_column].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Build the sentiment-analysis pipeline from a pre-trained transformer checkpoint.
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name)

def get_sentiment(text):
    """Return a signed sentiment score for ``text``.

    The first result of ``sentiment_pipeline`` is used: its ``score`` is
    returned as-is when the label is ``'POSITIVE'`` and negated for any
    other label.
    """
    top = sentiment_pipeline(text)[0]
    sign = 1 if top['label'] == 'POSITIVE' else -1
    return sign * top['score']

# Load the pre-trained FLAN-T5 checkpoint ("google/flan-t5-large") for text simplification.
# `tokenizer` and `model` are module-level globals that simplify_text() reads directly,
# so both must be created before that function is called.
simplification_model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(simplification_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(simplification_model_name)

def simplify_text(text, instruction=''):
    """Generate a rewritten version of ``text`` with the seq2seq ``model``.

    Parameters
    ----------
    text : str
        Sentence to rewrite. Missing (non-string) or blank input returns
        ``''`` without calling the model, instead of crashing in the
        tokenizer or generating text from an empty prompt.
    instruction : str, optional
        Prefix prepended to ``text`` before tokenization. Defaults to ``''``,
        which sends the bare text exactly as before.
        NOTE(review): the recorded outputs ("anger wikipedia",
        "defenders defenders defenders") suggest the bare text is not being
        simplified; consider passing an explicit instruction -- confirm.

    Returns
    -------
    str
        Decoded first beam with special tokens removed.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    inputs = tokenizer(instruction + text, return_tensors="pt", max_length=512, truncation=True)
    # Forward the whole encoding so the attention mask travels with the ids.
    outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build the simplified literal translation: run each literal translation through
# simplify_text(), then normalize the generated text with preprocess().
df['Simplified Literal Translation'] = (
    df['Literal English Translation']
    .apply(simplify_text)
    .apply(preprocess)
)


In [13]:


# Score every text column with get_sentiment(); keys are the new score columns,
# values are the text columns they are computed from (literal, meaning, simplified).
sentiment_sources = {
    'literal_sentiment': 'Literal English Translation',
    'meaning_sentiment': 'Meaning',
    'simplified_literal_sentiment': 'Simplified Literal Translation',
}
for score_column, text_column in sentiment_sources.items():
    df[score_column] = df[text_column].apply(get_sentiment)

# Print to check results
preview_columns = [
    'Literal English Translation', 'literal_sentiment',
    'Simplified Literal Translation', 'simplified_literal_sentiment',
    'Meaning', 'meaning_sentiment',
]
print(df[preview_columns].head())

# Threshold for irony classification
threshold = 0.01

# Absolute gap between the simplified-literal sentiment and the meaning sentiment.
# Alternative (disabled): compare magnitudes only, i.e.
# abs(abs(literal_sentiment) - abs(meaning_sentiment)).
sentiment_gap = df['simplified_literal_sentiment'] - df['meaning_sentiment']
df['sentiment_diff'] = sentiment_gap.abs()

# A proverb is labelled 'Ironic' when the gap is strictly above the threshold.
exceeds_threshold = df['sentiment_diff'] > threshold
df['predicted_irony'] = exceeds_threshold.map({True: 'Ironic', False: 'Not Ironic'})

# Display the results
result_columns = [
    'Simplified Literal Translation', 'simplified_literal_sentiment',
    'Meaning', 'meaning_sentiment', 'predicted_irony',
]
print(df[result_columns].head())

# Handling Irony column and calculating accuracy
from sklearn.metrics import accuracy_score

# Encode the gold labels as 1 (yes) / 0 (no). Only text labels are mapped: once the
# column is numeric (e.g. this cell is re-run) mapping again would turn every value
# into NaN and the dropna below would discard all rows.
gold_labels = df['Irony (Yes/No)']
if not pd.api.types.is_numeric_dtype(gold_labels):
    # strip() keeps labels such as "Yes " from being silently dropped as unmapped.
    gold_labels = gold_labels.astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})
df['Irony (Yes/No)'] = gold_labels

# Drop any rows with missing values in the Irony column; .copy() gives an independent
# frame so the column assignments below cannot raise SettingWithCopyWarning.
df = df.dropna(subset=['Irony (Yes/No)']).copy()
# Unmapped labels make the column float; store plain ints so label values stay 0/1.
df['Irony (Yes/No)'] = df['Irony (Yes/No)'].astype(int)

# Convert 'predicted_irony' to numeric values (skipped when it is already numeric)
if not pd.api.types.is_numeric_dtype(df['predicted_irony']):
    df['predicted_irony'] = df['predicted_irony'].map({'Ironic': 1, 'Not Ironic': 0})

accuracy = accuracy_score(df['Irony (Yes/No)'], df['predicted_irony'])

print(f'Accuracy: {accuracy}')


              Literal English Translation  literal_sentiment  \
0                        give place anger          -0.984260   
1  slowly slowly unripe fruit turns honey          -0.592936   
2         donkey called rooster bigheaded          -0.981749   
3               best defence good offense           0.999724   
4                    cheapness eats money          -0.998331   

  Simplified Literal Translation  simplified_literal_sentiment  \
0                anger wikipedia                     -0.998878   
1       unripe fruit turns honey                      0.983396   
2              rooster bigheaded                     -0.937200   
3  defenders defenders defenders                      0.989087   
4           cheapness eats money                     -0.998331   

                                             Meaning  meaning_sentiment  
0    control suppress anger avoid acting impulsively           0.993691  
1  good things take time patience something seems...           0.99257

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Integer views of the gold and predicted labels (0 = not ironic, 1 = ironic).
y_true = df['Irony (Yes/No)'].astype(int)
y_pred = df['predicted_irony'].astype(int)
# Pinning the label set keeps the report keys ('0', '1') and the 2x2 confusion-matrix
# shape stable even when one class is missing from the data or the labels are floats.
class_labels = [0, 1]

# Get the classification report as a dictionary
report = classification_report(y_true, y_pred, labels=class_labels, output_dict=True)

# Calculate confusion matrix (always 2x2 because labels are pinned, so ravel() unpacks safely)
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
tn, fp, fn, tp = conf_matrix.ravel()

# Extract metrics
precision = report['1']['precision']  # Precision for Ironic class
recall = report['1']['recall']  # Recall for Ironic class (Sensitivity)
actual_negatives = tn + fp
# Specificity (True Negative Rate); undefined (NaN) when there are no actual negatives.
specificity = tn / actual_negatives if actual_negatives else float('nan')
accuracy = report['accuracy']  # Overall accuracy

# Print the metrics
# Precision: of all proverbs predicted ironic, the share that is actually ironic.
print(f"Precision: {precision:.4f}")
# Recall/sensitivity: of all actually ironic proverbs, the share predicted ironic.
print(f"Recall (Sensitivity): {recall:.4f}")
# Specificity: of all actually non-ironic proverbs, the share predicted non-ironic.
print(f"Specificity: {specificity:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Print the overall report for reference
print(classification_report(y_true, y_pred, labels=class_labels))

Precision: 0.5844
Recall (Sensitivity): 0.8333
Specificity: 0.2381
Accuracy: 0.5729
              precision    recall  f1-score   support

           0       0.53      0.24      0.33        42
           1       0.58      0.83      0.69        54

    accuracy                           0.57        96
   macro avg       0.56      0.54      0.51        96
weighted avg       0.56      0.57      0.53        96

