NLP PROJECT

In [None]:
# Upload the dataset from the local machine through the Colab file-upload helper.
# A later cell reads 'proverbs.csv' from the working directory, so that is the
# file to select here.
from google.colab import files
uploaded = files.upload()


Saving proverbs.csv to proverbs.csv


In [None]:
import pandas as pd
import nltk
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Read the proverb dataset into a DataFrame and preview its first five rows.
df = pd.read_csv('proverbs.csv')
print(df.head(n=5))


                               Proverb Language  \
0                  Δώσε τόπο στην οργή    Greek   
1  Αγάλι-αγάλι γίνεται η αγουρίδα μέλι    Greek   
2   Είπε ο γάιδαρος τον πετεινό κεφάλα    Greek   
3     Η καλύτερη άμυνα είναι η επίθεση    Greek   
4              Η φτήνια τρώει τον παρά    Greek   

                        Literal English Translation  \
0                               Give place to anger   
1  Slowly, slowly the unripe fruit turns into honey   
2           The donkey called the rooster bigheaded   
3                The best defence is a good offense   
4                          Cheapness eats the money   

                                             Meaning Irony (Yes/No)  
0  Control or suppress your anger and avoid actin...            Yes  
1  Good things take time, and with patience, some...             No  
2  Someone is criticizing another for a flaw that...            Yes  
3  Taking proactive, aggressive action is often t...            Yes  
4  Cheap opt

In [None]:
from nltk.corpus import stopwords
import string
# Fetch the NLTK resources this notebook relies on (tokenizer data and stopword lists).
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# English stopwords held in a set for fast membership tests inside preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lowercase, tokenize and strip stopwords and punctuation from ``text``.

    Args:
        text: Raw sentence. Non-string values (for example the NaN that pandas
            uses for an empty CSV cell, or ``None``) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the input
        is not a string or nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''

    tokens = nltk.word_tokenize(text.lower())
    punctuation = set(string.punctuation)
    kept = [
        token for token in tokens
        # A token counts as punctuation when *every* character is punctuation.
        # A plain `token in string.punctuation` is a substring test, which lets
        # multi-character tokens such as '...', '``' or '--' slip through.
        if token not in stop_words and not all(ch in punctuation for ch in token)
    ]
    return ' '.join(kept)

# Normalize both English text columns with preprocess(); each column is overwritten in place.
for column in ('Literal English Translation', 'Meaning'):
    df[column] = df[column].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Sentiment classifier backed by the pre-trained
# `distilbert-base-uncased-finetuned-sst-2-english` checkpoint.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def get_sentiment(text):
    """Return a signed sentiment score for ``text``.

    The first result of ``sentiment_pipeline`` is used: its ``score`` is
    returned unchanged when the label is ``'POSITIVE'`` and negated for any
    other label.
    """
    top = sentiment_pipeline(text)[0]
    confidence = top['score']
    if top['label'] == 'POSITIVE':
        return confidence
    return -confidence

# Load the pre-trained FLAN-T5 checkpoint used by simplify_text() below.
# The tokenizer and the seq2seq model are both loaded from the same checkpoint
# name so that they stay matched.
simplification_model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(simplification_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(simplification_model_name)

def simplify_text(text, instruction="Simplify the following sentence: "):
    """Rewrite ``text`` in simpler wording using the loaded FLAN-T5 model.

    Args:
        text: Sentence to simplify. Non-string or blank values return ``''``
            without invoking the model.
        instruction: Prompt prefix placed in front of ``text``. FLAN-T5 is an
            instruction-tuned model, so the task has to be stated in the input;
            pass ``''`` to send the bare text.

    Returns:
        The decoded generation with special tokens removed.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    inputs = tokenizer(instruction + text, return_tensors="pt", max_length=512, truncation=True)
    # Unpack the whole encoding so generate() receives the attention mask
    # together with the input ids, not the ids alone.
    outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build the simplified-translation column: run simplify_text() on each literal
# translation, then normalize the result with preprocess().
# NOTE(review): 'Literal English Translation' was already overwritten with its
# preprocessed form earlier in the notebook, so the model receives lowercased,
# stopword-free text here.
df['Simplified Literal Translation'] = (
    df['Literal English Translation']
    .apply(simplify_text)
    .apply(preprocess)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:


# Score each text column with get_sentiment(); keys are the new score columns,
# values are the text columns they are computed from.
sentiment_columns = {
    'literal_sentiment': 'Literal English Translation',
    'meaning_sentiment': 'Meaning',
    'simplified_literal_sentiment': 'Simplified Literal Translation',
}
for score_column, text_column in sentiment_columns.items():
    df[score_column] = df[text_column].apply(get_sentiment)

# Preview every text column next to its sentiment score.
preview_columns = [
    'Literal English Translation', 'literal_sentiment',
    'Simplified Literal Translation', 'simplified_literal_sentiment',
    'Meaning', 'meaning_sentiment',
]
print(df[preview_columns].head())

# A row is labelled 'Ironic' when its sentiment gap is strictly above this value.
threshold = 0.01

# Gap between the *magnitudes* of the literal-translation score and the meaning
# score. Both operands pass through abs(), so the polarity (sign) of each score
# is ignored by this measure.
# NOTE(review): a signed variant based on the simplified translation is left
# disabled below; confirm which of the two is intended.
#df['sentiment_diff'] = abs(df['simplified_literal_sentiment'] - df['meaning_sentiment'])
literal_magnitude = df['literal_sentiment'].abs()
meaning_magnitude = df['meaning_sentiment'].abs()
df['sentiment_diff'] = (literal_magnitude - meaning_magnitude).abs()

# Turn the gap into a text label.
df['predicted_irony'] = (
    df['sentiment_diff']
    .gt(threshold)
    .map({True: 'Ironic', False: 'Not Ironic'})
)

# Preview the predictions. The sentiment shown here is the simplified
# translation's, whereas the prediction above is derived from 'literal_sentiment'.
result_columns = [
    'Simplified Literal Translation', 'simplified_literal_sentiment',
    'Meaning', 'meaning_sentiment', 'predicted_irony',
]
print(df[result_columns].head())

from sklearn.metrics import accuracy_score

# Encode the gold labels as 1 (ironic) / 0 (not ironic). Surrounding whitespace
# is stripped first so that cells such as 'Yes ' are not mapped to NaN and then
# silently dropped by the dropna() below.
df['Irony (Yes/No)'] = (
    df['Irony (Yes/No)']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'yes': 1, 'no': 0})
)

# Keep only rows with a usable gold label. The explicit .copy() makes the result
# an independent frame, so the column assignment that follows does not raise
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['Irony (Yes/No)']).copy()

# Encode the predictions with the same 1/0 scheme as the gold labels.
df['predicted_irony'] = df['predicted_irony'].map({'Ironic': 1, 'Not Ironic': 0})

# Share of rows where the prediction matches the gold label.
accuracy = accuracy_score(df['Irony (Yes/No)'], df['predicted_irony'])

print(f'Accuracy: {accuracy}')


              Literal English Translation  literal_sentiment  \
0                        give place anger          -0.984260   
1  slowly slowly unripe fruit turns honey          -0.592935   
2         donkey called rooster bigheaded          -0.981749   
3               best defence good offense           0.999724   
4                    cheapness eats money          -0.998331   

  Simplified Literal Translation  simplified_literal_sentiment  \
0                anger wikipedia                     -0.998878   
1       unripe fruit turns honey                      0.983396   
2              rooster bigheaded                     -0.937200   
3  defenders defenders defenders                      0.989088   
4           cheapness eats money                     -0.998331   

                                             Meaning  meaning_sentiment  
0    control suppress anger avoid acting impulsively           0.993691  
1  good things take time patience something seems...           0.99257

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_irony'] = df['predicted_irony'].map({'Ironic': 1, 'Not Ironic': 0})
