## EDA + basic nlp

In [None]:
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from collections import Counter

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')

# Load SpaCy model - medium model with word vectors
nlp = spacy.load("en_core_web_md")

In [None]:
def analyze_sentiment(text):
  '''
    Analyzes overall sentiment, sentiment per line, and changes in sentiment of the poem
  '''
  sia = SentimentIntensityAnalyzer()

  #Overall sentiment
  overall_sentiment = sia.polarity_scores(text)

  #Sentiment by line
  line_sentiment = list()
  lines = text.split('\n')

  for i, line in enumerate(lines):
    sentiment = sia.polarity_scores(line)
    line_sentiment.append({'linenum': i+1,
                           'text': line,
                           'sentiment': sentiment})

  #Changes in sentiment
  sentiment_shift = list()
  prev = None
  for i,data in enumerate(line_sentiment):
    current = data['sentiment']['compound']
    if prev is not None:
      shift = current - prev
      if abs(shift) > 0.3:
        sentiment_shift.append({'from line': i,
                                'to line': i+1,
                                'shift': shift})
    prev = current


  return {'overall': overall_sentiment,
          'by line': line_sentiment,
          'sentiment shift': sentiment_shift,
          'primary sentiment': "positive" if overall_sentiment["compound"] > 0.05
                          else "negative" if overall_sentiment["compound"] < -0.05
                          else "neutral"}

In [None]:
def get_entities(text):
  '''
    Gets entities from text via spacy
  '''

  poem = nlp(text)
  entities = [(ent.text, ent.label_) for ent in poem.ents]
  return entities

In [None]:
def analyze_df(df, poem='strip'):
  '''
    Analyzes a df of poems

    inputs:
      df: dataframe
      poem: column name of the poem

    returns:
      df: dataframe with additional columns for analysis
  '''

  results = list() #store results

  for ndx, row in df.iterrows():
    pass

  return print()

## Transformer model test

In [None]:
#Import packages for implementing the Hugging Face Transformer Model
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
class PoemDataset(Dataset):
    def __init__(self, poems, labels, tokenizer, max_length=512):
        """
        Custom dataset for poem meaning extraction

        Args:
            poems (list): List of poems (each poem is a list of lines)
            labels (list): Corresponding labels for poems
            tokenizer: Hugging Face tokenizer
            max_length (int): Maximum sequence length for tokenization
        """
        self.poems = poems
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.poems)

    def __getitem__(self, idx):
        # Convert poem lines to a single string
        poem_text = ' '.join(self.poems[idx])

        # Tokenize the poem
        encoding = self.tokenizer(
            poem_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
def extract_poem_meanings(df):
    """
    Main function to fine-tune a transformer model for poem meaning extraction

    Args:
        df (pd.DataFrame): DataFrame containing poems as lists of lines

    Returns:
        Trained model and tokenizer for inference
    """
    # Unsupervised approach for creating pseudo-labels
    def generate_pseudo_labels(poems):
        """
        Generate pseudo-labels using unsupervised techniques

        Strategies:
        1. Sentiment analysis
        2. Thematic clustering
        3. Topic modeling
        """
        # Example: Simple sentiment-based pseudo-labeling
        from textblob import TextBlob

        def get_sentiment_label(poem):
            # Convert poem lines to a single text
            poem_text = ' '.join(poem)

            # Use TextBlob for sentiment analysis
            sentiment = TextBlob(poem_text).sentiment.polarity

            # Categorize sentiment into discrete labels
            if sentiment > 0.5:
                return 2  # Very Positive
            elif sentiment > 0:
                return 1  # Positive
            elif sentiment < -0.5:
                return 4  # Very Negative
            elif sentiment < 0:
                return 3  # Negative
            else:
                return 0  # Neutral

        return [get_sentiment_label(poem) for poem in poems]

    # Prepare data
    poems = df['poems'].tolist()

    # Generate pseudo-labels
    labels = generate_pseudo_labels(poems)

    # Split the data
    train_poems, val_poems, train_labels, val_labels = train_test_split(
        poems, labels, test_size=0.2, random_state=42
    )

    # Load pre-trained model and tokenizer
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(set(labels))
    )

    # Create datasets
    train_dataset = PoemDataset(train_poems, train_labels, tokenizer)
    val_dataset = PoemDataset(val_poems, val_labels, tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./poem_meaning_model',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch"
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save the model
    model.save_pretrained('./poem_meaning_model')
    tokenizer.save_pretrained('./poem_meaning_model')

    return model, tokenizer

In [None]:
def inference_poem_meanings(model, tokenizer, poems):
    """
    Perform inference on new poems

    Args:
        model: Fine-tuned transformer model
        tokenizer: Corresponding tokenizer
        poems (list): List of poems to extract meanings from

    Returns:
        List of predicted meaning labels
    """
    model.eval()

    # Mapping of labels (customize based on your pseudo-labeling)
    label_mapping = {
        0: 'Neutral',
        1: 'Positive',
        2: 'Very Positive',
        3: 'Negative',
        4: 'Very Negative'
    }

    predictions = []

    for poem in poems:
        poem_text = ' '.join(poem)
        inputs = tokenizer(
            poem_text,
            truncation=True,
            padding=True,
            return_tensors='pt'
        )

        #Check if CUDA Acceleration is available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #Select available device for model to run on
        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            predicted_label = torch.argmax(outputs.logits, dim=1).item()
            predictions.append(label_mapping[predicted_label])

    return predictions

In [None]:
from sklearn.model_selection import train_test_split

def main():
    #load in data frame and split data
    train_df, test_df = train_test_split(ECPA_df, test_size=2440, random_state=42, stratify=None)

    #rename lines to poems
    train_df = train_df.rename(columns={'lines': 'poems'})
    test_df = test_df.rename(columns={'lines': 'poems'})

    model, tokenizer = extract_poem_meanings(train_df)

    #perform inference
    meanings = inference_poem_meanings(model, tokenizer, test_df)
    print(meanings)

if __name__ == "__main__":
    main()

#API Key: 8f7e9305d2ef84b9c17e30cd3a661dec0d39e87a

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5637,0.451061
2,0.5364,0.423291
3,0.4404,0.43425


['Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive']


## Topic Modeling

In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample poem data (replace with your actual data)
poems = [
    "The sun is setting, casting shadows long,\nA lonely bird sings a melancholic song.",
    "A gentle breeze whispers through the trees,\nThe river flows, carrying memories.",
    "In fields of gold, where poppies bloom,\nLove's sweet scent fills every room.",
    "The storm rages, thunder roars,\nFear grips the heart, as darkness pours."
]

# Candidate themes (replace with your list)
candidate_themes = [
    "love", "death", "nature", "time", "religion", "war", "identity",
    "isolation", "hope", "loss", "memory", "freedom", "politics",
    "beauty", "struggle", "mortality", "spirituality", "childhood",
    "resilience", "transformation", "nostalgia", "passion", "sorrow"
]

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens)

processed_poems = [preprocess_text(poem) for poem in poems]

NameError: name 'nltk' is not defined

In [3]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
dtm = vectorizer.fit_transform(processed_poems)

NameError: name 'processed_poems' is not defined

In [5]:
num_topics = len(candidate_themes) #set number of topics to the number of candidate themes.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(dtm)

NameError: name 'candidate_themes' is not defined

In [None]:
feature_names = vectorizer.get_feature_names_out()

def get_top_words(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(top_words)
    return topics

top_words_per_topic = get_top_words(lda, feature_names, 5) #get 5 top words per topic.

# Match topics to candidate themes (simplest approach: keyword overlap)
theme_matches = {}

for i, topic_words in enumerate(top_words_per_topic):
    best_match = None
    best_overlap = 0
    for theme in candidate_themes:
        overlap = len(set(topic_words) & {theme}) #check if the candidate theme word itself is in the top words.
        if overlap > best_overlap:
            best_overlap = overlap
            best_match = theme
        else:
            for word in topic_words: #check if any of the top words are semantically related to the candidate theme.
                if theme in word: #crude way to check similarity, can be improved with word embeddings.
                    best_overlap = 1
                    best_match = theme
    theme_matches[f"Topic {i+1}"] = best_match

print("Topic Words:")
for i, words in enumerate(top_words_per_topic):
    print(f"Topic {i+1}: {', '.join(words)}")

print("\nTheme Matches:")
print(theme_matches)

#Assign the themes to each poem.
poem_themes = []
doc_topic_dist = lda.transform(dtm)

for doc_index, dist in enumerate(doc_topic_dist):
    topic_index = dist.argmax()
    poem_themes.append(theme_matches[f"Topic {topic_index + 1}"])

poem_theme_df = pd.DataFrame({'poem': poems, 'theme': poem_themes})
print('\nPoem Themes:')
print(poem_theme_df)