In [6]:
import os
import re
import json
import warnings
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag, ne_chunk, Tree
from nltk.corpus import wordnet as wn, stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import pandas as pd
from datetime import datetime
import random

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NLTK data packages requested at import time, fetched quietly in this order.
for _nltk_resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'wordnet',
    'stopwords',
    'vader_lexicon',
):
    nltk.download(_nltk_resource, quiet=True)
del _nltk_resource  # keep the module namespace free of the loop variable

class LiteraryFrameAnalyzer:
    def __init__(self, corpus_path, gazetteer_path=None):
        """Read the corpus, split it into sentences and set up empty result stores.

        Args:
            corpus_path: Path of the UTF-8 text file to analyse.
            gazetteer_path: Optional gazetteer file. It is loaded only when a
                path is given and that path exists on disk.

        Raises:
            FileNotFoundError: If ``corpus_path`` does not exist.
        """
        self.corpus_path = corpus_path

        # Each frame lists the words that evoke it and, per role, the entity
        # labels (upper-case) or WordNet hypernym names (lower-case) that may
        # fill that role.
        frames = {
            "Revenge": {
                "lexical_units": [
                    "revenge", "avenge", "retaliate", "retribution",
                    "vengeance", "reprisal", "payback", "settle", "punish",
                ],
                "frame_elements": {
                    "Avenger": ["PERSON", "ORGANIZATION"],
                    "Offender": ["PERSON", "ORGANIZATION"],
                    "Injury": ["abstract_entity", "event"],
                    "Punishment": ["action", "event"],
                },
            },
            "Hunting": {
                "lexical_units": [
                    "hunt", "track", "pursue", "trap", "kill", "ambush",
                    "chase", "stalk", "capture", "snare",
                ],
                "frame_elements": {
                    "Hunter": ["PERSON", "ANIMAL"],
                    "Quarry": ["PERSON", "ANIMAL"],
                    "Instrument": ["physical_object", "artifact"],
                    "Location": ["GPE", "LOCATION"],
                },
            },
            "Travel": {
                "lexical_units": [
                    "travel", "journey", "voyage", "sail", "depart", "explore",
                    "roam", "wander", "trek", "embark", "navigate",
                ],
                "frame_elements": {
                    "Traveler": ["PERSON", "ORGANIZATION"],
                    "Source": ["GPE", "LOCATION"],
                    "Goal": ["GPE", "LOCATION"],
                    "Path": ["GPE", "LOCATION"],
                    "Vehicle": ["FACILITY", "vehicle"],
                },
            },
        }
        self.frame_definitions = frames

        # Linguistic helpers.
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))

        # Result stores, filled by process_corpus().
        self.gazetteer = {}
        self.processed_sentences = []
        self.frame_instances = []
        self.word_frequencies = Counter()
        self.bigram_frequencies = Counter()

        # Corpus text and its sentence segmentation.
        self.text = self._load_corpus()
        self.sentences = sent_tokenize(self.text)

        gazetteer_available = bool(gazetteer_path) and os.path.exists(gazetteer_path)
        if gazetteer_available:
            self._load_gazetteer(gazetteer_path)
    
    def _load_corpus(self):
        try:
            with open(self.corpus_path, 'r', encoding='utf-8') as f:
                return f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"Corpus file not found: {self.corpus_path}")
    
    def _load_gazetteer(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    parts = line.strip().rsplit(' ', 1)
                    if len(parts) == 2:
                        name, label = parts
                        self.gazetteer[name.lower()] = label.upper()
    
    def _get_wordnet_pos(self, treebank_tag):
        """Translate a Treebank-style tag into a WordNet part-of-speech constant.

        Tags starting with J, V, N or R map to adjective, verb, noun and
        adverb respectively; any other tag falls back to noun.
        """
        prefix_table = (
            ('J', wn.ADJ),
            ('V', wn.VERB),
            ('N', wn.NOUN),
            ('R', wn.ADV),
        )
        for prefix, wordnet_pos in prefix_table:
            if treebank_tag.startswith(prefix):
                return wordnet_pos
        return wn.NOUN
    
    def _get_hypernym_paths(self, word, pos_tag):
        cache_key = (word.lower(), pos_tag)
        if not hasattr(self, '_hypernym_cache'):
            self._hypernym_cache = {}
        
        if cache_key in self._hypernym_cache:
            return self._hypernym_cache[cache_key]
        
        wn_pos = self._get_wordnet_pos(pos_tag)
        synsets = wn.synsets(word, pos=wn_pos)
        
        all_hypernyms = set()
        for synset in synsets[:1]:
            try:
                hypernyms = list(synset.closure(lambda s: s.hypernyms(), depth=5))
                all_hypernyms.update([h.name().split('.')[0] for h in hypernyms[:10]])
            except:
                pass
        
        self._hypernym_cache[cache_key] = all_hypernyms
        return all_hypernyms
    
    def _apply_ner_with_gazetteer(self, tokens, pos_tags):
        """Collect named entities for one sentence, letting the gazetteer override.

        Every subtree produced by ``ne_chunk`` becomes an entity; its label is
        replaced by the gazetteer label when the joined entity text is a
        gazetteer key. Tokens outside any subtree become single-token
        entities only when the token itself is a gazetteer key.

        Args:
            tokens: Sentence tokens (not used by the lookup itself).
            pos_tags: ``(word, tag)`` pairs handed to ``ne_chunk``.

        Returns:
            A list of dicts with ``text``, ``label`` and ``tokens`` keys.
        """
        gazetteer = self.gazetteer
        found = []

        for node in ne_chunk(pos_tags, binary=False):
            if not isinstance(node, Tree):
                # Plain (word, tag) leaf: only the gazetteer can promote it.
                token, _tag = node
                key = token.lower()
                if key in gazetteer:
                    found.append({
                        'text': token,
                        'label': gazetteer[key],
                        'tokens': [token]
                    })
                continue

            words = [leaf_word for leaf_word, _leaf_tag in node.leaves()]
            text = ' '.join(words)
            chunk_label = node.label()
            found.append({
                'text': text,
                'label': gazetteer.get(text.lower(), chunk_label),
                'tokens': words
            })

        return found
    
    def _match_frame_element(self, word, pos, entities, fe_requirements):
        for entity in entities:
            if word.lower() in entity['text'].lower():
                if entity['label'] in fe_requirements:
                    return entity['text']
        
        hypernyms = self._get_hypernym_paths(word, pos)
        for req in fe_requirements:
            if not req.isupper():
                if req in hypernyms or req.replace('_', ' ') in hypernyms:
                    return word
        
        return None
    
    def _detect_frame_instance(self, sentence_num, tokens, pos_tags, entities):
        if not hasattr(self, '_lu_set'):
            self._lu_set = set()
            for frame_def in self.frame_definitions.values():
                self._lu_set.update(frame_def["lexical_units"])
        
        for frame_name, frame_def in self.frame_definitions.items():
            for i, (word, pos) in enumerate(pos_tags):
                word_lower = word.lower()
                
                if word_lower not in self._lu_set:
                    if pos.startswith('V'):
                        lemma = self.lemmatizer.lemmatize(word_lower, wn.VERB)
                        if lemma not in frame_def["lexical_units"]:
                            continue
                    else:
                        continue
                
                if word_lower in frame_def["lexical_units"] or \
                   self.lemmatizer.lemmatize(word_lower, self._get_wordnet_pos(pos)) in frame_def["lexical_units"]:
                    frame_instance = {
                        'sentence_num': sentence_num,
                        'frame': frame_name,
                        'lexical_unit': word,
                        'lu_position': i,
                        'elements': {}
                    }
                    
                    context_window = 10
                    start = max(0, i - context_window)
                    end = min(len(pos_tags), i + context_window)
                    
                    for fe_name, fe_requirements in frame_def["frame_elements"].items():
                        for j in range(start, end):
                            if j != i:
                                w, p = pos_tags[j]
                                match = self._match_frame_element(w, p, entities, fe_requirements)
                                if match and fe_name not in frame_instance['elements']:
                                    frame_instance['elements'][fe_name] = match
                    
                    return frame_instance
        
        return None
    
    def _build_language_model(self):
        for sent in self.processed_sentences:
            words = [w.lower() for w, p in sent['pos_tags'] if w.isalpha() and w.lower() not in self.stopwords]
            self.word_frequencies.update(words)
            
            for i in range(len(words) - 1):
                self.bigram_frequencies[(words[i], words[i+1])] += 1
    
    def process_corpus(self):
        print(f"Processing {len(self.sentences)} sentences...")
        
        for sent_num, sentence in enumerate(self.sentences):
            if sent_num % 1000 == 0:
                print(f"  Progress: {sent_num}/{len(self.sentences)} sentences")
            
            tokens = word_tokenize(sentence)
            pos_tags = pos_tag(tokens)
            entities = self._apply_ner_with_gazetteer(tokens, pos_tags)
            
            self.processed_sentences.append({
                'sentence_num': sent_num,
                'text': sentence,
                'tokens': tokens,
                'pos_tags': pos_tags,
                'entities': entities
            })
            
            frame_instance = self._detect_frame_instance(sent_num, tokens, pos_tags, entities)
            if frame_instance:
                frame_instance['sentence'] = sentence
                self.frame_instances.append(frame_instance)
        
        self._build_language_model()
        self._compute_sentiments()
        
        print(f"Processing complete")
    
    def _compute_sentiments(self):
        try:
            from nltk.sentiment import SentimentIntensityAnalyzer
            sia = SentimentIntensityAnalyzer()
            
            for instance in self.frame_instances:
                scores = sia.polarity_scores(instance['sentence'])
                instance['sentiment_score'] = scores['compound']
                
                if scores['compound'] > 0.05:
                    instance['sentiment_label'] = 'positive'
                elif scores['compound'] < -0.05:
                    instance['sentiment_label'] = 'negative'
                else:
                    instance['sentiment_label'] = 'neutral'
        except:
            for instance in self.frame_instances:
                instance['sentiment_score'] = 0
                instance['sentiment_label'] = 'neutral'
    
    def display_sample_results(self, num_samples=5):
        print("\n" + "="*80)
        print(f"Examples (showing {min(num_samples, len(self.frame_instances))} of {len(self.frame_instances)} total)")
        print("="*80 + "\n")
        
        for i, instance in enumerate(self.frame_instances[:num_samples]):
            print(f"Example {i+1} (Sentence {instance['sentence_num']})")
            print(f"FRAME TYPE: {instance['frame']}")
            print(f"TRIGGER WORD: '{instance['lexical_unit']}'")
            print(f"\nSENTENCE:\n{instance['sentence']}\n")
            
            if instance['elements']:
                print("DETECTED ROLES:")
                for role, value in instance['elements'].items():
                    print(f"  {role}: {value}")
            else:
                print("(No frame elements extracted)")
            
            print()
    
    def get_statistics(self):
        frame_counts = Counter(inst['frame'] for inst in self.frame_instances)
        
        stats = {
            'total_sentences': len(self.sentences),
            'sentences_with_frames': len(self.frame_instances),
            'detection_rate_pct': round(len(self.frame_instances) / len(self.sentences) * 100, 2),
            'frame_counts': dict(frame_counts)
        }
        return stats
    
    def print_summary(self):
        stats = self.get_statistics()
        
        print("\n" + "="*80)
        print("ANALYSIS SUMMARY")
        print("="*80)
        print(f"\nBook: {os.path.basename(self.corpus_path)}")
        print(f"Total Sentences: {stats['total_sentences']:,}")
        print(f"Sentences with Frames Detected: {stats['sentences_with_frames']:,}")
        print(f"Detection Rate: {stats['detection_rate_pct']}%")
        print("\nBREAKDOWN BY SCENE TYPE:")
        for frame, count in sorted(stats['frame_counts'].items(), key=lambda x: x[1], reverse=True):
            pct = round(count / stats['sentences_with_frames'] * 100, 1)
            print(f"  {frame}: {count} instances ({pct}% of all detected frames)")
    
    def print_sentiment_analysis(self):
        print("\n" + "="*80)
        print("SENTIMENT ANALYSIS")
        print("="*80 + "\n")
        
        sentiment_by_frame = defaultdict(list)
        for instance in self.frame_instances:
            sentiment_by_frame[instance['frame']].append(instance['sentiment_score'])
        
        for frame in ['Revenge', 'Hunting', 'Travel']:
            if frame in sentiment_by_frame:
                scores = sentiment_by_frame[frame]
                avg_score = sum(scores) / len(scores)
                
                pos_count = sum(1 for s in scores if s > 0.05)
                neg_count = sum(1 for s in scores if s < -0.05)
                neu_count = len(scores) - pos_count - neg_count
                
                print(f"{frame} scenes:")
                print(f"  Average sentiment: {avg_score:.3f}")
                print(f"  Positive: {pos_count}, Negative: {neg_count}, Neutral: {neu_count}")
                print()
    
    def print_temporal_distribution(self):
        print("="*80)
        print("TEMPORAL DISTRIBUTION")
        print("="*80 + "\n")
        
        total = len(self.sentences)
        chunk_size = total // 10
        
        distribution = {f: [0]*10 for f in ['Revenge', 'Hunting', 'Travel']}
        
        for instance in self.frame_instances:
            chunk = min(instance['sentence_num'] // chunk_size, 9)
            distribution[instance['frame']][chunk] += 1
        
        print("Frame occurrences across the book (divided into 10 equal segments):\n")
        
        for frame, counts in distribution.items():
            total_frame = sum(counts)
            if total_frame > 0:
                print(f"{frame}:")
                bars = ['|' + '*' * (count * 40 // max(counts)) if count > 0 else '|' for count in counts]
                for i, (count, bar) in enumerate(zip(counts, bars)):
                    print(f"  Section {i+1:2d}: {bar} {count}")
                print()
    
    def print_vocabulary_analysis(self):
        """Print up to eight vocabulary terms per frame type.

        For each of the Revenge, Hunting and Travel frames with more than one
        sentence, a ``TfidfVectorizer`` limited to ``max_features=8`` with
        English stop words removed is fitted on the frame's sentences and its
        feature names are printed. If the vectorizer rejects the input with
        ``ValueError``, "insufficient data" is printed for that frame.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer

        print("="*80)
        print("VOCABULARY ANALYSIS")
        print("="*80 + "\n")

        frame_texts = {}
        for instance in self.frame_instances:
            frame_texts.setdefault(instance['frame'], []).append(instance['sentence'])

        print("Most distinctive words by frame type:\n")

        for frame in ['Revenge', 'Hunting', 'Travel']:
            sentences = frame_texts.get(frame, [])
            if len(sentences) <= 1:
                continue

            # NOTE(review): max_features appears to select terms by corpus
            # frequency rather than by TF-IDF weight, so "distinctive" in the
            # heading may overstate what is shown - confirm against the
            # scikit-learn documentation.
            vectorizer = TfidfVectorizer(max_features=8, stop_words='english')
            try:
                # Only the fitted vocabulary is needed, not the matrix.
                vectorizer.fit(sentences)
            except ValueError:
                # Catch only the vectorizer's own rejection of the input;
                # other errors (e.g. an API mismatch) should not be reported
                # as "insufficient data".
                print(f"{frame}: insufficient data")
                continue

            words = vectorizer.get_feature_names_out()
            print(f"{frame}: {', '.join(words)}")
    
    def print_word_frequency_analysis(self):
        print("\n" + "="*80)
        print("WORD FREQUENCY ANALYSIS")
        print("="*80 + "\n")
        
        top_words = self.word_frequencies.most_common(20)
        
        print("Most frequent words in the text:\n")
        
        max_count = top_words[0][1] if top_words else 1
        
        for word, count in top_words:
            bar_length = int((count / max_count) * 40)
            bar = '=' * bar_length
            print(f"  {word:15s} {bar} {count:,}")
        
        total_words = sum(self.word_frequencies.values())
        unique_words = len(self.word_frequencies)
        
        print(f"\nTotal words: {total_words:,}")
        print(f"Unique words: {unique_words:,}")
        print(f"Vocabulary richness: {unique_words/total_words:.4f}")
    
    def generate_random_sentence(self):
        print("\n" + "="*80)
        print("PROBABILISTIC SENTENCE GENERATION")
        print("="*80 + "\n")
        
        if not self.bigram_frequencies:
            print("No language model available")
            return
        
        sentence_words = []
        
        all_first_words = [w1 for w1, w2 in self.bigram_frequencies.keys()]
        current = random.choice(all_first_words)
        sentence_words.append(current)
        
        for _ in range(random.randint(8, 15)):
            next_options = [(w2, count) for (w1, w2), count in self.bigram_frequencies.items() if w1 == current]
            
            if not next_options:
                break
            
            total = sum(count for _, count in next_options)
            rand_val = random.uniform(0, total)
            
            cumsum = 0
            for next_word, count in next_options:
                cumsum += count
                if rand_val <= cumsum:
                    current = next_word
                    sentence_words.append(current)
                    break
        
        generated = ' '.join(sentence_words).capitalize() + '.'
        
        print("Generated sentence based on bigram probabilities however may not follow all grammar rules due to this:\n")
        print(f'"{generated}"')
    
    def export_to_csv(self, filename='frame_detections.csv'):
        data = []
        for instance in self.frame_instances:
            row = {
                'Sentence_Number': instance['sentence_num'],
                'Frame_Type': instance['frame'],
                'Trigger_Word': instance['lexical_unit'],
                'Sentiment': instance['sentiment_label'],
                'Sentiment_Score': instance['sentiment_score'],
                'Full_Sentence': instance['sentence']
            }
            
            for role in ['Hunter', 'Quarry', 'Instrument', 'Location', 
                        'Avenger', 'Offender', 'Injury', 'Punishment',
                        'Traveler', 'Source', 'Goal', 'Path', 'Vehicle']:
                row[role] = instance['elements'].get(role, '')
            
            data.append(row)
        
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"\nExported {len(df)} detections to '{filename}'")
        return df


if __name__ == "__main__":
    # Title banner.
    print("\n" + "=" * 80)
    print("LITERARY SEMANTIC FRAME ANALYZER")
    print("=" * 80 + "\n")

    analyzer = LiteraryFrameAnalyzer(corpus_path='Moby Dick.txt', gazetteer_path='gazeteer2.0')
    analyzer.process_corpus()

    # Text reports, in the order they appear in the output.
    for _report in (
        analyzer.print_summary,
        analyzer.print_sentiment_analysis,
        analyzer.print_temporal_distribution,
        analyzer.print_vocabulary_analysis,
        analyzer.print_word_frequency_analysis,
        analyzer.generate_random_sentence,
    ):
        _report()
    del _report  # leave only `analyzer` and `df` behind, as before

    analyzer.display_sample_results(num_samples=20)

    df = analyzer.export_to_csv('moby_dick_frame_analysis.csv')


LITERARY SEMANTIC FRAME ANALYZER

Processing 9184 sentences...
  Progress: 0/9184 sentences
  Progress: 1000/9184 sentences
  Progress: 2000/9184 sentences
  Progress: 3000/9184 sentences
  Progress: 4000/9184 sentences
  Progress: 5000/9184 sentences
  Progress: 6000/9184 sentences
  Progress: 7000/9184 sentences
  Progress: 8000/9184 sentences
  Progress: 9000/9184 sentences
Processing complete

ANALYSIS SUMMARY

Book: Moby Dick.txt
Total Sentences: 9,184
Sentences with Frames Detected: 512
Detection Rate: 5.57%

What this means: Out of every 100 sentences, about 5 contain revenge, hunting, or travel scenes.

BREAKDOWN BY SCENE TYPE:
  Travel: 255 instances (49.8% of all detected frames)
  Hunting: 214 instances (41.8% of all detected frames)
  Revenge: 43 instances (8.4% of all detected frames)

SENTIMENT ANALYSIS

Revenge scenes:
  Average sentiment: -0.081
  Positive: 15, Negative: 21, Neutral: 7

Hunting scenes:
  Average sentiment: -0.106
  Positive: 83, Negative: 104, Neutral: