In [1]:
from stanfordcorenlp import StanfordCoreNLP
import json
import random
import time

In [2]:
def extract_entities_and_adjectives(text):
    """Map each PERSON token to the adjectives sharing a sentence with it.

    The text is sent to the module-level ``nlp`` client with the
    tokenize/ssplit/pos/lemma/ner annotators and the JSON reply is parsed.
    No syntactic link is required: every adjective-tagged token of a
    sentence is credited to every PERSON token of that same sentence.

    Args:
        text: Raw text to annotate.

    Returns:
        dict mapping a PERSON token's word to the set of adjective words
        collected over all sentences where that word appears. An empty dict
        is returned (after printing a message) if the reply is not valid
        JSON.
    """
    request_properties = {
        'annotators': 'tokenize,ssplit,pos,lemma,ner',
        'outputFormat': 'json',
    }
    raw_reply = nlp.annotate(text, properties=request_properties)

    try:
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    adjectives_by_person = {}
    for sent in parsed.get('sentences', []):
        tokens = sent.get('tokens', [])
        persons = {tok.get('word') for tok in tokens if tok.get('ner') == 'PERSON'}
        # Any tag beginning with JJ counts as an adjective (JJ, JJR, JJS).
        adjectives = {
            tok.get('word') for tok in tokens
            if tok.get('pos') and tok.get('pos').startswith('JJ')
        }
        for person in persons:
            adjectives_by_person.setdefault(person, set()).update(adjectives)

    return adjectives_by_person

def extract_entities_and_adjectives_related(text):
    """Map each PERSON token to the words attached to it by an ``amod`` edge.

    The text is annotated through the module-level ``nlp`` client (with
    ``depparse`` enabled). For every ``amod`` entry in a sentence's
    ``enhancedPlusPlusDependencies``, the dependent word is recorded for any
    PERSON token of that sentence whose word equals the governor word or the
    dependent word. Matching is by word text, not by token position, and a
    person that is itself the dependent receives its own word.

    Args:
        text: Raw text to annotate.

    Returns:
        dict mapping every PERSON token's word to a set of words; a person
        with no matching ``amod`` edge maps to an empty set. An empty dict
        is returned (after printing a message) if the reply is not valid
        JSON.
    """
    raw_reply = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse',
        'outputFormat': 'json'
    })

    try:
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    adjectives_by_person = {}

    for sent in parsed.get('sentences', []):
        persons = {
            tok.get('word') for tok in sent.get('tokens', [])
            if tok.get('ner') == 'PERSON'
        }
        # Every person is listed in the result, with or without adjectives.
        for person in persons:
            adjectives_by_person.setdefault(person, set())

        for edge in sent.get('enhancedPlusPlusDependencies', []):
            # Dependency indices are shifted down by one to address `tokens`.
            head_pos = edge.get('governor') - 1
            child_pos = edge.get('dependent') - 1
            edge_relation = edge.get('dep')

            head = sent['tokens'][head_pos]['word']
            child = sent['tokens'][child_pos]['word']

            if edge_relation != 'amod':
                continue

            for person in persons:
                if person in (head, child):
                    adjectives_by_person[person].add(child)

    return adjectives_by_person

def extract_entities_and_adjectives_and_verbs_with_coref(text):
    """Collect descriptor words for persons, following coreference chains.

    The text is annotated through the module-level ``nlp`` client with the
    ``depparse`` and ``coref`` annotators. Processing has two phases:

    1. For each chain in ``corefs``, the first mention containing PERSON
       tokens tagged NNP/NNPS provides the canonical name (those words
       joined by spaces). Every mention text of that chain is then mapped
       to the canonical name. Chains without such a mention are ignored.
    2. For each dependency edge, the governor/dependent *words* are looked
       up in that mapping: an ``amod`` edge whose governor is a known
       mention contributes the dependent word; a ``cop``/``nsubj``/
       ``xcomp``/``acl`` edge whose dependent is a known mention contributes
       the governor word.

    Lookups use a single token's word, so only mentions whose text is one
    token can match in phase 2.

    Args:
        text: Raw text to annotate.

    Returns:
        dict mapping a canonical person name to a set of descriptor words.
        An empty dict is returned (after printing a message) if the reply
        is not valid JSON.
    """
    raw_reply = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse,coref',
        'outputFormat': 'json'
    })

    try:
        parsed = json.loads(raw_reply)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    def mention_tokens(mention):
        # Mention offsets are shifted down by one before slicing the tokens.
        sent_no = mention['sentNum'] - 1
        first = mention['startIndex'] - 1
        stop = mention['endIndex'] - 1
        return parsed['sentences'][sent_no]['tokens'][first:stop]

    # Phase 1: mention text -> canonical person name.
    alias_to_person = {}
    for chain in parsed.get('corefs', {}).values():
        canonical = None
        for mention in chain:
            names = [
                tok['word'] for tok in mention_tokens(mention)
                if tok['ner'] == 'PERSON' and tok['pos'] in ('NNP', 'NNPS')
            ]
            if names:
                canonical = " ".join(names)
                break

        if not canonical:
            continue

        for mention in chain:
            alias = " ".join(tok['word'] for tok in mention_tokens(mention))
            alias_to_person[alias] = canonical

    # Phase 2: walk the dependency edges and attach descriptor words.
    descriptors = {}
    person_as_dependent = ('cop', 'nsubj', 'xcomp', 'acl')
    for sent in parsed.get('sentences', []):
        for edge in sent.get('enhancedPlusPlusDependencies', []):
            head_pos = edge.get('governor') - 1
            child_pos = edge.get('dependent') - 1
            relation = edge.get('dep')

            head = sent['tokens'][head_pos]['word']
            child = sent['tokens'][child_pos]['word']

            if relation == 'amod' and head in alias_to_person:
                descriptors.setdefault(alias_to_person[head], set()).add(child)
            elif relation in person_as_dependent and child in alias_to_person:
                descriptors.setdefault(alias_to_person[child], set()).add(head)

    return descriptors

def extract_entities_and_adjectives_with_corefV2(text):
    """Collect adjectives, verbs and nouns linked to persons, using coreference.

    The text is annotated through the module-level ``nlp`` client with the
    ``depparse`` and ``coref`` annotators. Three steps follow:

    1. Every PERSON token's word is registered as a mention of itself.
    2. For each chain in ``corefs``, the first mention containing PERSON
       tokens tagged NNP/NNPS provides the canonical name; every mention
       text of that chain is then mapped to it (overriding step 1 for
       identical texts).
    3. Each dependency edge is matched, by governor/dependent *word*,
       against that mapping:
         * ``amod`` with a known governor adds the dependent word;
         * ``cop``/``nsubj``/``xcomp``/``acl`` with a known dependent adds
           the governor word;
         * ``nsubj``/``dobj``/``obj``/``iobj`` where either end is tagged
           NN* and either end is a known mention adds both words.

    Both ``dobj`` (UD v1) and ``obj`` (UD v2) are accepted for direct
    objects so the object branch works regardless of which label set the
    CoreNLP release emits.

    Args:
        text: Raw text to annotate.

    Returns:
        dict mapping a person name to a set of related words. An empty dict
        is returned (after printing a message) if the reply is not valid
        JSON.
    """
    annotated_text = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse,coref',
        'outputFormat': 'json'
    })

    try:
        annotated_json = json.loads(annotated_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    entities = {}

    coref_persons = {}
    coreferences = annotated_json.get('corefs', {})

    # Step 1: each PERSON token is, by default, a mention of itself.
    for sentence in annotated_json.get('sentences', []):
        for token in sentence.get('tokens', []):
            if token['ner'] == 'PERSON':
                coref_persons.setdefault(token['word'], token['word'])

    # Step 2: map every mention text of a chain to the chain's proper name.
    for mentions in coreferences.values():
        main_person_name = None

        for mention in mentions:
            sentence_index = mention['sentNum'] - 1
            token_start = mention['startIndex'] - 1
            token_end = mention['endIndex'] - 1
            tokens = annotated_json['sentences'][sentence_index]['tokens'][token_start:token_end]

            proper_names = [
                token['word'] for token in tokens
                if token['ner'] == 'PERSON' and token['pos'] in ('NNP', 'NNPS')
            ]
            if proper_names:
                main_person_name = " ".join(proper_names)
                break

        if main_person_name:
            for mention in mentions:
                sentence_index = mention['sentNum'] - 1
                token_start = mention['startIndex'] - 1
                token_end = mention['endIndex'] - 1
                tokens = annotated_json['sentences'][sentence_index]['tokens'][token_start:token_end]
                mention_text = " ".join(token['word'] for token in tokens)

                coref_persons[mention_text] = main_person_name

    person_as_dependent = ('cop', 'nsubj', 'xcomp', 'acl')
    # 'obj' is the UD v2 name of the relation UD v1 called 'dobj'.
    noun_relations = ('nsubj', 'dobj', 'obj', 'iobj')

    # Step 3: use the dependency parse to find adjective, verb and noun links.
    for sentence in annotated_json.get('sentences', []):
        for dep in sentence.get('enhancedPlusPlusDependencies', []):
            # NOTE(review): a governor of 0 gives index -1, which reads the
            # last token; presumably only the ROOT edge does this and its
            # relation matches none of the branches below. Confirm.
            gov_index = dep.get('governor') - 1  # 1-based to 0-based index
            dep_index = dep.get('dependent') - 1
            dep_relation = dep.get('dep')

            governor_word = sentence['tokens'][gov_index]['word']
            dependent_word = sentence['tokens'][dep_index]['word']
            governor_pos = sentence['tokens'][gov_index]['pos']
            dependent_pos = sentence['tokens'][dep_index]['pos']

            linked_entity = None

            # Adjectival modifier of a person mention.
            if dep_relation == 'amod' and governor_word in coref_persons:
                linked_entity = coref_persons[governor_word]
                if linked_entity:
                    entities.setdefault(linked_entity, set()).add(dependent_word)

            # Verb or predicate governing a person mention.
            elif dep_relation in person_as_dependent and dependent_word in coref_persons:
                linked_entity = coref_persons[dependent_word]
                if linked_entity:
                    entities.setdefault(linked_entity, set()).add(governor_word)

            # Subject/object edge with a noun on either end and a person
            # mention on either end: keep both words.
            elif dep_relation in noun_relations and (
                governor_pos.startswith('NN') or dependent_pos.startswith('NN')
            ):
                if governor_word in coref_persons:
                    linked_entity = coref_persons[governor_word]
                elif dependent_word in coref_persons:
                    linked_entity = coref_persons[dependent_word]

                if linked_entity:
                    linked_words = entities.setdefault(linked_entity, set())
                    linked_words.add(governor_word)
                    linked_words.add(dependent_word)

    return entities



def extract_entities_and_verbs(text):
    """Map each PERSON token to the verbs sharing a sentence with it.

    The text is sent to the module-level ``nlp`` client with the
    tokenize/ssplit/pos/lemma/ner annotators. No syntactic link is
    required: every verb-tagged token of a sentence is credited to every
    PERSON token of that same sentence.

    Args:
        text: Raw text to annotate.

    Returns:
        dict mapping a PERSON token's word to the set of verb words
        collected over *all* sentences. The dict is empty when no person is
        found, when the text yields no sentences, or (after printing a
        message) when the reply is not valid JSON.
    """
    annotated_text = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner',
        'outputFormat': 'json'
    })

    try:
        annotated_json = json.loads(annotated_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    entities = {}

    for sentence in annotated_json.get('sentences', []):
        sentence_verbs = set()
        sentence_names = set()

        for word in sentence.get('tokens', []):
            word_text = word.get('word')
            pos_tag = word.get('pos')

            if word.get('ner') == 'PERSON':
                sentence_names.add(word_text)

            if pos_tag and pos_tag.startswith('VB'):  # VB, VBD, VBG, VBN, VBP, VBZ
                sentence_verbs.add(word_text)

        for name in sentence_names:
            entities.setdefault(name, set()).update(sentence_verbs)

    # Must stay outside the sentence loop: returning from inside it would
    # stop after the first sentence and yield None for a text with none.
    return entities
        
def retry_request(text, retries=3, delay=5):
    """Annotate ``text`` for sentiment, retrying when a request fails.

    Each attempt calls the module-level ``nlp`` client with the
    tokenize/ssplit/pos/ner/sentiment annotators and parses the reply as
    JSON. Any exception (from the request or from parsing) is printed and
    counts as a failed attempt.

    Args:
        text: Raw text to annotate.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between two attempts; there is no sleep
            after the last one.

    Returns:
        The parsed JSON reply of the first successful attempt, or ``None``
        when every attempt failed (or ``retries`` is not positive).
    """
    final_attempt = retries - 1
    attempt = 0

    while attempt < retries:
        try:
            raw_reply = nlp.annotate(text, properties={
                'annotators': 'tokenize,ssplit,pos,ner,sentiment',
                'outputFormat': 'json',
                'timeout': 30000
            })
            return json.loads(raw_reply)
        except Exception as error:
            print(f"Attempt {attempt + 1} failed: {error}")

        if attempt >= final_attempt:
            return None

        # Pause before trying again.
        time.sleep(delay)
        attempt += 1

    return None
            
def extract_entity_sentiments(text):
    """Count sentence sentiments overall and per PERSON token.

    The annotated document is obtained from ``retry_request``. Each
    sentence's ``sentiment`` label is folded into one of three categories
    ("Good", "Neutral", "Bad"); the category is counted once for the whole
    text and once for every distinct PERSON token word in that sentence.

    A sentence whose label is missing or not one of the five recognised
    values is counted in neither tally, but its PERSON tokens are still
    registered (with zero counts if they appear nowhere else).

    Args:
        text: Raw text to annotate.

    Returns:
        A pair ``(overall_counts, per_entity_counts)``. ``overall_counts``
        maps "Good"/"Neutral"/"Bad" to a sentence count;
        ``per_entity_counts`` maps a PERSON token's word to the same kind of
        counter. If annotation failed, a message is printed and
        ``({}, {})`` is returned.
    """
    annotated_json = retry_request(text)
    if annotated_json is None:
        print("Failed to process the text.")
        return {}, {}

    category_by_label = {
        'Verypositive': "Good",
        'Positive': "Good",
        'Neutral': "Neutral",
        'Negative': "Bad",
        'Verynegative': "Bad",
    }

    entities_sentiments = {}
    overall_sentiment_counts = {"Good": 0, "Neutral": 0, "Bad": 0}

    for sentence in annotated_json.get('sentences', []):
        # None when the label is absent or unrecognised.
        sentiment_category = category_by_label.get(sentence.get('sentiment'))

        if sentiment_category:
            overall_sentiment_counts[sentiment_category] += 1

        sentence_entities = {
            token['word'] for token in sentence.get('tokens', [])
            if token.get('ner') == 'PERSON'
        }

        for entity in sentence_entities:
            counts = entities_sentiments.setdefault(
                entity, {"Good": 0, "Neutral": 0, "Bad": 0}
            )
            # Guarded like the overall tally: indexing the counter with a
            # None category would raise KeyError.
            if sentiment_category:
                counts[sentiment_category] += 1

    return overall_sentiment_counts, entities_sentiments


In [3]:
# Absolute location of the plot-summary file on the author's machine.
file_path = r'C:\Users\valbi\Desktop\Ma3\ADA\Projet1\ada-2024-project-spaghettisolution\data\raw_data\plot_summaries.txt'

# Description of the file layout; only used for the printout below.
file_structure = dict(
    columns=['wiki_id', 'plot_summary'],
    delimiter='\t',
    encoding='utf-8',
)

# Report the layout, one property per line.
print(
    "File Structure:",
    f"Columns: {file_structure['columns']}",
    f"Delimiter: {file_structure['delimiter']}",
    f"Encoding: {file_structure['encoding']}",
    sep="\n",
)

# Load every line of the file into memory.
with open(file_path, 'r', encoding='utf-8') as file:
    content = list(file)

# Draw 5 distinct lines at random and preview the second one.
random_films = random.sample(content, 5)
print(random_films[1])

File Structure:
Columns: ['wiki_id', 'plot_summary']
Delimiter: 	
Encoding: utf-8
21114814	A miserly man cheats his wife one night. A series from misunderstandings ensue that will entirely change his life.



In [4]:
# Create the CoreNLP client from the local distribution folder. The
# extract_* helpers and retry_request read this module-level `nlp` name.
nlp = StanfordCoreNLP(r'stanford-corenlp-4.5.7\stanford-corenlp-4.5.7')

try:
    for film in random_films:
        # Run every extraction strategy on the same film line.
        entities_adjectives = extract_entities_and_adjectives(film)
        entities_adjectives_related = extract_entities_and_adjectives_related(film)
        entities_adjectives_coref = extract_entities_and_adjectives_and_verbs_with_coref(film)
        entities_adjectives_corefV2 = extract_entities_and_adjectives_with_corefV2(film)
        entities_verbs = extract_entities_and_verbs(film)
        overall_sentiment_counts, entities_sentiments = extract_entity_sentiments(film)

        print("\nFilm:")
        print(film)

        # (section heading, per-line label, person -> set of words)
        word_reports = (
            ("\nAdjectives:", "Adjectives", entities_adjectives),
            ("\nAdjectives Related:", "Adjectives", entities_adjectives_related),
            ("\nAdjectives and verbs Coref:", "Adjectives", entities_adjectives_coref),
            ("\nAdjectives and verbs CorefV2:", "Adjectives", entities_adjectives_corefV2),
            ("\nVerbs:", "Verbs", entities_verbs),
        )
        for heading, label, words_by_entity in word_reports:
            print(heading)
            for entity, words in words_by_entity.items():
                print(f"Entity: {entity}, {label}: {', '.join(words)}")

        print("\nSentiments:")
        for entity, sentiments in entities_sentiments.items():
            print(f"Entity: {entity}, Sentiments: {sentiments}")
finally:
    # Close the client even when an extractor raises, so the backend it
    # started is not left running after a failed cell.
    nlp.close()



Film:
33103584	Opium smugglers work in Sydney. There is a car chase which ends in a crash, a cabaret which turns into a church, a yacht race in Sydney harbour, and 40 bathing beauties.


Adjectives:

Adjectives Related:

Adjectives and verbs Coref:

Adjectives and verbs CorefV2:

Verbs:

Sentiments:

Film:
21114814	A miserly man cheats his wife one night. A series from misunderstandings ensue that will entirely change his life.


Adjectives:

Adjectives Related:

Adjectives and verbs Coref:

Adjectives and verbs CorefV2:

Verbs:

Sentiments:
Attempt 1 failed: Expecting value: line 1 column 1 (char 0)

Film:
8235569	Good and Evil are two sides of a same coin. If good grows in strength so does evil. And there comes a point when one must overcome the other. Dr.Gayatri  is an Indian American paranormal researcher at University of California, researching the esoteric practices like voodoo, spirit possession, magic and healing powers. She seeks to find the hidden knowledge that goes into su

In [5]:
# Print the fourth entry (index 3) of the films sampled earlier in the notebook.
print(random_films[3])

2676497	The film is a romantic melodramaabout two childhood friends who grow up to be soldiers in Austria. One of the friends, Leo,  becomes infatuated with Felicitas , who turns out to be the wife of a powerful count . The count calls Leo out for a duel of honor, but insists that it be done under the false pretense that the quarrel was due to angry words exchanged between the two at a card game in order to protect the count's reputation. Leo kills the count in the duel, but then is punished by the military, being sent away to Africa for five years. Due to Ulrich's intervention, Leo only serves three years before being recalled home. He return journey focuses on his dream of being reunited with Felicitas. Before he left for Africa, Leo had ask Ulrich  to take care of Felicitas' needs while he was away, but Ulrich — unaware that his friend is in love with Felicitas — falls in love with her himself and marries her. Upon his return, Leo finds himself torn between temptation for Felicitas 