In [3]:
from stanfordcorenlp import StanfordCoreNLP
import json
import random
import time
import pandas as pd
import numpy as np
import random

In [18]:
def prepare_persona_data(genres_filter=None, country_filter=None, title=None):
    movies_df = pd.read_csv(f'data\cleaned_data\movies_data.csv')
    
    if genres_filter:
        if country_filter:
            new_movies_df = movies_df[(movies_df['genres'].str.contains(genres_filter, case=False, na=False)) & 
                                      (movies_df['country'].str.contains(country_filter, case=False, na=False))]
        else:
            new_movies_df = movies_df[movies_df['genres'].str.contains(genres_filter, case=False, na=False)]
    else:
        if country_filter:
            new_movies_df = movies_df[movies_df['country'].str.contains(country_filter, case=False, na=False)]
        else:
            new_movies_df = movies_df
    
    new_movies_df['Persona'] = None
    new_movies_df.to_csv(f'data\cleaned_data\{title}_movies_data.csv', index=False)
    return new_movies_df


def extract_entities_and_related_words_with_coref(text):
    annotated_text = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,ner,depparse,coref',
        'outputFormat': 'json'
    })
    
    try:
        annotated_json = json.loads(annotated_text)
    except json.JSONDecodeError as e:
        print(f"Error parsing the JSON response: {e}")
        return {}

    entities = {}

    coref_persons = {}
    coreferences = annotated_json.get('corefs', {})

    for sentence_index, sentence in enumerate(annotated_json.get('sentences', [])):
        for token in sentence.get('tokens', []):
            if token['ner'] == 'PERSON':
                person_name = token['word']
                if person_name not in coref_persons:
                    coref_persons[person_name] = person_name

    for coref_id, mentions in coreferences.items():
        main_person_name = None

        for mention in mentions:
            #adapt the index to 0-based for python
            sentence_index = mention['sentNum'] - 1
            token_start = mention['startIndex'] - 1
            token_end = mention['endIndex'] - 1
            tokens = annotated_json['sentences'][sentence_index]['tokens'][token_start:token_end]

            person_tokens = [token for token in tokens if token['ner'] == 'PERSON']
            if person_tokens:
                proper_names = [
                    token['word'] for token in person_tokens if token['pos'] in ['NNP', 'NNPS']
                ]
                if proper_names:
                    main_person_name = " ".join(proper_names)
                    break

        if main_person_name:
            for mention in mentions:
                sentence_index = mention['sentNum'] - 1
                token_start = mention['startIndex'] - 1
                token_end = mention['endIndex'] - 1
                tokens = annotated_json['sentences'][sentence_index]['tokens'][token_start:token_end]
                mention_text = " ".join(token['word'] for token in tokens)

                coref_persons[mention_text] = main_person_name

    for sentence_index, sentence in enumerate(annotated_json.get('sentences', [])):
        # Use dependency parsing to find adjective, noun, and verb modifiers
        for dep in sentence.get('enhancedPlusPlusDependencies', []):
            gov_index = dep.get('governor') - 1  
            dep_index = dep.get('dependent') - 1
            dep_relation = dep.get('dep')

            governor_word = sentence['tokens'][gov_index]['word']
            dependent_word = sentence['tokens'][dep_index]['word']
            governor_pos = sentence['tokens'][gov_index]['pos']
            dependent_pos = sentence['tokens'][dep_index]['pos']

            linked_entity = None

            # Check if the dependency is 'amod' a PERSON is mentionned or coref
            if dep_relation == 'amod' and governor_word in coref_persons:
                linked_entity = coref_persons[governor_word]
                if linked_entity:
                    if linked_entity not in entities:
                        entities[linked_entity] = set()
                    entities[linked_entity].add(dependent_word)

            # Check if the relation is a verb or adjective linked to a PERSON entity
            elif dep_relation in ['cop', 'nsubj', 'xcomp', 'acl'] and dependent_word in coref_persons:
                linked_entity = coref_persons[dependent_word]
                if linked_entity:
                    if linked_entity not in entities:
                        entities[linked_entity] = set()
                    entities[linked_entity].add(governor_word)

            # Check if the relation is a noun linked to a PERSON entity
            elif dep_relation in ['nsubj', 'dobj', 'iobj'] and (governor_pos.startswith('NN') or dependent_pos.startswith('NN')):
                if governor_word in coref_persons:
                    linked_entity = coref_persons[governor_word]
                elif dependent_word in coref_persons:
                    linked_entity = coref_persons[dependent_word]

                if linked_entity:
                    if linked_entity not in entities:
                        entities[linked_entity] = set()
                    entities[linked_entity].add(governor_word)
                    entities[linked_entity].add(dependent_word)

    return entities
        
def retry_request(text, retries=3, delay=5):

    for attempt in range(retries):
        try:
            annotated_text = nlp.annotate(text, properties={
                'annotators': 'tokenize,ssplit,pos,ner,sentiment',
                'outputFormat': 'json',
                'timeout': 30000 
            })
            return json.loads(annotated_text)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(delay) 
            else:
                return None
            
def extract_entity_sentiments(text):
    
    annotated_json = retry_request(text)
    if annotated_json is None:
        print("Failed to process the text.")
        return {}, {}

    entities_sentiments = {}
    overall_sentiment_counts = {"Good": 0, "Neutral": 0, "Bad": 0}

    for sentence in annotated_json.get('sentences', []):
        sentiment = sentence.get('sentiment') 
        sentiment_category = None

        if sentiment in ['Verypositive', 'Positive']:
            sentiment_category = "Good"
        elif sentiment in ['Neutral']:
            sentiment_category = "Neutral"
        elif sentiment in ['Negative', 'Verynegative']:
            sentiment_category = "Bad"

        if sentiment_category:
            overall_sentiment_counts[sentiment_category] += 1

        sentence_entities = {
            token['word'] for token in sentence.get('tokens', [])
            if token.get('ner') == 'PERSON'
        }

        for entity in sentence_entities:
            if entity not in entities_sentiments:
                entities_sentiments[entity] = {"Good": 0, "Neutral": 0, "Bad": 0}
            entities_sentiments[entity][sentiment_category] += 1

    return overall_sentiment_counts, entities_sentiments

def process_personas_movies(df):
    nlp = StanfordCoreNLP(r'stanford-corenlp-4.5.7\stanford-corenlp-4.5.7')
    total_rows = len(df)

    for idx, (index, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing {idx}/{total_rows}")
        df.at[index, 'Persona'] = extract_entities_and_related_words_with_coref(row['plot']) if pd.notnull(row['plot']) else {}

    nlp.close()
    return df



In [6]:
file_path = r'C:\Users\valbi\Desktop\Ma3\ADA\Projet1\ada-2024-project-spaghettisolution\data\raw_data\plot_summaries.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    content = file.readlines()
    
# Select random films from the content
random_films = random.sample(content, 3)

["30316694\tAlisa Seleznyova and her father professor Seleznyov are travelling in space. They meet their old friend archaeologist Gromozeka, who's just discovered a planet all inhabitants of which died. It became known that they discovered a virus of hostility, got infected and killed each other. Gromozeka also discovered that they had left the virus on Earth 26000 years ago, and the virus is about to become loose. The only chance to save the Earth is to travel 26000 years back in time - to the epoch when witches, dragons and magicians lived along with usual people.{{ru icon}}Infos on mielofon.ru\n", '2812852\tIn Chicago, Darius Lovehall  is a poet who is giving a reading at the Sanctuary, an upscale nightclub presenting jazz and poetry to a bohemian clientele. Shortly before his set, he meets Nina Mosley , a gifted photographer who recently lost her job. They exchange small talk, and Darius makes his interest clear when he retitles his love poem "A Blues For Nina". A mutual attraction

In [8]:
nlp = StanfordCoreNLP(r'../stanford-corenlp-4.5.7\stanford-corenlp-4.5.7')

for film in random_films:
    entities_adjectives_corefV2 = extract_entities_and_related_words_with_coref(film)
    overall_sentiment_counts, entities_sentiments = extract_entity_sentiments(film)
    print("\nFilm:")
    print(film)

    print("\nAdjectives and verbs CorefV2:")
    for entity, adj_list in entities_adjectives_corefV2.items():
        print(f"Entity: {entity}, Adjectives: {', '.join(adj_list)}")

    print("\nSentiments:")
    for entity, sentiments in entities_sentiments.items():
        print(f"Entity: {entity}, Sentiments: {sentiments}")
        
nlp.close()



Film:
30316694	Alisa Seleznyova and her father professor Seleznyov are travelling in space. They meet their old friend archaeologist Gromozeka, who's just discovered a planet all inhabitants of which died. It became known that they discovered a virus of hostility, got infected and killed each other. Gromozeka also discovered that they had left the virus on Earth 26000 years ago, and the virus is about to become loose. The only chance to save the Earth is to travel 26000 years back in time - to the epoch when witches, dragons and magicians lived along with usual people.{{ru icon}}Infos on mielofon.ru


Adjectives and verbs CorefV2:
Entity: Seleznyova, Adjectives: travelling
Entity: Alisa Seleznyova Seleznyov, Adjectives: left, meet, discovered
Entity: Gromozeka, Adjectives: discovered

Sentiments:
Entity: Alisa, Sentiments: {'Good': 0, 'Neutral': 1, 'Bad': 0}
Entity: Seleznyov, Sentiments: {'Good': 0, 'Neutral': 1, 'Bad': 0}
Entity: Seleznyova, Sentiments: {'Good': 0, 'Neutral': 1, 'Ba

In [20]:
war_action_movies_df = prepare_persona_data(genres_filter='war|action',title='war_action')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['Persona'] = None


In [26]:
war_action_movies_df = pd.read_csv('data\cleaned_data\war_action_movies_data.csv')
war_action_movies_df = process_personas_movies(war_action_movies_df)
war_action_movies_df.to_csv('data\cleaned_data\war_action_movies_data.csv', index=False)

Processing 1/4000
Processing 2/4000
Processing 3/4000
Processing 4/4000
Processing 5/4000
Processing 6/4000
Processing 7/4000
Processing 8/4000
Processing 9/4000
Processing 10/4000
Processing 11/4000
Processing 12/4000
Processing 13/4000
Processing 14/4000
Processing 15/4000
Processing 16/4000
Processing 17/4000
Processing 18/4000
Processing 19/4000
Processing 20/4000
Processing 21/4000
Processing 22/4000
Processing 23/4000
Processing 24/4000
Processing 25/4000
Processing 26/4000
Processing 27/4000
Processing 28/4000
Processing 29/4000
Processing 30/4000
Processing 31/4000
Processing 32/4000
Processing 33/4000
Processing 34/4000
Processing 35/4000
Processing 36/4000
Processing 37/4000
Processing 38/4000
Processing 39/4000
Processing 40/4000
Processing 41/4000
Processing 42/4000
Processing 43/4000
Processing 44/4000
Processing 45/4000
Processing 46/4000
Processing 47/4000
Processing 48/4000
Processing 49/4000
Processing 50/4000
Processing 51/4000
Processing 52/4000
Processing 53/4000
Pr

In [27]:
war_action_movies_df.to_csv(f'war_action_movies_with_persona.csv', index=False)

In [22]:
USA_movies_df = prepare_persona_data(genres_filter="", country_filter='United States of America', title='USA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_df['Persona'] = None


In [5]:
USA_movies_df = process_personas_movies(USA_movies_df)
USA_movies_df.to_csv(f'data/cleaned_data/USA_movies_with_persona.csv', index=False)

(4000, 8)
Processing 1/4000
done
Processing 2/4000
Processing 3/4000
Processing 4/4000
Processing 5/4000
done
Processing 6/4000
Processing 7/4000
Processing 8/4000
Processing 9/4000
Processing 10/4000
Processing 11/4000
Processing 12/4000
Processing 13/4000
Processing 14/4000
Processing 15/4000
Processing 16/4000
Processing 17/4000
Processing 18/4000
Processing 19/4000
Processing 20/4000
Processing 21/4000
Processing 22/4000
Processing 23/4000
Processing 24/4000
done
Processing 25/4000
Processing 26/4000
Processing 27/4000
Processing 28/4000
Processing 29/4000
done
Processing 30/4000
Processing 31/4000
done
Processing 32/4000
Processing 33/4000
done
Processing 34/4000
Processing 35/4000
Processing 36/4000
done
Processing 37/4000
done
Processing 38/4000
Processing 39/4000
done
Processing 40/4000
done
Processing 41/4000
done
Processing 42/4000
done
Processing 43/4000
Processing 44/4000
Processing 45/4000
Processing 46/4000
done
Processing 47/4000
Processing 48/4000
Processing 49/4000
don

In [35]:
dfkaggle = pd.read_csv(r'data/raw/kaggleData/wiki_movie_plots_deduped.csv')

In [37]:
filtered_df = dfkaggle[(dfkaggle['Origin/Ethnicity'] == 'American') & 
                       (dfkaggle['Genre'].str.contains('Action|War', case=False, na=False))]


(1209, 8)

In [39]:
filtered_df = process_personas_movies(filtered_df)
filtered_df.to_csv(f'/data/clean_data/american_action_war_kaggle_with_persons', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Persona'] = None


Processing 1/1209
done
Processing 2/1209
done
Processing 3/1209
done
Processing 4/1209
done
Processing 5/1209
done
Processing 6/1209
Error parsing the JSON response: Expecting value: line 1 column 1 (char 0)
Processing 7/1209
done
Processing 8/1209
done
Processing 9/1209
done
Processing 10/1209
done
Processing 11/1209
done
Processing 12/1209
Error parsing the JSON response: Expecting value: line 1 column 1 (char 0)
Processing 13/1209
done
Processing 14/1209
Error parsing the JSON response: Expecting value: line 1 column 1 (char 0)
Processing 15/1209
done
Processing 16/1209
done
Processing 17/1209
done
Processing 18/1209
done
Processing 19/1209
done
Processing 20/1209
done
Processing 21/1209
done
Processing 22/1209
done
Processing 23/1209
done
Processing 24/1209
done
Processing 25/1209
done
Processing 26/1209
done
Processing 27/1209
done
Processing 28/1209
done
Processing 29/1209
done
Processing 30/1209
done
Processing 31/1209
done
Processing 32/1209
done
Processing 33/1209
done
Process

In [31]:
filtered_df = pd.read_csv('data/cleaned_data/american_action_war_kaggle_with_persons.csv')

In [32]:
dfwar_action = pd.read_csv('data/cleaned_data/war_action_movies_with_persona.csv')


In [36]:
filtered_titles_years = set(zip(filtered_df['Title'], filtered_df['Release Year']))
war_action_titles_years = set(zip(dfwar_action['name'].str.split(',').str[0], dfwar_action['release_date']))

common_titles_years = filtered_titles_years.intersection(war_action_titles_years)

not_common_titles_years = filtered_titles_years.symmetric_difference(war_action_titles_years)



dfwar_action_copy = dfwar_action.copy()

for index, row in filtered_df.iterrows():
    title = row['Title']
    release_year = row['Release Year']
    if (title, release_year) in common_titles_years:
        war_action_index = dfwar_action_copy[(dfwar_action_copy['name'].str.contains(title)) & (dfwar_action_copy['release_date'] == release_year)].index
        if not war_action_index.empty:
            # Update the plot and persona in dfwar_action_copy only if Persona is not empty
            if row['Persona'] != '{}':
                dfwar_action_copy.at[war_action_index[0], 'plot'] = row['Plot']
                dfwar_action_copy.at[war_action_index[0], 'Persona'] = row['Persona']
    else:
        new_row = pd.DataFrame([{
            'name': row['Title'],
            'country': row['Origin/Ethnicity'],
            'genres': row['Genre'],
            'plot': row['Plot'],
            'Persona': row['Persona']
        }])
        dfwar_action_copy = pd.concat([dfwar_action_copy, new_row], ignore_index=True)

dfwar_action_copy.to_csv('data/cleaned_data/updated_war_action_movies_with_persons.csv', index=False)