In [None]:
# Work on a copy so the plot-summary dataframe created above is left untouched.
df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

# Pick the spaCy pipeline from the hardware: the small model on CPU,
# the transformer-based model when CUDA is usable.
if not torch.cuda.is_available():
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")
else:
    print("CUDA is available. Using GPU.")
    # GPU activation for spaCy, taken from:
    # https://stackoverflow.com/questions/75355264/how-to-enable-cuda-gpu-acceleration-for-spacy-on-windows
    spacy.require_gpu()
    nlp = spacy.load("en_core_web_trf")

# 
def extract_GPE_and_compute_US_frequencies_batch(summaries, us_locations, batch_size=500):
    """
    Batch-process plot summaries with the global spaCy pipeline `nlp` and, for each summary,
    extract the GPE (geopolitical) entities and measure how many of them are US locations.
    Whether the pipeline runs on GPU or CPU is decided where `nlp` is loaded, not in this function.
    params: summaries: sequence of plot-summary strings, streamed through nlp.pipe
            us_locations: container of US location names (cities, counties, states); membership is
                          tested with `in`, so a set keeps the lookups cheap
            batch_size: number of texts handed to nlp.pipe per batch (default 500, the value that
                        used to be hard-coded)
    returns: a list with one tuple per summary, in input order:
             (GPE_entities, GPE_US_frequencies, Percentage_US_culture), where
             GPE_entities is the list of GPE mentions (repetitions included),
             GPE_US_frequencies is a Counter of the mentions found in us_locations, and
             Percentage_US_culture is the share (between 0 and 1) of GPE mentions that are US
             locations, or 0 when the summary has no GPE mention
    """
    results = []
    for doc in nlp.pipe(summaries, batch_size=batch_size):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_US_frequencies = Counter(ent for ent in GPE_entities if ent in us_locations)
        # Every mention counts once, so the total number of mentions is the list length.
        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Run the batched NER and attach the three result columns.
# The result frame is built on the dataframe's own index: when a DataFrame is assigned to
# columns, pandas aligns rows on index labels, so a default 0..n-1 index would misplace or
# blank out values whenever the dataframe does not carry a default RangeIndex (e.g. after filtering).
gpe_columns = ['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']
df_plots_us_partially_movies_GPE[gpe_columns] = pd.DataFrame(
    extract_GPE_and_compute_US_frequencies_batch(
        # The Series is converted to a Python list because nlp.pipe expects a sequence of
        # strings according to the documentation https://spacy.io/api/language#pipe
        df_plots_us_partially_movies_GPE['Summary'].tolist(), us_locations_set
    ),
    columns=gpe_columns,
    index=df_plots_us_partially_movies_GPE.index,
)

# Keep all the original columns plus the new GPE columns; drop the summaries for clarity and later use.
df_us_partially_movies_NLP_GPE = df_plots_us_partially_movies_GPE.drop(columns=['Summary'])

# The encoding is set explicitly to avoid encoding problems when the file is read back.
df_us_partially_movies_NLP_GPE.to_csv("data/us_partially_movies_NLP_GPE.csv", sep=',', encoding='utf-8', index=False, header = True)

print("NLP processing done.")

In [None]:
# Work on a copy of the US-only plot summaries so the source dataframe is left untouched.
df_plots_us_only_movies_GPE = plot_summaries_us_movies.copy()

# Start the wall-clock timer used for the execution-time report at the end of the cell.
start_time = time.time()
print("Processing starting.")

# Pick the spaCy pipeline from the hardware: the small model on CPU,
# the transformer-based model when CUDA is usable.
if not torch.cuda.is_available():
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")
else:
    print("CUDA is available. Using GPU.")
    # GPU activation for spaCy, taken from:
    # https://stackoverflow.com/questions/75355264/how-to-enable-cuda-gpu-acceleration-for-spacy-on-windows
    spacy.require_gpu()
    nlp = spacy.load("en_core_web_trf")

# 
def extract_GPE_and_compute_US_frequencies_batch(summaries, us_locations, batch_size=200):
    """
    Batch-process plot summaries with the global spaCy pipeline `nlp` and, for each summary,
    extract the GPE (geopolitical) entities and measure how many of them are US locations.
    Whether the pipeline runs on GPU or CPU is decided where `nlp` is loaded, not in this function.
    params: summaries: sequence of plot-summary strings, streamed through nlp.pipe
            us_locations: container of US location names (cities, counties, states); membership is
                          tested with `in`, so a set keeps the lookups cheap
            batch_size: number of texts handed to nlp.pipe per batch (default 200, the value that
                        used to be hard-coded in this cell)
    returns: a list with one tuple per summary, in input order:
             (GPE_entities, GPE_US_frequencies, Percentage_US_culture), where
             GPE_entities is the list of GPE mentions (repetitions included),
             GPE_US_frequencies is a Counter of the mentions found in us_locations, and
             Percentage_US_culture is the share (between 0 and 1) of GPE mentions that are US
             locations, or 0 when the summary has no GPE mention
    """
    results = []
    for doc in nlp.pipe(summaries, batch_size=batch_size):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_US_frequencies = Counter(ent for ent in GPE_entities if ent in us_locations)
        # Every mention counts once, so the total number of mentions is the list length.
        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Run the batched NER and attach the three result columns.
# The result frame is built on the dataframe's own index: when a DataFrame is assigned to
# columns, pandas aligns rows on index labels, so a default 0..n-1 index would misplace or
# blank out values whenever the dataframe does not carry a default RangeIndex (e.g. after filtering).
gpe_columns = ['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']
df_plots_us_only_movies_GPE[gpe_columns] = pd.DataFrame(
    extract_GPE_and_compute_US_frequencies_batch(
        # The Series is converted to a Python list because nlp.pipe expects a sequence of
        # strings according to the documentation https://spacy.io/api/language#pipe
        df_plots_us_only_movies_GPE['Summary'].tolist(), us_locations_set
    ),
    columns=gpe_columns,
    index=df_plots_us_only_movies_GPE.index,
)

# Keep all the original columns plus the new GPE columns; drop the summaries for clarity and later use.
df_us_only_movies_NLP_GPE = df_plots_us_only_movies_GPE.drop(columns=['Summary'])

# The encoding is set explicitly to avoid encoding problems when the file is read back.
df_us_only_movies_NLP_GPE.to_csv("data/us_only_movies_NLP_GPE.csv", sep=',', encoding='utf-8', index=False, header = True)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

In [None]:
# Pick the spaCy pipeline from the hardware: the small model on CPU,
# the transformer-based model when CUDA is usable.
if not torch.cuda.is_available():
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")
else:
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # switch spaCy to the GPU before loading the model
    nlp = spacy.load("en_core_web_trf")

def geocode_location(location):
    """
    Geocode one location name with Nominatim and tell whether the match lies in the US.
    params: location: the place name to look up
    returns: a tuple (location, check) where check is True only if a match was found and its
             string representation contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    match = geolocator.geocode(location)
    if not match:
        # No geocoding result: the location cannot be attributed to the US.
        return location, False
    return location, 'United States' in str(match)

def geocode_locations_parallel(locations):
    """
    Geocode several locations concurrently with a thread pool, showing a tqdm progress bar.
    params: locations: a sized sequence of place names (its length feeds the progress bar)
    returns: the list of (location, is_in_us) tuples produced by geocode_location, in input order
    """
    # NOTE(review): requests are issued from several threads without throttling; check this
    # against the usage policy of the Nominatim server being queried.
    with ThreadPoolExecutor() as pool:
        lookups = pool.map(geocode_location, locations)
        progress = tqdm(lookups, total=len(locations), desc="Geocoding Locations")
        return list(progress)

def extract_GPE_and_compute_US_frequencies_batch(summaries, batch_size=500):
    """
    Batch-process plot summaries with the global spaCy pipeline `nlp`, extract the GPE
    (geopolitical) entities of each summary and use geocoding to decide which are in the US.
    params: summaries: a sized sequence of plot-summary strings (its length feeds the progress bar)
            batch_size: number of texts handed to nlp.pipe per batch (default 500, the value that
                        used to be hard-coded)
    returns: a list with one tuple per summary, in input order:
             (GPE_entities, GPE_US_frequencies, Percentage_US_culture), where
             GPE_entities is the list of GPE mentions (repetitions included),
             GPE_US_frequencies is a Counter of the mentions geocoded to the US, and
             Percentage_US_culture is the share (between 0 and 1) of GPE mentions that are in
             the US, or 0 when the summary has no GPE mention
    """
    results = []
    docs = tqdm(nlp.pipe(summaries, batch_size=batch_size), total=len(summaries), desc="Processing Summaries")
    for doc in docs:
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        # Geocode each distinct name once per summary: repeated mentions would otherwise each
        # trigger their own request. dict.fromkeys keeps the order of first appearance.
        unique_entities = list(dict.fromkeys(GPE_entities))
        is_in_us_by_name = dict(geocode_locations_parallel(unique_entities)) if unique_entities else {}
        # Counting is still done over all mentions, so the frequencies are unchanged.
        GPE_US_frequencies = Counter(
            name for name in GPE_entities if is_in_us_by_name.get(name, False)
        )
        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Work on a fresh copy of the partially-US plot summaries.
df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

start_time = time.time()
print("Processing starting.")


# Run the NER + geocoding pass and attach the three result columns.
# The result frame is built on the dataframe's own index: when a DataFrame is assigned to
# columns, pandas aligns rows on index labels, so a default 0..n-1 index would misplace or
# blank out values whenever the dataframe does not carry a default RangeIndex (e.g. after filtering).
gpe_columns = ['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']
df_plots_us_partially_movies_GPE[gpe_columns] = pd.DataFrame(
    extract_GPE_and_compute_US_frequencies_batch(
        df_plots_us_partially_movies_GPE['Summary'].tolist()
    ),
    columns=gpe_columns,
    index=df_plots_us_partially_movies_GPE.index,
)

# Drop summaries and save results
df_us_partially_movies_NLP_GPE = df_plots_us_partially_movies_GPE.drop(columns=['Summary'])
df_us_partially_movies_NLP_GPE.to_csv("data/NLP_datasets/us_partially_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")


# Test with Nominatim on the problematic plots

In [None]:
# Pick the spaCy pipeline from the hardware (model overview: https://spacy.io/models/en/):
# the small model on CPU, the transformer-based model when CUDA is usable.
if not torch.cuda.is_available():
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")
    print("Using Spacy English pipeline optimized for CPU.")
else:
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # switch spaCy to the GPU before loading the model
    nlp = spacy.load("en_core_web_trf")
    print("Using Spacy English transformer pipeline.")


# Start from an empty cache mapping a location name to its geocoding result.
geocode_cache = dict()

def geocode_location(location):
    """
    Geocode one location name with Nominatim and tell whether the match lies in the US.
    params: location: the place name to look up
    returns: a tuple (location, check) where check is True only if a match was found and its
             string representation contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    match = geolocator.geocode(location)
    if not match:
        # No geocoding result: the location cannot be attributed to the US.
        return location, False
    return location, 'United States' in str(match)

def geocode_location_cached(location):
    """
    Cached wrapper around geocode_location, so the geocoding API is not queried several times
    for the same location name. Rate-limited requests are retried up to three times.
    params: location: the place name to look up
    returns: the (location, is_in_us) tuple from geocode_location. If every attempt is
             rate-limited, (location, False) is returned and NOT stored in the cache, so a
             later call can try again.
    """
    # NOTE(review): this function is called from worker threads; two threads may geocode the
    # same new location before either of them has stored its result.
    if location in geocode_cache:
        return geocode_cache[location]

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            time.sleep(1)  # Respect API rate limits
            geocode_cache[location] = geocode_location(location)
            return geocode_cache[location]
        except GeocoderRateLimited:
            if attempt < max_attempts:
                print(f"Rate-limited for location: {location}. Retrying...")
                time.sleep(10)  # Wait before retrying

    # All attempts were rate-limited. Reading geocode_cache[location] here would raise a
    # KeyError, so fall back to "not in the US" and say so explicitly.
    print(f"Rate-limited for location: {location}. Giving up after {max_attempts} attempts; treating it as not in the US.")
    return location, False

def geocode_locations_parallel(locations):
    """
    Geocode several locations concurrently with a thread pool, going through the cache.
    params: locations: an iterable of place names
    returns: the list of (location, is_in_us) tuples produced by geocode_location_cached,
             in input order
    """
    with ThreadPoolExecutor() as pool:
        return [outcome for outcome in pool.map(geocode_location_cached, locations)]

def extract_GPE_and_compute_US_frequencies_batch(summaries, batch_size=500):
    """
    Batch-process plot summaries with the global spaCy pipeline `nlp`, extract the GPE
    (geopolitical) entities of each summary and use geocoding to decide which are in the US.
    params: summaries: a sized sequence of plot-summary strings (its length feeds the progress bar)
            batch_size: number of texts handed to nlp.pipe per batch (default 500, the value that
                        used to be hard-coded)
    returns: a list with one tuple per summary, in input order:
             (GPE_entities, GPE_US_frequencies, Percentage_US_culture), where
             GPE_entities is the list of GPE mentions (repetitions included),
             GPE_US_frequencies is a Counter of the mentions geocoded to the US, and
             Percentage_US_culture is the share (between 0 and 1) of GPE mentions that are in
             the US, or 0 when the summary has no GPE mention
    """
    results = []
    docs = tqdm(nlp.pipe(summaries, batch_size=batch_size), total=len(summaries), desc="Processing Summaries")
    for doc in docs:
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        # Geocode each distinct name once per summary: duplicates handed to the parallel
        # geocoder can all miss the cache at the same time and each send a request.
        # dict.fromkeys keeps the order of first appearance.
        unique_entities = list(dict.fromkeys(GPE_entities))
        is_in_us_by_name = dict(geocode_locations_parallel(unique_entities)) if unique_entities else {}
        # Counting is still done over all mentions, so the frequencies are unchanged.
        GPE_US_frequencies = Counter(
            name for name in GPE_entities if is_in_us_by_name.get(name, False)
        )
        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Start the wall-clock timer used for the execution-time report below.
start_time = time.time()
print("Processing starting.")


# Restrict the run to a handful of titles from the US-only movies.
selected_titles_2 = ["The Hunger Games", "Dark Water", "Meet John Doe","Exodus"]
filtered_df_2 = plot_summaries_us_movies[plot_summaries_us_movies['title'].isin(selected_titles_2)].copy()

# Run NER + geocoding on the selected summaries.
extracted_results = extract_GPE_and_compute_US_frequencies_batch(filtered_df_2['Summary'].tolist())

# Attach one column per element of the result tuples (the values are plain lists, one entry
# per row), then drop the summaries before saving.
filtered_df_2 = filtered_df_2.assign(
    GPE_entities=[row[0] for row in extracted_results],
    GPE_US_frequencies=[row[1] for row in extracted_results],
    Percentage_american_culture=[row[2] for row in extracted_results],
).drop(columns=['Summary'])
filtered_df_2.to_csv("data/NLP_datasets/us_only_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

filtered_df_2.head()


In [None]:

# Pick the spaCy pipeline from the hardware (model overview: https://spacy.io/models/en/):
# the small model on CPU, the transformer-based model when CUDA is usable.
if not torch.cuda.is_available():
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")
    print("Using Spacy English pipeline optimized for CPU.")
else:
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # switch spaCy to the GPU before loading the model
    nlp = spacy.load("en_core_web_trf")
    print("Using Spacy English transformer pipeline.")


# Start from an empty cache mapping a location name to its geocoding result.
geocode_cache = dict()

def geocode_location(location):
    """
    Geocode one location name with Nominatim and tell whether the match lies in the US.
    params: location: the place name to look up
    returns: a tuple (location, check) where check is True only if a match was found and its
             string representation contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    match = geolocator.geocode(location)
    if not match:
        # No geocoding result: the location cannot be attributed to the US.
        return location, False
    return location, 'United States' in str(match)

def geocode_location_cached(location):
    """
    Cached wrapper around geocode_location, so the geocoding API is not queried several times
    for the same location name.
    params: location: the place name to look up
    returns: the (location, is_in_us) tuple from geocode_location, served from geocode_cache
             when the location has been seen before
    """
    if location in geocode_cache:
        return geocode_cache[location]
    outcome = geocode_location(location)
    geocode_cache[location] = outcome
    return outcome

def geocode_locations_parallel(locations):
    """
    Geocode several locations concurrently with a thread pool, going through the cache.
    params: locations: an iterable of place names
    returns: the list of (location, is_in_us) tuples produced by geocode_location_cached,
             in input order
    """
    with ThreadPoolExecutor() as pool:
        return [outcome for outcome in pool.map(geocode_location_cached, locations)]

def extract_GPE_and_compute_US_frequencies_batch(summaries, batch_size=500):
    """
    Batch-process plot summaries with the global spaCy pipeline `nlp`, extract the GPE
    (geopolitical) entities of each summary and use geocoding to decide which are in the US.
    params: summaries: a sized sequence of plot-summary strings (its length feeds the progress bar)
            batch_size: number of texts handed to nlp.pipe per batch (default 500, the value that
                        used to be hard-coded)
    returns: a list with one tuple per summary, in input order:
             (GPE_entities, GPE_US_frequencies, Percentage_US_culture), where
             GPE_entities is the list of GPE mentions (repetitions included),
             GPE_US_frequencies is a Counter of the mentions geocoded to the US, and
             Percentage_US_culture is the share (between 0 and 1) of GPE mentions that are in
             the US, or 0 when the summary has no GPE mention
    """
    results = []
    docs = tqdm(nlp.pipe(summaries, batch_size=batch_size), total=len(summaries), desc="Processing Summaries")
    for doc in docs:
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        # Geocode each distinct name once per summary: duplicates handed to the parallel
        # geocoder can all miss the cache at the same time and each send a request.
        # dict.fromkeys keeps the order of first appearance.
        unique_entities = list(dict.fromkeys(GPE_entities))
        is_in_us_by_name = dict(geocode_locations_parallel(unique_entities)) if unique_entities else {}
        # Counting is still done over all mentions, so the frequencies are unchanged.
        GPE_US_frequencies = Counter(
            name for name in GPE_entities if is_in_us_by_name.get(name, False)
        )
        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Start the wall-clock timer used for the execution-time report below.
start_time = time.time()
print("Processing starting.")


# Restrict the run to a handful of titles from the partially-US movies.
selected_titles = ["Come Back, Africa", "A Cry in the Dark", "End Game","Eastern Promises",  "Sophie's Choice"]
filtered_df = plot_summaries_us_partially_movies[plot_summaries_us_partially_movies['title'].isin(selected_titles)].copy()

# Run NER + geocoding on the selected summaries.
extracted_results = extract_GPE_and_compute_US_frequencies_batch(filtered_df['Summary'].tolist())

# Attach one column per element of the result tuples (the values are plain lists, one entry
# per row), then drop the summaries before saving.
filtered_df = filtered_df.assign(
    GPE_entities=[row[0] for row in extracted_results],
    GPE_US_frequencies=[row[1] for row in extracted_results],
    Percentage_american_culture=[row[2] for row in extracted_results],
).drop(columns=['Summary'])
filtered_df.to_csv("data/NLP_datasets/us_partially_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

print(filtered_df.columns)  
filtered_df.head()


# Test with the small spaCy model `en_core_web_sm`

In [None]:
# NOTE: this whole cell is commented out; nothing in it runs. It is an earlier, non-batched
# version of the GPE extraction that processes one summary at a time with the small spaCy model.
# NOTE(review): if it is ever re-enabled, the `.apply(extract_GPE_and_compute_US_frequencies)`
# call below passes only the summary, while the function also requires `set_us_location`.

# nlp = spacy.load('en_core_web_sm')

# # Creating deep copy of the plots_summaries dataframes created above
# df_plots_all_movies_GPE = plot_summaries_all_movies.copy()
# df_plots_us_movies_GPE = plot_summaries_us_movies.copy()
# df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy() # runs in 5'30"
# df_plots_RoW_GPE = plot_summaries_RoW_movies.copy()

# # since GPE is part of the NER spacy pipeline, no need to tokenize manually the text before implementing the search for GPE quantities
# # def extract_gpe(summary):
# #     """ Function that extract the GPE entities of a plot summary
# #     params: a text called 'summary'
# #     returns: all the GPE entities detected in the summary
# #     """
# #     doc = nlp(summary)
# #     return [ent.text for ent in doc.ents if ent.label_ == 'GPE'] # outputs all detected geopolitical entities, including repeated mentions of the same geographical entities

# # def compute_US_GPE_frequencies(summary, set_us_location):
# #     """ Function that extract the US GPE frequencies of a plot summary
# #     params: a text called 'summary' and a set of US locations containing the cities, counties and the states
# #     returns: all the US GPE frequencies detected in the summary
# #     """

# def extract_GPE_and_compute_US_frequencies(summary, set_us_location):
#     """ Function that extract the GPE entities of a plot summary
#     params: a text called 'summary' and a set of US locations containing the cities, counties and the states
#     returns: all the GPE entities and all the US GPE frequencies detected in the summary
#     """
#     doc = nlp(summary)
#     GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
#     GPE_frequencies_all = Counter(GPE_entities)
#     GPE_US_frequencies = Counter(entity for entity in GPE_entities if entity in set_us_location)
#     Percentage_US_culture = sum(GPE_US_frequencies.values()) / sum(GPE_frequencies_all.values()) if sum(GPE_frequencies_all.values()) > 0 else 0
#     return GPE_entities, GPE_US_frequencies, Percentage_US_culture

# # Unpacking the results of the function
# df_plots_us_partially_movies_GPE['GPE_entities'], df_plots_us_partially_movies_GPE['GPE_US_frequencies'], df_plots_us_partially_movies_GPE['Percentage_american_culture'] = df_plots_us_partially_movies_GPE['Summary'].apply(extract_GPE_and_compute_US_frequencies)


# # df_plots_us_partially_movies_GPE['GPE_frequencies'] = df_plots_us_partially_movies_GPE['GPE_entities'].apply(
# #     lambda entities: Counter(entities) # use of library Counter for efficient processing
# # ) # TO BE REMOVED LATER



# # Filter GPE entities to retain only those matching US locations
# # df_plots_us_partially_movies_GPE['GPE_US_frequencies'] = df_plots_us_partially_movies_GPE['GPE_entities'].apply(
# #     lambda entities: Counter(entity for entity in entities if entity in us_locations_set)
# # )

# # "Percentage" of american culture
# # df_plots_us_partially_movies_GPE['Percentage_american_culture'] = df_plots_us_partially_movies_GPE.apply(
# #     lambda row: sum(row['GPE_US_frequencies'].values()) / sum(row['GPE_frequencies'].values())
# #     if sum(row['GPE_frequencies'].values()) > 0 else 0,
# #     axis=1
# # )

# df_plots_us_partially_movies_GPE.head()


# mean_amercian_culture_us_partially_movies_GPE = df_plots_us_partially_movies_GPE['Percentage_american_culture'].mean()

# print(f"The mean percenatge of amercian culture of partially US-produced movies based on location only is {mean_amercian_culture_us_partially_movies_GPE*100:.2f} %.")