In [None]:
# Work on a copy of the plot-summary dataframe created above so the original stays untouched
df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

# Choose the spaCy pipeline according to the hardware: check whether PyTorch sees a CUDA-enabled GPU on this machine
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    spacy.require_gpu() # enable GPU use for spaCy, taken from: https://stackoverflow.com/questions/75355264/how-to-enable-cuda-gpu-acceleration-for-spacy-on-windows
    nlp = spacy.load("en_core_web_trf") # transformer-based English pipeline (more complex model)
else:
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm") # small English pipeline, used as the CPU fallback

# 
def extract_GPE_and_compute_US_frequencies_batch(summaries, us_locations, nlp_model=None, batch_size=500):
    """
    Run the plot summaries through a spaCy pipeline in batches, collect the GPE (geopolitical) entities
    of each summary and measure how many of them are US locations.

    params:
        summaries: sequence of plot-summary strings
        us_locations: container of US place names (cities, counties, states); membership is an exact,
            case-sensitive string test (`in`)
        nlp_model: optional pipeline object exposing `pipe(texts, batch_size=...)`; when None the
            notebook-level `nlp` is used (previous behaviour)
        batch_size: number of texts per batch handed to `pipe` (default 500, as before)
    returns: a list with one tuple per summary:
        (list of GPE entity texts including repeated mentions,
         Counter of the entity texts found in `us_locations`,
         share of GPE mentions that are US locations as a fraction in [0, 1]; 0 when no GPE was found)
    """
    # Resolve the pipeline lazily so the global `nlp` is only required when no model is passed in
    pipeline = nlp if nlp_model is None else nlp_model

    results = []
    for doc in pipeline.pipe(summaries, batch_size=batch_size):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_US_frequencies = Counter(ent for ent in GPE_entities if ent in us_locations)

        # The number of mentions equals the length of the entity list, no need to re-sum a Counter
        total_mentions = len(GPE_entities)
        us_mentions = sum(GPE_US_frequencies.values())
        Percentage_US_culture = us_mentions / total_mentions if total_mentions > 0 else 0

        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Apply batch processing (on the GPU when the model above was loaded with CUDA available)
# NOTE: `us_locations_set` is only built in the cell that reads us_cities_states_counties.csv further down,
# so that cell has to be run before this one.
df_plots_us_partially_movies_GPE[['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']] = \
    pd.DataFrame(
        extract_GPE_and_compute_US_frequencies_batch(
            # .tolist() converts the pandas Series into a Python list, as nlp.pipe expects a sequence of strings
            # according to the documentation https://spacy.io/api/language#pipe
            df_plots_us_partially_movies_GPE['Summary'].tolist(), us_locations_set
        ),
        # Assigning a DataFrame to columns aligns rows on index labels; the results come back in row order,
        # so they must carry the target's index. Otherwise a filtered (non 0..n-1) index yields NaN / shifted rows.
        index=df_plots_us_partially_movies_GPE.index,
    )

df_us_partially_movies_NLP_GPE = df_plots_us_partially_movies_GPE.drop(columns=['Summary']) # keeping all the info of the original df, the new GPE columns and dropping the summaries for more clarity and later use

df_us_partially_movies_NLP_GPE.to_csv("data/us_partially_movies_NLP_GPE.csv", sep=',', encoding='utf-8', index=False, header = True) # hard-code the encoding to avoid any problems, as seen in the lecture

print("NLP processing done.")

# Old GPE tests

List of US states, counties and cities (from GitHub)

In [None]:
# List of all US states, counties and cities found on: https://github.com/grammakov/USA-cities-and-states/tree/master
df_us_states_counties_cities = pd.read_csv("data/us_cities_states_counties.csv", sep = "|")
print(f"The shape of the US states, counties and cities dataset is {df_us_states_counties_cities.shape}.")

# Exclude the entries of the GitHub .csv that are not official US states (territories, armed-forces codes, ...)
exclude_states = [
    "US Armed Forces Pacific", "American Samoa", "Guam", "Palau",
    "Federated States of Micronesia", "Northern Mariana Islands",
    "Marshall Islands", "US Armed Forces Europe", "Puerto Rico", "Virgin Islands"
]

indices_to_drop = df_us_states_counties_cities[
    df_us_states_counties_cities['State full'].isin(exclude_states)
].index

# Rebind to the filtered frame instead of mutating in place (no benefit from inplace=True)
df_us_states_counties_cities = df_us_states_counties_cities.drop(index=indices_to_drop)
print(f"The shape after dropping of the US states, counties and cities dataset is {df_us_states_counties_cities.shape}.")

# Create lists of unique city, county and state names.
# Missing values are dropped so that NaN never ends up in the lookup set.
list_US_states = list(df_us_states_counties_cities['State full'].dropna().unique())
# Title-case (not capitalize) the county names: capitalize() lower-cases everything after the first letter,
# turning e.g. "LOS ANGELES" into "Los angeles", which can never equal an entity text such as "Los Angeles".
# NOTE(review): presumably the counties are stored in upper case in the CSV; confirm against the file.
list_US_counties = list(df_us_states_counties_cities['County'].dropna().str.title().unique())
list_US_cities = list(df_us_states_counties_cities['City'].dropna().unique())

# Print the sizes and display only the first 10 locations of each list to avoid overly long print statements
print(f"The list of the first ten US states is \n {list_US_states[:10]} \n and contains {len(list_US_states)} states.")
print(f"The list of the first ten US counties is \n {list_US_counties[:10]} \n and contains {len(list_US_counties)} counties.")
print(f"The list of the first ten US cities is \n {list_US_cities[:10]} \n and contains {len(list_US_cities)} cities.")

# Combine all US states, counties, and cities into a single set for faster lookup
us_locations_set = set(list_US_states + list_US_counties + list_US_cities)

df_us_states_counties_cities.head()

In [None]:
# Work on a copy of the US-only plot summaries so the original dataframe stays untouched
df_plots_us_only_movies_GPE = plot_summaries_us_movies.copy()

# Start the wall-clock timer; the elapsed time is printed at the end of this cell
start_time = time.time()
print("Processing starting.")

# Choose the spaCy pipeline according to the hardware: check whether PyTorch sees a CUDA-enabled GPU on this machine
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    spacy.require_gpu() # enable GPU use for spaCy, taken from: https://stackoverflow.com/questions/75355264/how-to-enable-cuda-gpu-acceleration-for-spacy-on-windows
    nlp = spacy.load("en_core_web_trf") # transformer-based English pipeline (more complex model)
else:
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm") # small English pipeline, used as the CPU fallback

# 
def extract_GPE_and_compute_US_frequencies_batch(summaries, us_locations, nlp_model=None, batch_size=200):
    """
    Batch-process plot summaries with a spaCy pipeline, extract the GPE (geopolitical) entities of each
    summary and compute how many of those mentions are US locations.

    params:
        summaries: sequence of plot-summary strings
        us_locations: container of US place names (cities, counties, states), tested with an exact,
            case-sensitive `in` lookup
        nlp_model: optional pipeline exposing `pipe(texts, batch_size=...)`; falls back to the
            notebook-level `nlp` when None (previous behaviour)
        batch_size: number of texts per batch handed to `pipe` (default 200, as before)
    returns: one tuple per summary, in input order:
        (GPE entity texts with repeats, Counter of the US entity texts,
         fraction of GPE mentions that are US locations; 0 when the summary has no GPE)
    """
    # Only touch the global `nlp` when the caller did not supply a pipeline
    pipeline = nlp if nlp_model is None else nlp_model

    results = []
    for doc in pipeline.pipe(summaries, batch_size=batch_size):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_US_frequencies = Counter(ent for ent in GPE_entities if ent in us_locations)

        # Total mentions == length of the entity list; avoids summing the same Counter repeatedly
        total_mentions = len(GPE_entities)
        us_mentions = sum(GPE_US_frequencies.values())
        Percentage_US_culture = us_mentions / total_mentions if total_mentions > 0 else 0

        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Apply batch processing (on the GPU when the model above was loaded with CUDA available)
df_plots_us_only_movies_GPE[['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']] = \
    pd.DataFrame(
        extract_GPE_and_compute_US_frequencies_batch(
            # .tolist() converts the pandas Series into a Python list, as nlp.pipe expects a sequence of strings
            # according to the documentation https://spacy.io/api/language#pipe
            df_plots_us_only_movies_GPE['Summary'].tolist(), us_locations_set
        ),
        # Column assignment from a DataFrame aligns on index labels; give the results the target's index so
        # each result lands on the row it was computed from even when that index is not 0..n-1.
        index=df_plots_us_only_movies_GPE.index,
    )

df_us_only_movies_NLP_GPE = df_plots_us_only_movies_GPE.drop(columns=['Summary']) # keeping all the info of the original df, the new GPE columns and dropping the summaries for more clarity and later use

df_us_only_movies_NLP_GPE.to_csv("data/us_only_movies_NLP_GPE.csv", sep=',', encoding='utf-8', index=False, header = True) # hard-code the encoding to avoid any problems, as seen in the lecture

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

In [None]:
# Choose the spaCy pipeline according to the hardware (rebinds the notebook-level `nlp` used by the functions below)
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # Enable GPU use for spaCy
    nlp = spacy.load("en_core_web_trf")  # Transformer-based English pipeline (more complex model)
else:
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")  # Small English pipeline, used as the CPU fallback

# Geocode a single location and check if it's in the US
def geocode_location(location):
    """
    Look `location` up with the Nominatim geocoder and tell whether the match mentions the United States.

    params: location, the place name to geocode
    returns: tuple (location, is_in_us); is_in_us is True only when the geocoder returned a match whose
             string form contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    locate = geolocator.geocode(location)
    if locate and 'United States' in str(locate):
        return location, True
    return location, False

# Geocode multiple locations in parallel
def geocode_locations_parallel(locations):
    """
    Geocode several locations concurrently with a thread pool, showing a progress bar.

    params: locations, a sized sequence of place names
    returns: list of the `geocode_location` results, in the same order as `locations`
    """
    with ThreadPoolExecutor() as executor:
        lookups = executor.map(geocode_location, locations)
        progress = tqdm(lookups, total=len(locations), desc="Geocoding Locations")
        results = list(progress)
    return results

# Extract GPE entities and compute US frequencies with geocoding
def extract_GPE_and_compute_US_frequencies_batch(summaries):
    """
    Extract the GPE (geopolitical) entities of each summary with the notebook-level `nlp` pipeline and
    use geocoding (`geocode_locations_parallel`) to decide which of them are US locations.

    params: summaries, a sized sequence of plot-summary strings
    returns: one tuple per summary:
        (GPE entity texts with repeats, Counter of the mentions geocoded as US,
         fraction of GPE mentions geocoded as US; 0 when the summary has no GPE)
    """
    results = []
    for doc in tqdm(nlp.pipe(summaries, batch_size=500), total=len(summaries), desc="Processing Summaries"):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_frequencies_all = Counter(GPE_entities)

        # Geocode every distinct name once instead of once per mention: a place mentioned ten times
        # used to cost ten identical API requests.
        unique_locations = list(GPE_frequencies_all)
        is_us_location = dict(geocode_locations_parallel(unique_locations))

        # Re-expand to mention counts so the result is the same Counter as counting each US mention
        GPE_US_frequencies = Counter({
            location: count
            for location, count in GPE_frequencies_all.items()
            if is_us_location.get(location)
        })

        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions
            if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# DataFrame setup and processing: work on a fresh copy of the partially-US plot summaries
df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

start_time = time.time()
print("Processing starting.")


# Apply batch processing (on the GPU when the model above was loaded with CUDA available)
df_plots_us_partially_movies_GPE[['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']] = pd.DataFrame(
    extract_GPE_and_compute_US_frequencies_batch(
        df_plots_us_partially_movies_GPE['Summary'].tolist()
    ),
    # Column assignment from a DataFrame aligns on index labels; reuse the target's index so each result
    # lands on the row it was computed from even when that index is not 0..n-1.
    index=df_plots_us_partially_movies_GPE.index,
)

# Drop summaries and save results
df_us_partially_movies_NLP_GPE = df_plots_us_partially_movies_GPE.drop(columns=['Summary'])
df_us_partially_movies_NLP_GPE.to_csv("data/NLP_datasets/us_partially_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")


# Test with Nominatim on the problematic plot summaries

In [None]:
# Choose the spaCy pipeline according to the hardware (rebinds the notebook-level `nlp` used by the functions below)
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # Enable GPU use for spaCy
    nlp = spacy.load("en_core_web_trf")  # Transformer-based English pipeline (cf. https://spacy.io/models/en/)
    print("Using Spacy English transformer pipeline.")
else:
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")  # Small English pipeline, used as the CPU fallback
    print("Using Spacy English pipeline optimized for CPU.")


# Module-level cache filled by geocode_location_cached: place name -> result of geocode_location.
# Re-running this cell resets it to an empty dict.
geocode_cache = {}

# Geocode a single location and check if it's in the US
def geocode_location(location):
    """
    Geocode one place name with Nominatim and flag whether the match is in the United States.

    params: location, the place name to geocode
    returns: tuple (location, check) where check is True only if a match was found and its string
             form contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    locate = geolocator.geocode(location)
    found_in_us = bool(locate) and 'United States' in str(locate)
    return location, found_in_us

# Cached geocoding function tio avoid looking sevarl time for the same location of the geopy API
def geocode_location_cached(location):
    """
    Geocode `location` through `geocode_location`, memoising the answer in the module-level `geocode_cache`
    so the same place name is not sent to the API several times.

    Each attempt waits 1 s before calling the API; when GeocoderRateLimited is raised the lookup is retried
    after a 10 s pause, for at most 3 attempts.

    params: location, the place name to look up
    returns: the (location, is_in_us) tuple produced by `geocode_location`, or (location, False) when
             every attempt was rate-limited (that fallback is not cached, so a later call can retry)
    """
    if location in geocode_cache:
        return geocode_cache[location]

    max_attempts = 3  # Retry up to 3 times
    for attempt in range(max_attempts):
        try:
            time.sleep(1)  # Respect API rate limits
            geocode_cache[location] = geocode_location(location)
            return geocode_cache[location]
        except GeocoderRateLimited:
            if attempt == max_attempts - 1:
                break  # no point in announcing or waiting for a retry that will not happen
            print(f"Rate-limited for location: {location}. Retrying...")
            time.sleep(10)  # Wait before retrying

    # Previously the function fell through to `return geocode_cache[location]` here and raised KeyError,
    # because nothing had been stored for this location.
    print(f"Rate-limited for location: {location}. Giving up after {max_attempts} attempts.")
    return location, False

# Geocode multiple locations in parallel
def geocode_locations_parallel(locations):
    """
    Run the cached geocoding lookup for several locations concurrently in a thread pool.

    params: locations, an iterable of place names
    returns: list of the `geocode_location_cached` results, in the same order as `locations`
    """
    with ThreadPoolExecutor() as pool:
        return list(pool.map(geocode_location_cached, locations))

# Extract GPE entities and compute US frequencies with geocoding
def extract_GPE_and_compute_US_frequencies_batch(summaries):
    """
    Extract the GPE (geopolitical) entities of each summary with the notebook-level `nlp` pipeline and
    use the cached geocoder (`geocode_locations_parallel`) to decide which of them are US locations.

    params: summaries, a sized sequence of plot-summary strings
    returns: one tuple per summary:
        (GPE entity texts with repeats, Counter of the mentions geocoded as US,
         fraction of GPE mentions geocoded as US; 0 when the summary has no GPE)
    """
    results = []
    for doc in tqdm(nlp.pipe(summaries, batch_size=500), total=len(summaries), desc="Processing Summaries"):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_frequencies_all = Counter(GPE_entities)

        # Submit each distinct name once. Repeated mentions handed to the thread pool together can all miss
        # the cache before the first lookup has stored its answer, so the cache alone does not prevent
        # duplicate requests within one summary.
        unique_locations = list(GPE_frequencies_all)
        is_us_location = dict(geocode_locations_parallel(unique_locations))

        # Re-expand to mention counts so the result is the same Counter as counting each US mention
        GPE_US_frequencies = Counter({
            location: count
            for location, count in GPE_frequencies_all.items()
            if is_us_location.get(location)
        })

        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions
            if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Test run on a handful of US-only movies whose plots were problematic
# (the full-dataframe copy below is intentionally left disabled for this test)
# df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

# Start the wall-clock timer; the elapsed time is printed at the end of this cell
start_time = time.time()
print("Processing starting.")


# Keep only the rows whose 'title' exactly matches one of the selected titles (copy so the source frame is not modified)
selected_titles_2 = ["The Hunger Games", "Dark Water", "Meet John Doe","Exodus"] # titles under investigation
filtered_df_2 = plot_summaries_us_movies[plot_summaries_us_movies['title'].isin(selected_titles_2)].copy()

# Earlier attempt, kept for reference: assigning a pd.DataFrame of results to the three columns at once
# filtered_df[['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']] = pd.DataFrame(
#     extract_GPE_and_compute_US_frequencies_batch(
#         filtered_df['Summary'].tolist()
#     ))

# Assign the results column by column from plain lists: list assignment is positional, so the
# non-contiguous index left by the filtering above does not matter
extracted_results = extract_GPE_and_compute_US_frequencies_batch(filtered_df_2['Summary'].tolist())
filtered_df_2['GPE_entities'] = [res[0] for res in extracted_results]
filtered_df_2['GPE_US_frequencies'] = [res[1] for res in extracted_results]
filtered_df_2['Percentage_american_culture'] = [res[2] for res in extracted_results]


# Drop summaries and save results
filtered_df_2 = filtered_df_2.drop(columns=['Summary'])
# filtered_df.drop(columns=['Summary'], inplace=True)
filtered_df_2.to_csv("data/NLP_datasets/us_only_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

# Display the first rows of the test result
filtered_df_2.head()


In [None]:

# Choose the spaCy pipeline according to the hardware (rebinds the notebook-level `nlp` used by the functions below)
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    spacy.require_gpu()  # Enable GPU use for spaCy
    nlp = spacy.load("en_core_web_trf")  # Transformer-based English pipeline (cf. https://spacy.io/models/en/)
    print("Using Spacy English transformer pipeline.")
else:
    print("CUDA is NOT available. Using CPU.")
    nlp = spacy.load("en_core_web_sm")  # Small English pipeline, used as the CPU fallback
    print("Using Spacy English pipeline optimized for CPU.")


# Module-level cache filled by geocode_location_cached: place name -> result of geocode_location.
# Re-running this cell resets it to an empty dict.
geocode_cache = {}

# Geocode a single location and check if it's in the US
def geocode_location(location):
    """
    Ask Nominatim for `location` and report whether the returned match lies in the United States.

    params: location, the place name to geocode
    returns: tuple (location, check); check is False when nothing was found or when the string form
             of the match does not contain 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    locate = geolocator.geocode(location)
    check = False
    if locate:
        check = 'United States' in str(locate)
    return location, check

# Cached geocoding function tio avoid looking sevarl time for the same location of the geopy API
def geocode_location_cached(location):
    """
    Memoised wrapper around `geocode_location`: the first lookup of a place name is stored in the
    module-level `geocode_cache`, later lookups of the same name are answered from it.

    params: location, the place name to look up
    returns: the (location, is_in_us) tuple produced by `geocode_location`
    """
    cached = geocode_cache.get(location)
    if cached is None:
        # Cached values are always tuples, so None can only mean "not looked up yet"
        cached = geocode_location(location)
        geocode_cache[location] = cached
    return cached

# Geocode multiple locations in parallel
def geocode_locations_parallel(locations):
    """
    Geocode several locations concurrently (thread pool) through the cached lookup.

    params: locations, an iterable of place names
    returns: list of the `geocode_location_cached` results, in the same order as `locations`
    """
    with ThreadPoolExecutor() as executor:
        results = [outcome for outcome in executor.map(geocode_location_cached, locations)]
    return results

# Extract GPE entities and compute US frequencies with geocoding
def extract_GPE_and_compute_US_frequencies_batch(summaries):
    """
    Extract the GPE (geopolitical) entities of each summary with the notebook-level `nlp` pipeline and
    use the cached geocoder (`geocode_locations_parallel`) to decide which of them are US locations.

    params: summaries, a sized sequence of plot-summary strings
    returns: one tuple per summary:
        (GPE entity texts with repeats, Counter of the mentions geocoded as US,
         fraction of GPE mentions geocoded as US; 0 when the summary has no GPE)
    """
    results = []
    for doc in tqdm(nlp.pipe(summaries, batch_size=500), total=len(summaries), desc="Processing Summaries"):
        GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
        GPE_frequencies_all = Counter(GPE_entities)

        # Look each distinct name up once: duplicates submitted to the thread pool at the same time can
        # all pass the "not in cache" check before the first answer is stored, causing repeated requests.
        unique_locations = list(GPE_frequencies_all)
        is_us_location = dict(geocode_locations_parallel(unique_locations))

        # Re-expand to mention counts so the result is the same Counter as counting each US mention
        GPE_US_frequencies = Counter({
            location: count
            for location, count in GPE_frequencies_all.items()
            if is_us_location.get(location)
        })

        total_mentions = len(GPE_entities)
        Percentage_US_culture = (
            sum(GPE_US_frequencies.values()) / total_mentions
            if total_mentions > 0 else 0
        )
        results.append((GPE_entities, GPE_US_frequencies, Percentage_US_culture))
    return results

# Test run on a handful of partially-US movies whose plots were problematic
# (the full-dataframe copy below is intentionally left disabled for this test)
# df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy()

# Start the wall-clock timer; the elapsed time is printed at the end of this cell
start_time = time.time()
print("Processing starting.")


# Keep only the rows whose 'title' exactly matches one of the selected titles (copy so the source frame is not modified)
selected_titles = ["Come Back, Africa", "A Cry in the Dark", "End Game","Eastern Promises",  "Sophie's Choice"] # titles under investigation
filtered_df = plot_summaries_us_partially_movies[plot_summaries_us_partially_movies['title'].isin(selected_titles)].copy()

# Earlier attempt, kept for reference: assigning a pd.DataFrame of results to the three columns at once
# filtered_df[['GPE_entities', 'GPE_US_frequencies', 'Percentage_american_culture']] = pd.DataFrame(
#     extract_GPE_and_compute_US_frequencies_batch(
#         filtered_df['Summary'].tolist()
#     ))

# Assign the results column by column from plain lists: list assignment is positional, so the
# non-contiguous index left by the filtering above does not matter
extracted_results = extract_GPE_and_compute_US_frequencies_batch(filtered_df['Summary'].tolist())
filtered_df['GPE_entities'] = [res[0] for res in extracted_results]
filtered_df['GPE_US_frequencies'] = [res[1] for res in extracted_results]
filtered_df['Percentage_american_culture'] = [res[2] for res in extracted_results]


# Drop summaries and save results
filtered_df = filtered_df.drop(columns=['Summary'])
# filtered_df.drop(columns=['Summary'], inplace=True)
filtered_df.to_csv("data/NLP_datasets/us_partially_movies_NLP_GPE_test_problems.csv", sep=',', encoding='utf-8', index=False, header=True)


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

# Show the resulting column names, then display the first rows
print(filtered_df.columns)  
filtered_df.head()


# Test with the small spaCy model 'en_core_web_sm'

In [None]:
# nlp = spacy.load('en_core_web_sm')

# # Creating deep copy of the plots_summaries dataframes created above
# df_plots_all_movies_GPE = plot_summaries_all_movies.copy()
# df_plots_us_movies_GPE = plot_summaries_us_movies.copy()
# df_plots_us_partially_movies_GPE = plot_summaries_us_partially_movies.copy() # run en 5'30"
# df_plots_RoW_GPE = plot_summaries_RoW_movies.copy()

# # since GPE is part of the NER spacy pipeline, no need to tokenize manually the text before implementing the search for GPE quantities
# # def extract_gpe(summary):
# #     """ Function that extract the GPE entities of a plot summary
# #     params: a text called 'summary'
# #     returns: all the GPE entities detected in the summary
# #     """
# #     doc = nlp(summary)
# #     return [ent.text for ent in doc.ents if ent.label_ == 'GPE'] # outputs all detected geopolitical entities, including repeated mentions of the same geographical entitities

# # def compute_US_GPE_frequencies(summary, set_us_location):
# #     """ Function that extract the US GPE frequencies of a plot summary
# #     params: a text called 'summary' and a set of US locations containing the cities, counties and the states
# #     returns: all the US GPE frequencies detected in the summary
# #     """

# def extract_GPE_and_compute_US_frequencies(summary, set_us_location):
#     """ Function that extract the GPE entities of a plot summary
#     params: a text called 'summary' and a set of US locations containing the cities, counties and the states
#     returns: all the GPE entities and all the US GPE frequencies detected in the summary
#     """
#     doc = nlp(summary)
#     GPE_entities = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
#     GPE_frequencies_all = Counter(GPE_entities)
#     GPE_US_frequencies = Counter(entity for entity in GPE_entities if entity in set_us_location)
#     Percentage_US_culture = sum(GPE_US_frequencies.values()) / sum(GPE_frequencies_all.values()) if sum(GPE_frequencies_all.values()) > 0 else 0
#     return GPE_entities, GPE_US_frequencies, Percentage_US_culture

# # Unpacking the results of the function
# df_plots_us_partially_movies_GPE['GPE_entities'], df_plots_us_partially_movies_GPE['GPE_US_frequencies'], df_plots_us_partially_movies_GPE['Percentage_american_culture'] = df_plots_us_partially_movies_GPE['Summary'].apply(extract_GPE_and_compute_US_frequencies)


# # df_plots_us_partially_movies_GPE['GPE_frequencies'] = df_plots_us_partially_movies_GPE['GPE_entities'].apply(
# #     lambda entities: Counter(entities) # use of the Counter class for efficient processing
# # ) # TO BE REMOVED LATER



# # Filter GPE entities to retain only those matching US locations
# # df_plots_us_partially_movies_GPE['GPE_US_frequencies'] = df_plots_us_partially_movies_GPE['GPE_entities'].apply(
# #     lambda entities: Counter(entity for entity in entities if entity in us_locations_set)
# # )

# # "Percentage" of american culture
# # df_plots_us_partially_movies_GPE['Percentage_american_culture'] = df_plots_us_partially_movies_GPE.apply(
# #     lambda row: sum(row['GPE_US_frequencies'].values()) / sum(row['GPE_frequencies'].values())
# #     if sum(row['GPE_frequencies'].values()) > 0 else 0,
# #     axis=1
# # )

# df_plots_us_partially_movies_GPE.head()


# mean_amercian_culture_us_partially_movies_GPE = df_plots_us_partially_movies_GPE['Percentage_american_culture'].mean()

# print(f"The mean percenatge of amercian culture of partially US-produced movies based on location only is {mean_amercian_culture_us_partially_movies_GPE*100:.2f} %.")

# OLD US lexicon

## Creation of US lexicon

In [None]:
# import wikipedia # to retrieve wikipedia page text content
# import spacy # to implement NLP on the wikipedia page text

# # Initialize the Spacy analyzer in English since all the wikipedia pages are analysed in English
# nlp = spacy.load("en_core_web_sm")

# # Function to process a Wikipedia page
# def process_page(page_name):
#     page_content = wikipedia.page(page_name, auto_suggest=False).content.replace('==', '').replace('\n', '')
#     doc = nlp(page_content) # tokenizing each page
#     return [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha] # removing stopwords and non alphabetic characters (with .is_alpha) and lemmatize the text to discard close form of the same word

# # Process each page separately for verification and clarity purposes
# us_words = process_page('United States') # https://en.wikipedia.org/wiki/United_States 
# #fr_words = process_page('France') # https://en.wikipedia.org/wiki/France 
# uk_words = process_page('United Kingdom') # https://en.wikipedia.org/wiki/United_Kingdom 
# de_words = process_page('Germany') # https://en.wikipedia.org/wiki/Germany 
# #it_words = process_page('Italy') # https://en.wikipedia.org/wiki/Italy 
# #jp_words = process_page('Japan') # https://en.wikipedia.org/wiki/Japan 
# #ch_words = process_page('Switzerland') # https://en.wikipedia.org/wiki/Switzerland 
# ir_words = process_page('Ireland') # https://en.wikipedia.org/wiki/Ireland --> since Ireland has a strong impact on American culture
# cn_words = process_page('Canada') # https://en.wikipedia.org/wiki/Canada 
# nz_words = process_page('New Zealand') # https://en.wikipedia.org/wiki/New_Zealand

# # Combine words from the UK, Germany, Ireland, Canada and New Zealand pages and keep only the unique ones for faster processing
# other_words = set( uk_words + de_words + ir_words + cn_words + nz_words)

# # Extract unique US words using the set() function
# unique_us_words = set(us_words) - other_words

# print(f"The list of unique US words is: \n {unique_us_words}")
# print(f"Unique US words: {len(unique_us_words)}")
# # https://www.britannica.com/place/United-States
# # https://en.wikipedia.org/wiki/Culture_of_the_United_States 
# # TODO: use an intersection rather than a set difference


# list_straightforward_American_words = ['hollywood', 'cowboy', 'thanksgiving', 'donut', 'broadway', 'sheriff', 'mcdonald', 'doughnut', 'hamburger', 'pentagon', 'halloween', 'usa', 'U.S.']

# words_in_list = []
# words_not_in_list = []

# for amercian_word in list_straightforward_American_words:
#     if amercian_word in unique_us_words:
#         words_in_list.append(amercian_word)
#     else:
#         words_not_in_list.append(amercian_word)

# print(f"The words that are both in the US wikipedia list and in the simple straightforward american list are {words_in_list}")
# print(f"The words that are in the US wikipedia list but NOT in the simple straightforward american list are {words_not_in_list}")

In [None]:
from spacy.matcher import PhraseMatcher

# Load spaCy model
nlp = spacy.load("en_core_web_trf")

# Define multi-word lexicon (terms are written in lower case)
multi_word_lexicon = {"muscle car", "slam dunk"}

# Create PhraseMatcher and add patterns.
# attr="LOWER" compares the lower-cased token text, so "Slam dunk" or "Muscle Car" in a summary also match
# the lower-case lexicon; the default (exact token text) would silently miss them.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# make_doc only tokenizes, which is all the matcher needs for the patterns
patterns = [nlp.make_doc(term) for term in multi_word_lexicon]
matcher.add("US_TERMS", patterns)

# Example sentence
text = "I went to the slam dunk with my muscle car."
doc = nlp(text)

# Find matches: each match is a (match_id, start, end) token span in `doc`
matches = matcher(doc)
matched_terms = [doc[start:end].text.lower() for match_id, start, end in matches]
print(matched_terms)  # Output: ['slam dunk', 'muscle car']

In [None]:
# Test: check which tokens survive the lexicon preprocessing on a hand-written sentence
nlp = spacy.load('en_core_web_trf')

text = "I went with my cool friends to a Thanksgiving party after electoral college :) !! We then headed back home by taking Route 66. Indeed next day we will have electoral day in the Silicon valley."
doc = nlp(text)

# Keep the lower-cased lemma of every token that is alphabetic and not a stop word
summary_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
print(summary_tokens)

In [None]:
from geopy.geocoders import Nominatim

# Define the geocoding function
def geocode_location(location):
    """
    Geocode one place name with Nominatim and flag whether the matched address is in the United States.

    Any exception raised during the lookup (e.g. network issues) is printed and treated as "not in the US".

    params: location, the place name to geocode
    returns: tuple (location, is_us); is_us is True only when a match was found and its address
             contains 'United States'
    """
    geolocator = Nominatim(user_agent="location_disambiguator", timeout=10)
    try:
        locate = geolocator.geocode(location)
        is_us = bool(locate and 'United States' in str(locate.address))
    except Exception as e:
        # Handle any exceptions (e.g., network issues)
        print(f"Error geocoding {location}: {e}")
        is_us = False
    return location, is_us

# List of locations to check (US landmarks that are not cities, counties or states)
# A stray ", ," after "mississippi river" made this list a SyntaxError, so the cell could not run at all.
locations = [
    "wall street", "grand canyon", "times square", "alcatraz", "las vegas strip",
    "mississippi river", "brooklyn bridge", "mount rushmore",
    "mount vernon", "rockefeller center"
]

# Check each location (sequentially, one geocoding request per name)
results = [geocode_location(location) for location in locations]

# Print results
for location, is_us in results:
    print(f"{location}: {'Detected in the US' if is_us else 'Not detected in the US'}")

In [None]:
# NOTE(review): the "Test US lexicon" cell below calls compute_US_lexicon_frequencies_batch; while this
# definition stays commented out, that call only works if the function is defined elsewhere in the notebook.
# def compute_US_lexicon_frequencies_batch(summaries, us_lexicon):
#     """
#     Function that implements batch processing of plot summaries and computes, for each summary, the token frequencies and the frequencies of the tokens found in the US lexicon. Uses the notebook-level `nlp` pipeline.
#     params: texts called 'summaries' and a set of US lexicon words called 'us_lexicon'
#     returns: a list with, per summary, the frequencies of all tokens, the frequencies of the US-lexicon tokens and the share of US-lexicon tokens
#     """
#     results = []
#     for doc in nlp.pipe(summaries, batch_size=500):
#         summary_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha] # lemmatize, lower all letters, remove stopwords and numbers (was token.text.lemma_: token.text is a plain str and has no lemma_)
#         Tokens_frequencies_all = Counter(summary_tokens)
#         Tokens_US_frequencies = Counter(token for token in summary_tokens if token in us_lexicon)
#         Percentage_US_culture_lexicon = (
#             sum(Tokens_US_frequencies.values()) / sum(Tokens_frequencies_all.values())
#             if sum(Tokens_frequencies_all.values()) > 0 else 0
#         )
#         results.append((Tokens_frequencies_all, Tokens_US_frequencies, Percentage_US_culture_lexicon))
#     return results

# Test US lexicon

In [None]:
# Start the wall-clock timer; the elapsed time is printed at the end of this cell
start_time = time.time()
print("Processing starting.")


# Filter DataFrame for selected movie titles.
# isin() is an exact, case-sensitive match, so the titles must be spelled exactly as in the 'title' column:
# "College road Trip" (lower-case "road") could never match and is corrected to "College Road Trip".
selected_titles_2 = ["The Hunger Games", "Dark Water", "Meet John Doe","Exodus", "Saboteur", "College Road Trip", "City Hall"]
filtered_df_2 = plot_summaries_us_movies[plot_summaries_us_movies['title'].isin(selected_titles_2)].copy()

# Apply batch processing.
# NOTE(review): compute_US_lexicon_frequencies_batch and unique_us_words_set are not defined in any active
# cell visible here - confirm they are created earlier in the notebook before running this cell.
extracted_results = compute_US_lexicon_frequencies_batch(filtered_df_2['Summary'].tolist(), unique_us_words_set)
# Results are assigned from plain lists (positional), so the filtered index does not matter
filtered_df_2['Total_tokens'] = [res[0] for res in extracted_results]
filtered_df_2['Token_US_frequencies'] = [res[1] for res in extracted_results]
filtered_df_2['Percentage_american_culture_lexicon'] = [res[2] for res in extracted_results]



filtered_df_2 = filtered_df_2.drop(columns=['Summary']) # keeping all the info of the original df, the new lexicon columns and dropping the summaries for more clarity and later use

filtered_df_2.to_csv("data/NLP_datasets/NLP_US_lexicon/NLP_US_lexicon_testUSOnly.csv", sep=',', encoding='utf-8', index=False, header = True) # hard-code the encoding to avoid any problems, as seen in the lecture

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time/60:.2f} minutes. NLP processing done.")

filtered_df_2.head(6)

NLP very first tries

In [None]:
# Disable the parser and NER components for better efficiency.
# spaCy's nlp object runs the whole pipeline by default, which involves many different steps
# ("tagger", "parser", "ner", etc.); only the tokens are needed for the counting below.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def tokenize_and_count(df):
    """
    Count how often each token occurs across all plot summaries of a dataframe.
    Params: df (pd.DataFrame): DataFrame containing a column 'Summary'.
    Returns: a Counter mapping each token text to its number of occurrences over the whole df.
    Raises: ValueError if df has no column named 'Summary'.
    """
    has_summary_column = 'Summary' in df.columns
    if not has_summary_column:
        raise ValueError("The dataframe must have a column named 'Summary'.")

    # nlp.pipe streams the summaries through the notebook-level pipeline in batches of 1000, which is
    # more efficient than calling nlp on each of the (at most about 42'300) plot summaries one by one
    token_counts = Counter()
    for doc in nlp.pipe(df['Summary'], batch_size=1000):
        token_counts.update(token.text for token in doc)
    return token_counts

# Token counts for each of the four plot-summary dataframes
all_movies_tokens = tokenize_and_count(plot_summaries_all_movies)
us_only_tokens = tokenize_and_count(plot_summaries_us_movies)
partial_us_tokens = tokenize_and_count(plot_summaries_us_partially_movies)
rest_world_tokens = tokenize_and_count(plot_summaries_RoW_movies)

# Semantic analysis based on lexical categories
from empath import Empath
lexicon = Empath()

# pre-built categories: print the names of the first 15
for cat in list(lexicon.cats.keys())[:15]:
    print(cat) # prints the category name (not the words linked with it)

# examine representative terms for one category
# (not displayed as written: only the last expression of a cell is shown)
lexicon.cats["health"][:15]

# analyse a whole text: returns, for each requested category, the number of matching terms in the text
# NOTE(review): `doc` is not created in this cell; it must come from an earlier cell - confirm which text is meant.
empath_features = lexicon.analyze(doc.text,categories = ["disappointment", "pain", "joy", "beauty", "affection"])

# create custom categories
# seed list to test: fixed the misspelled "Whashington" seed and removed the empty-string seed
lexicon.create_category("american_culture", ["New York", "burger", "guns", "Washington", "cowboys"], model="nytimes") # model = "nytimes" (New York Times), "fiction" or "reddit"