# All useful functions called in `results`

In [None]:
import ast
import re
import time

import nltk
import numpy as np
import pandas as pd
import pycountry
import pycountry_convert as pc
import requests
from bs4 import BeautifulSoup
#from country_named_entity_recognition import find_countries

In [2]:
def extract_string(string):
    """
    Function to extract all words between quotes

    Quoted entries starting with '/' are dropped, as they are not country names.

    Parameters: string (str) - a string containing words between quotes

    Returns: between_quotes (list) - a list of the remaining words between quotes,
             in their order of appearance
    """
    # The re library allows to extract all words between quotes
    between_quotes = re.findall(r'"(.*?)"', string)
    # Build a new list instead of calling remove() on the list being iterated:
    # removing in place shifts the remaining items, so the entry that follows
    # a removed one would be skipped. startswith() is also safe on the empty
    # string produced by an empty pair of quotes.
    return [word for word in between_quotes if not word.startswith('/')]

In [3]:
def estimate_release_year(row, mean_release_year_by_genre):
    """
    Function to fill in a missing release year of a movie from its genres

    Parameters: row (pd.Series) - a row of a pandas DataFrame, read through its
                    'Movie_release_date' and 'genre_list' entries
                mean_release_year_by_genre (mapping) - mean release year for each genre

    Returns: release_year - the existing 'Movie_release_date' when it is not missing;
             otherwise the average (a float) of the means of the genres of the movie
             that are present in the mapping, or pd.NA when none of them is
    """
    known_date = row['Movie_release_date']
    if not pd.isna(known_date):
        return known_date

    # Genres absent from the mapping are ignored
    genre_means = [
        mean_release_year_by_genre[genre]
        for genre in row['genre_list']
        if genre in mean_release_year_by_genre
    ]
    if not genre_means:
        return pd.NA
    return sum(genre_means) / len(genre_means)

In [4]:
def count_countries(string):
    """
    Function giving the number of comma-separated countries in a string

    Parameters: string (str) - a string containing countries separated by commas

    Returns: country_number (int) - the number of commas in the string, plus one
    """
    # n commas delimit n + 1 entries
    separators = string.count(',')
    return separators + 1

In [5]:
def is_dict_string(s):
    """
    Check whether a string is the literal representation of a dictionary.

    Parameters:
    s (str): The string to check.

    Returns:
    bool: True if the string parses as a Python literal that is a dict, False otherwise.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # ValueError / SyntaxError: the string is not a valid literal.
        # TypeError: e.g. a dict literal whose key is unhashable.
        return False
    # Numbers, lists, strings... are valid literals too, so a successful parse
    # is not enough: the parsed value itself has to be a dict.
    return isinstance(parsed, dict)

In [6]:
def country_to_continent(country_name):
    """
    Map a country name to the name of its continent.

    Args:
        country_name (str): Value handed to `pycountry.countries.lookup`.

    Returns:
        The continent name produced by `pycountry_convert`, or None when the
        lookup or the conversion fails (a message is printed in that case).
    """
    try:
        # country name -> alpha-2 code -> continent code -> continent name
        alpha2_code = pycountry.countries.lookup(country_name).alpha_2
        continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except LookupError:
        print(f"Country not found: {country_name}")
    except Exception as e:
        print(f"Error converting country to continent: {country_name}, {e}")
    # Reached only after one of the two handlers above has run
    return None

In [7]:
def get_word_count(url, retries=3, delay=5, timeout=30):
    """
    Download a page and count the words of its main content.

    Args:
        url (str): Address of the page to download.
        retries (int): Maximum number of download attempts.
        delay (float): Seconds to wait between two attempts.
        timeout (float): Seconds after which a single request is abandoned.

    Returns:
        int | None: Number of whitespace-separated words in the <div> whose id
        is 'mw-content-text', once bracketed numeric citations such as "[12]"
        are removed. None when the page has no such element, or when
        `retries` is not positive (no attempt is made).

    Raises:
        requests.RequestException: If the last attempt fails as well (network
        error, timeout or HTTP error status).
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
        except requests.RequestException:
            # Out of attempts: let the caller see the original error
            if attempt == retries - 1:
                raise
            time.sleep(delay)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the main content of the page
        content = soup.find('div', {'id': 'mw-content-text'})
        if content is None:
            return None

        # Extract text and remove citations
        text = re.sub(r'\[\d+\]', '', content.get_text())

        # Count the number of words
        return len(text.split())

    return None


In [8]:
# Function to preprocess event labels
def preprocess_text(text):
    """
    Lowercase a text and replace every run of non-word characters with one space.

    Args:
        text (str): Text to normalise.

    Returns:
        str: The normalised text.
    """
    return re.sub(r'\W+', ' ', text.lower())

In [9]:
# Function to get the best matching countries
def get_countries(event, country_names, dict_countries):
    """
    List the countries mentioned in an event label.

    Args:
        event (str): Event label; it is normalised with `preprocess_text` first.
        country_names (iterable of str): Country names, compared in lowercase.
        dict_countries (dict): Maps a key looked up in the normalised label
            (presumably a lowercase demonym - confirm against the caller) to
            the countries it stands for.

    Returns:
        list: Sorted, de-duplicated countries found through either route.
    """
    normalized_event = preprocess_text(event)

    # NOTE(review): both checks are plain substring tests, so a short name can
    # match inside a longer word - confirm this is acceptable for the data.
    matched_by_name = {
        name for name in country_names if name.lower() in normalized_event
    }

    matched_by_key = set()
    for key, mapped_countries in dict_countries.items():
        if key in normalized_event:
            matched_by_key.update(mapped_countries)

    return sorted(matched_by_name | matched_by_key)

In [10]:
def text_split(text):
    """
    Tokenise a text and keep only its common nouns.

    Args:
        text (str): Text to process.

    Returns:
        list: The tokens tagged 'NN' or 'NNS', in order of appearance. The
        casing of the tokens is left unchanged.
    """
    # Remove all punctuation from the text
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenise the text into words. The NLTK helpers are reached through the
    # `nltk` module that the file imports.
    words = nltk.word_tokenize(text)
    if not words:
        return []

    # Every word is tagged on its own, as a one-word sentence. A single batched
    # call yields the same tags as one pos_tag call per word without repeating
    # the per-call setup.
    tagged_sentences = nltk.pos_tag_sents([[word] for word in words])
    tags = [sentence[0][1] for sentence in tagged_sentences]

    # Keep only the nouns (singular and plural)
    return [word for word, tag in zip(words, tags) if tag in ('NN', 'NNS')]

In [11]:
def bag_of_words(text_splited):
    """
    Build the bag-of-words representation of one tokenised document.

    Args:
        text_splited (list): Tokens of the document.

    Returns:
        tuple: (corpus, dictionary), where dictionary is the `Dictionary` built
        from this single document and corpus is the result of
        `dictionary.doc2bow(text_splited)`.
    """
    # NOTE(review): `Dictionary` is not imported in the visible part of the
    # file - presumably gensim.corpora.Dictionary; confirm.
    # The dictionary maps each word to a unique id
    vocabulary = Dictionary([text_splited])
    return vocabulary.doc2bow(text_splited), vocabulary

In [12]:
def lda(df_line):
    """
    Build a one-topic `LdaModel` from the bag of words stored in a row.

    Args:
        df_line: Row-like object whose 'corpus' entry holds a
            (corpus, dictionary) pair.

    Returns:
        The `LdaModel` created from that single document.
    """
    # NOTE(review): `LdaModel` is not imported in the visible part of the
    # file - presumably gensim.models.LdaModel; confirm.
    bag, vocabulary = df_line['corpus']
    return LdaModel([bag], num_topics=1, id2word=vocabulary)

In [13]:
def get_topics(lda_model):
    """
    Return the formatted description of the single topic of a model.

    Args:
        lda_model: Object exposing a `show_topics` method (the models built in
            this file are `LdaModel` instances).

    Returns:
        The value of `lda_model.show_topics(num_topics=1, num_words=50,
        formatted=True)`.
    """
    # Provide fifty words belonging to a single topic
    return lda_model.show_topics(num_topics=1, num_words=50, formatted=True)

In [14]:
def extract(data):
    """
    Turn a formatted topic string into a table of words and weights.

    Args:
        data: Nested sequence in which `data[0][0][1]` is a string made of
            `weight*"word"` terms.

    Returns:
        pd.DataFrame: One row per term, with columns "freq" (float) and "word".
    """
    topic_string = data[0][0][1]

    # Each match is a (weight, word) pair of strings
    weighted_words = re.findall(r'([\d.]+)\*"(.*?)"', topic_string)
    table = pd.DataFrame(weighted_words, columns=["freq", "word"])

    # The weights are captured as text; convert them to floats
    return table.astype({"freq": float})

In [15]:
def text_split_new(text):
    """
    Tokenise a text and return the tokens tagged as common nouns.

    Args:
        text (str): Text to process.

    Returns:
        list: The tokens whose part-of-speech tag is 'NN' or 'NNS', in order of
        appearance and with their casing unchanged.
    """
    # Remove all punctuation from the text
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenise the text into words, going through the `nltk` module that the
    # file imports.
    words = nltk.word_tokenize(text)
    if not words:
        return []

    # Tag each word in isolation (one-word sentences), all in one batched call:
    # same tags as calling pos_tag on every word, without the repeated setup.
    tagged_sentences = nltk.pos_tag_sents([[word] for word in words])
    tags = [sentence[0][1] for sentence in tagged_sentences]

    # Keep only the nouns (singular and plural)
    return [word for word, tag in zip(words, tags) if tag in ('NN', 'NNS')]

In [16]:
def lda_new(df_line):
    """
    Build a one-topic `LdaModel` from a (corpus, dictionary) pair.

    Args:
        df_line (tuple): The (corpus, dictionary) pair of one document.

    Returns:
        The `LdaModel` created from that single document.
    """
    # NOTE(review): `LdaModel` is not imported in the visible part of the
    # file - presumably gensim.models.LdaModel; confirm.
    bag, vocabulary = df_line
    return LdaModel([bag], num_topics=1, id2word=vocabulary)

In [17]:

def get_topics_new(lda_model):
    """
    Return the formatted description of the single topic of a model.

    Args:
        lda_model: Object exposing a `show_topics` method.

    Returns:
        The value of `lda_model.show_topics(num_topics=1, num_words=50,
        formatted=True)`.
    """
    # One topic, described by fifty words
    single_topic = lda_model.show_topics(num_topics=1, num_words=50, formatted=True)
    return single_topic


In [18]:
def extract_new(topics_tuple):
    """
    Flatten formatted topic strings into (word, weight) pairs.

    Args:
        topics_tuple: Iterable of (topic_id, topic_string) pairs, each string
            being made of `weight*"word"` terms with a decimal weight.

    Returns:
        list: (word, weight) tuples with the weight as a float, in order of
        appearance across all topics.
    """
    weighted_word = re.compile(r'(\d+\.\d+)\*"(.*?)"')
    return [
        (word, float(weight))
        for _topic_id, topic_string in topics_tuple
        for weight, word in weighted_word.findall(topic_string)
    ]

In [None]:
# Function to read and preprocess text data
def read_and_preprocess(file_path):
    """
    Reads a text file, flattens it onto one line, and tokenizes it into sentences.

    Args:
        file_path (str): Path to the input text file (read as UTF-8).

    Returns:
        list: The non-empty sentences, stripped of surrounding whitespace.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Turn each line break into a space rather than deleting it: deleting
    # glues the last word of a line to the first word of the next one, which
    # also hides the sentence boundary from the tokenizer.
    text = text.replace('\n', ' ')

    # Tokenize the text into sentences using NLTK
    sentences = nltk.sent_tokenize(text)
    stripped_sentences = (sentence.strip() for sentence in sentences)
    return [sentence for sentence in stripped_sentences if sentence]

In [None]:
def extract_higher_correlations(decade, threshold):
    """
    Collect the correlations of a decade that exceed a threshold.

    Args:
        decade (int): The decade whose heatmap CSV file is read.
        threshold (float): Only correlations strictly greater than this value are kept.

    Returns:
        list: One dictionary per retained cell, holding the decade, the row
        label ('Movies cluster'), the column label ('History cluster') and the
        correlation value.
    """
    csv_path = f'src/data/heatmaps_data/heatmap_data_{decade}.csv'

    # Cells that cannot be read as numbers become NaN
    matrix = pd.read_csv(csv_path, index_col=0).apply(pd.to_numeric, errors='coerce')

    # One entry per (row label, column label) pair, then keep the strong ones
    pairs = matrix.stack()
    strong_pairs = pairs[pairs > threshold]

    records = []
    for (movie_cluster, history_cluster), value in strong_pairs.items():
        records.append({
            'Decade': decade,
            'Movies cluster': movie_cluster,
            'History cluster': history_cluster,
            'Correlation': value,
        })
    return records

In [None]:
def extract_max_similarity(decade):
    """
    Extract the maximum similarity for a given decade.

    Args:
        decade (int): The decade whose bar-plot CSV file is read.

    Returns:
        dict: A dictionary containing the decade, the history cluster name
        (row label) with the maximum similarity, and the similarity value.
        When the file holds no numeric value at all, the cluster is None and
        the similarity is NaN.
    """

    file_path = f'src/data/max_similarity_plots_data/bar_plot_data_{decade}.csv'
    df = pd.read_csv(file_path, index_col=0)

    # Convert data to numeric; cells that cannot be converted become NaN
    df = df.apply(pd.to_numeric, errors='coerce')

    values = df.to_numpy(dtype=float, na_value=np.nan)
    if values.size == 0 or np.isnan(values).all():
        return {'Decade': decade, 'History cluster': None, 'Similarity': np.nan}

    # nanargmax ignores NaN cells, whereas argmax would return the position of
    # the first NaN and so point at a row unrelated to the maximum.
    row_position, col_position = np.unravel_index(np.nanargmax(values), values.shape)

    # Read the label and the value from the same cell so they always agree
    return {
        'Decade': decade,
        'History cluster': df.index[row_position],
        'Similarity': df.iat[row_position, col_position]
    }
