### Neural Network Model



In [10]:
import sys
import os

# Make the repository importable from this notebook: the project root is
# assumed to sit three directories above the current working directory
# (add or remove an `os.pardir` if the notebook lives at a different depth).
# `abspath` resolves the relative path against the current working directory.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))

# Register the root only once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [22]:
import nltk

# Each entry is (path probed with nltk.data.find, package handed to
# nltk.download, notice printed right before downloading). The punkt_tab
# entry probes the English sub-directory specifically.
for _resource_path, _package, _notice in (
    ("tokenizers/punkt", "punkt", "Downloading NLTK 'punkt' tokenizer..."),
    ("corpora/stopwords", "stopwords", "Downloading NLTK 'stopwords'..."),
    ("tokenizers/punkt_tab/english/", "punkt_tab", "Downloading NLTK 'punkt_tab'..."),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # nltk.data.find raises LookupError when the resource is not installed.
        print(_notice)
        nltk.download(_package)

print("NLTK resources checked/downloaded.")

Downloading NLTK 'punkt_tab'...


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/cerdricdamais/nltk_data...


NLTK resources checked/downloaded.


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [19]:
import pandas as pd
from models.src.preprocessing.tokenizer import tokenize
from data.process_data_modeling import get_processed_data
import os
import logging
import nltk

# Unconditionally request the tokenizer models and the stop-word corpus.
for _package in ("punkt", "stopwords"):
    nltk.download(_package)

# Root logger: INFO and above, formatted as "timestamp - level - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cerdricdamais/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cerdricdamais/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Load the job-postings DataFrame through the project helper
# `get_processed_data` (imported from data.process_data_modeling).
# NOTE(review): the output recorded under this cell shows the helper logging a
# Kaggle import, a NaN-row drop and a parquet save — confirm against the
# helper's source before relying on those steps.
df = get_processed_data()
# Preview the first rows as the cell's rich output.
df.head()

2025-04-28 16:03:32,772 - INFO - Importing data from Kaggle
2025-04-28 16:03:32,774 - INFO - Path to dataset files: /Users/cerdricdamais/.cache/kagglehub/datasets/arshkon/linkedin-job-postings/versions/13
2025-04-28 16:03:32,775 - INFO - List of files in the dataset: ['postings.csv', 'mappings', 'jobs', 'companies']
2025-04-28 16:03:36,984 - INFO - Rows with at least one NaN value: 1725
2025-04-28 16:03:36,990 - INFO - Number of rows before dropping NaN values: 123849
2025-04-28 16:03:37,017 - INFO - Number of rows after dropping NaN values: 122124
2025-04-28 16:04:15,273 - INFO - DataFrame saved to: /Users/cerdricdamais/Desktop/EPITA/MAJEUR/NLP-1/NLP_Linkedin_offers/models/src/generation/data/processed/cleaned_postings_modeling.parquet
2025-04-28 16:04:15,274 - INFO - Data processing completed successfully !
2025-04-28 16:04:15,275 - INFO - Returning the processed DataFrame


Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,job descriptiona leading real estate firm in n...,"Princeton, NJ"
2,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH"
3,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY"
5,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summarythe economic development planning i...,"Raleigh, NC"
6,Raw Cereal,Producer,company descriptionraw cereal is a creative de...,United States


In [23]:
def tokenize_data_frame(
    df, column_names: "list[str] | str", method="nltk", remove_stopwords=False
):
    """
    Tokenize one or more columns of a DataFrame and store the results next to them.

    For every requested column a new column named "<column>_tokenized" is
    written into ``df``; each cell holds whatever ``tokenize`` returns for the
    corresponding cell of the source column.

    Args:
        df: DataFrame to tokenize. It is modified in place.
        column_names: Names of the columns to tokenize. A single column name
            given as a plain string is accepted and treated as one column
            (rather than being iterated character by character).
        method: Tokenization method, forwarded unchanged to ``tokenize``.
        remove_stopwords: Forwarded unchanged to ``tokenize``.

    Returns:
        The same ``df`` object that was passed in, now carrying the new columns.

    Raises:
        KeyError: If any requested column is missing. The check runs before
            any tokenization starts, so ``df`` is left untouched in that case.
    """
    # A bare string is iterable; wrap it so "title" means one column.
    if isinstance(column_names, str):
        column_names = [column_names]
    else:
        # Materialize so one-shot iterables survive the two passes below.
        column_names = list(column_names)

    # Fail fast: tokenizing a large text column can be slow, so report a bad
    # column name before doing any work or adding any column to `df`.
    missing_columns = [name for name in column_names if name not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns not found in DataFrame: {missing_columns}")

    for column_name in column_names:
        column_name_tokenized = column_name + "_tokenized"
        logging.info("Tokenizing column: %s", column_name)
        logging.info("Method: %s", method)
        logging.info("New column name: %s", column_name_tokenized)
        df[column_name_tokenized] = df[column_name].apply(
            lambda text: tokenize(text, method, remove_stopwords)
        )
    return df


# Tokenization of the text
# Each listed column gets a sibling "<name>_tokenized" column.
columns_to_tokenize = ["description", "company_name", "title", "location"]
# tokenize_data_frame writes the new columns into `df` and returns that same
# object, so `tokenized_df` and `df` refer to one DataFrame after this call.
tokenized_df = tokenize_data_frame(df, columns_to_tokenize)
tokenized_df.head()

2025-04-28 16:05:51,665 - INFO - Tokenizing column: description
2025-04-28 16:05:51,667 - INFO - Method: nltk
2025-04-28 16:05:51,667 - INFO - New column name: description_tokenized
2025-04-28 16:08:15,162 - INFO - Tokenizing column: company_name
2025-04-28 16:08:15,170 - INFO - Method: nltk
2025-04-28 16:08:15,171 - INFO - New column name: company_name_tokenized
2025-04-28 16:08:20,716 - INFO - Tokenizing column: title
2025-04-28 16:08:20,718 - INFO - Method: nltk
2025-04-28 16:08:20,718 - INFO - New column name: title_tokenized
2025-04-28 16:08:26,661 - INFO - Tokenizing column: location
2025-04-28 16:08:26,662 - INFO - Method: nltk
2025-04-28 16:08:26,662 - INFO - New column name: location_tokenized


Unnamed: 0,company_name,title,description,location,description_tokenized,company_name_tokenized,title_tokenized,location_tokenized
0,Corcoran Sawyer Smith,Marketing Coordinator,job descriptiona leading real estate firm in n...,"Princeton, NJ","[job, descriptiona, leading, real, estate, fir...","[Corcoran, Sawyer, Smith]","[Marketing, Coordinator]","[Princeton, ,, NJ]"
2,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH","[the, national, exemplar, is, accepting, appli...","[The, National, Exemplar]","[Assitant, Restaurant, Manager]","[Cincinnati, ,, OH]"
3,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY","[senior, associate, attorney, elder, law, trus...","[Abrams, Fensterman, ,, LLP]","[Senior, Elder, Law, /, Trusts, and, Estates, ...","[New, Hyde, Park, ,, NY]"
5,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summarythe economic development planning i...,"Raleigh, NC","[job, summarythe, economic, development, plann...","[Downtown, Raleigh, Alliance]","[Economic, Development, and, Planning, Intern]","[Raleigh, ,, NC]"
6,Raw Cereal,Producer,company descriptionraw cereal is a creative de...,United States,"[company, descriptionraw, cereal, is, a, creat...","[Raw, Cereal]",[Producer],"[United, States]"


### Text Data Analyzer

- Perform a statistical analysis of the dataset; these statistics will help with the text-generation task.

In [44]:
class TextDataAnalyzer:
    """
    Exploratory analysis of the data (statistics, classes, tokens, etc.)

    Token statistics are computed over every column of the wrapped DataFrame
    whose name ends with "_tokenized". A cell counts as a token sequence when
    it is list-like (list, tuple, set, NumPy array, ...) and has a length;
    missing cells are ignored, and any other cell (for example a raw string)
    is skipped with a warning. Both token counters share this definition.
    """

    # Suffix that marks a column as holding per-row token sequences.
    _TOKENIZED_SUFFIX = "_tokenized"

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to ``df``; the DataFrame is not copied."""
        self.df = df

    def _tokenized_columns(self):
        """Return the labels of the columns whose name ends with the tokenized suffix."""
        # Non-string labels (e.g. integer column names) cannot carry the suffix.
        return [
            col
            for col in self.df.columns
            if isinstance(col, str) and col.endswith(self._TOKENIZED_SUFFIX)
        ]

    @staticmethod
    def _is_token_sequence(value):
        """Return True for sized, list-like cells; strings are not list-like."""
        return pd.api.types.is_list_like(value) and hasattr(value, "__len__")

    def _token_sequences(self, column_name):
        """Return the token sequences stored in ``column_name``, skipping other cells."""
        non_null_cells = self.df[column_name].dropna()
        sequences = [
            cell for cell in non_null_cells if self._is_token_sequence(cell)
        ]
        skipped = len(non_null_cells) - len(sequences)
        if skipped:
            logging.warning(
                "Column %s: skipped %d cell(s) that are not token sequences.",
                column_name,
                skipped,
            )
        return sequences

    def total_number_of_documents(self):
        """
        Return the total number of documents in the dataset
        (one document per DataFrame row).
        """
        return len(self.df)

    def total_number_of_unique_tokens(self):
        """
        Return the number of distinct tokens across all tokenized columns.

        Tokens are pooled over every "_tokenized" column, so a token that
        appears in several columns is counted once. Returns 0 (and logs a
        warning) when the DataFrame has no tokenized column.
        """
        tokenized_columns = self._tokenized_columns()

        if not tokenized_columns:
            logging.warning(
                "No tokenized columns found/specified to calculate unique tokens."
            )
            return 0

        overall_unique_tokens = set()
        for column_name in tokenized_columns:
            for token_sequence in self._token_sequences(column_name):
                overall_unique_tokens.update(token_sequence)

        return len(overall_unique_tokens)

    def total_number_of_tokens(self):
        """
        Return the total number of tokens across all tokenized columns.

        This is the sum of the lengths of every token sequence. Returns 0
        (and logs a warning) when the DataFrame has no tokenized column.
        """
        tokenized_columns = self._tokenized_columns()

        if not tokenized_columns:
            logging.warning(
                "No tokenized columns found/specified to calculate total tokens."
            )
            return 0

        total_count = sum(
            len(token_sequence)
            for column_name in tokenized_columns
            for token_sequence in self._token_sequences(column_name)
        )
        # Plain Python int even if a sequence type reports a NumPy integer length.
        return int(total_count)

In [45]:
# Wrap the tokenized DataFrame and report its corpus-level statistics.
analyze = TextDataAnalyzer(tokenized_df)

# Values are passed as logging arguments; the emitted messages are unchanged.
logging.info("Total number of documents: %s", analyze.total_number_of_documents())
logging.info(
    "Total number of unique tokens: %s", analyze.total_number_of_unique_tokens()
)
logging.info("Total number of tokens: %s", analyze.total_number_of_tokens())

2025-04-28 16:41:31,648 - INFO - Total number of documents: 122124
2025-04-28 16:41:37,233 - INFO - Total number of unique tokens: 798954
2025-04-28 16:41:37,509 - INFO - Total number of tokens: 72220012
