In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
%pwd

'/Users/macbookpro/Documents/Documents - Macbook’s MacBook Pro/career/career_chief_rep'

# Configure config.yaml which holds information about where our data will be stored

In [4]:
# Configuration related to data transformation
data_transformation:
  # Directory where data transformation results and artifacts are stored
  root_dir: artifacts/data_transformation
  
  # Path to the ingested data file that will be used for validation
  data_source_file: artifacts/data_ingestion/gsearch_jobs.csv

  # Path to data validation status
  data_validation: artifacts/data_validation/status.txt

SyntaxError: invalid syntax (1617510383.py, line 2)

# Setup Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Configuration for the data transformation process.
    
    This configuration class captures the necessary paths and directories 
    required for the transformation of data post-ingestion and pre-model training.
    
    Attributes:
    - root_dir: Directory where data transformation results and artifacts are stored.
    - data_source_file: Path to the file where the ingested data is stored that needs to be transformed.
    """
    
    root_dir: Path  # Directory for storing transformation results and related artifacts
    data_source_file: Path  # Path to the ingested data file for transformation
    data_validation: Path # Path to the validated output file

# Configure the Configuration Manager

In [6]:
from src.career_chief.constants import *
from src.career_chief.utils.common import read_yaml, create_directories
from src.career_chief import logger
from src.career_chief.entity.config_entity import (DataIngestionConfig, DataValidationConfig)

class ConfigurationManager:
    """
    ConfigurationManager manages configurations needed for the data pipeline.

    The class reads configuration, parameter, and schema settings from specified files
    and provides a set of methods to access these settings. It also takes care of
    creating necessary directories defined in the configurations.

    Attributes:
    - config (dict): Configuration settings.
    - params (dict): Parameters for the pipeline.
    - schema (dict): Schema information.
    """
    
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH, 
                 params_filepath = PARAMS_FILE_PATH, 
                 schema_filepath = SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schema.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - Directories specified in the configuration.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read a configuration file and return its content.

        Args:
        - filepath (str): Path to the configuration file.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - dict: Configuration settings.

        Raises:
        - Exception: If there's an error reading the file.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Extract and return data transformation configurations as a DataTransformationConfig object.

        This method fetches settings related to data transformation, like directories and file paths,
        and returns them as a DataTransformationConfig object.

        Returns:
        - DataTransformationConfig: Object containing data transformation configuration settings.

        Raises:
        - AttributeError: If the 'data_transformation' attribute does not exist in the config file.
        """
        try:
            config = self.config.data_transformation
            
            # Ensure the root directory for data transformation exists
            create_directories([config.root_dir])

            # Construct and return the DataTransformationConfig object
            return DataTransformationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                data_validation=Path(config.data_validation),
            )

        except AttributeError as e:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("The 'data_transformation' attribute does not exist in the config file.")
            raise e

# Build Component

In [7]:
'''
Checklist

1. Check data validation status
2. Read data
3. Remove noise specific to technical resumes such as code snippets and special characters, and emojis
4. Normalize by standardizing technical terminologies and acronyms
5. Remove stop words and irrelevant words such as "@" in twitter mentions, or urls, pronouns, prepositions and conjunctions, etc
6. Lemmatization, here we want the same token for different word forms, e.g. wolves and wolf, or talks and talk
7. Stemming, here we remove and replace suffixes to get the root form of the word
8. Tokenize the cleaned and normalized text to prepare it for further NLP and ML processing. tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER ")
'''

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tqdm.auto import tqdm  # Optional, for progress bar support
import re
import spacy

from src.career_chief import logger
from src.career_chief.entity.config_entity import DataTransformationConfig

class DataTransformation:
    """
    Processes technical resume data for NLP tasks, performing cleaning, normalization,
    and preparation steps such as stop word removal, lemmatization, stemming, and tokenization.
    """
    
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.nlp = spacy.load("en_core_web_sm")  # Load spaCy model
        # Ensure the lemmatizer is in the pipeline
        if not self.nlp.has_pipe("lemmatizer"):
            self.nlp.add_pipe("lemmatizer")
        self.df = self._load_data()
        self.stop_words = spacy.lang.en.stop_words.STOP_WORDS
        self._handle_missing_values()
        self.nlp_pipeline = self._initialize_nlp_pipeline()


    # def _download_nltk_resources(self):
    #     """Download necessary NLTK resources."""
    #     nltk.download('stopwords')
    #     nltk.download('wordnet')
    #     nltk.download('omw-1.4')
    #     logger.info("NLTK resources downloaded successfully.")

    # def _initialize_stop_words(self):
    #     """Initialize the list of stop words from NLTK."""
    #     logger.info("Stop words initialized.")
    #     return set(stopwords.words('english'))

    def _load_data(self):
        """Read data from the configured source."""
        try:
            return pd.read_csv(self.config.data_source_file)
        except Exception as e:
            logger.error(f"Failed to load data: {e}")
            raise

    def _handle_missing_values(self):
        """Remove records with missing values in critical columns only."""
        critical_columns = ['description']  # List critical columns here
        initial_shape = self.df.shape
        self.df.dropna(subset=critical_columns, inplace=True)
        final_shape = self.df.shape
        logger.info(f"Missing values handled in critical columns. Rows before: {initial_shape[0]}, after: {final_shape[0]}.")
        print(self.df)


    def remove_noise(self, df):
        """Remove noise such as code snippets, special characters, and emojis."""
        # Example regex to remove special characters and emojis
        df['cleaned_text'] = df['description'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)))
        logger.info("Noise removed from the text.")
        return df

    def normalize_technical_terms(self, df):
        """Normalize technical terminologies and acronyms."""
        normalization_dict = {
            'GPU': 'Graphics Processing Unit',
            'CPU': 'Central Processing Unit',
            'RDBMS': 'Relational Database Management System',
            'ORM': 'Object-Relational Mapping',
            'SSD': 'Solid State Drive',
            'HDD': 'Hard Disk Drive',
            'IOPS': 'Input/Output Operations Per Second',
            'RAID': 'Redundant Array of Independent Disks',
            'NAS': 'Network Attached Storage',
            'SAN': 'Storage Area Network',
            'NIC': 'Network Interface Card',
            'VLAN': 'Virtual Local Area Network',
            'DNS': 'Domain Name System',
            'DHCP': 'Dynamic Host Configuration Protocol',
            'NAT': 'Network Address Translation',
            'VPN': 'Virtual Private Network',
            'TCP': 'Transmission Control Protocol',
            'UDP': 'User Datagram Protocol',
            'IP': 'Internet Protocol',
            'FTP': 'File Transfer Protocol',
            'SFTP': 'SSH File Transfer Protocol',
            'SMTP': 'Simple Mail Transfer Protocol',
            'POP': 'Post Office Protocol',
            'IMAP': 'Internet Message Access Protocol',
            'HTTP': 'Hypertext Transfer Protocol',
            'HTTPS': 'Hypertext Transfer Protocol Secure',
            'SSL': 'Secure Sockets Layer',
            'TLS': 'Transport Layer Security',
            'SSH': 'Secure Shell',
            'CIDR': 'Classless Inter-Domain Routing',
            'ARP': 'Address Resolution Protocol',
            'RARP': 'Reverse Address Resolution Protocol',
            'MPLS': 'Multiprotocol Label Switching',
            'BGP': 'Border Gateway Protocol',
            'OSPF': 'Open Shortest Path First',
            'VRRP': 'Virtual Router Redundancy Protocol',
            'HSRP': 'Hot Standby Router Protocol',
            'SNMP': 'Simple Network Management Protocol',
            'WPA': 'Wi-Fi Protected Access',
            'WPA2': 'Wi-Fi Protected Access 2',
            'WEP': 'Wired Equivalent Privacy',
            'AP': 'Access Point',
            'SSID': 'Service Set Identifier',
            'IoT': 'Internet of Things',
            'IIoT': 'Industrial Internet of Things',
            'RFID': 'Radio-Frequency Identification',
            'BLE': 'Bluetooth Low Energy',
            'ZigBee': 'ZigBee',
            'Z-Wave': 'Z-Wave',
            'MQTT': 'MQ Telemetry Transport',
            'CoAP': 'Constrained Application Protocol',
            'AMQP': 'Advanced Message Queuing Protocol',
            'DDS': 'Data Distribution Service',
            'REST': 'Representational State Transfer',
            'SOAP': 'Simple Object Access Protocol',
            'JSON': 'JavaScript Object Notation',
            'XML': 'eXtensible Markup Language',
            'CSV': 'Comma Separated Values',
            'YAML': 'YAML Ain\'t Markup Language',
            'TOML': 'Tom\'s Obvious, Minimal Language',
            'INI': 'Initialization File',
            'SDK': 'Software Development Kit',
            'IDE': 'Integrated Development Environment',
            'CI': 'Continuous Integration',
            'CD': 'Continuous Delivery',
            'VCS': 'Version Control System',
            'SCM': 'Source Code Management',
            'DVC': 'Data Version Control',
            'NFT': 'Non-Fungible Token',
            'DeFi': 'Decentralized Finance',
            'DAO': 'Decentralized Autonomous Organization',
            'DApp': 'Decentralized Application',
            'IPFS': 'InterPlanetary File System',
            'AI': 'Artificial Intelligence',
            'ML': 'Machine Learning',
            'OOP': 'Object Oriented Programming',
            'API': 'Application Programming Interface',
            'CLI': 'Command-Line Interface',
            'NLP': 'Natural Language Processing',
            'CV': 'Computer Vision',
            'DL': 'Deep Learning',
            'RNN': 'Recurrent Neural Network',
            'CNN': 'Convolutional Neural Network',
            'GAN': 'Generative Adversarial Network',
            'EDA': 'Exploratory Data Analysis',
            'TPU': 'Tensor Processing Unit',
            'SQL': 'Structured Query Language',
            'NoSQL': 'Not Only SQL',
            'BI': 'Business Intelligence',
            'ETL': 'Extract, Transform, Load',
            'SaaS': 'Software as a Service',
            'PaaS': 'Platform as a Service',
            'IaaS': 'Infrastructure as a Service',
            'DBMS': 'Database Management System',
            'MVC': 'Model-View-Controller',
            'MVP': 'Minimum Viable Product',
            'CRUD': 'Create, Read, Update, Delete',
            'DevOps': 'Development and Operations',
            'SDLC': 'Software Development Life Cycle',
            'Agile': 'Agile Software Development',
            'Scrum': 'Scrum Framework',
            'Kanban': 'Kanban Methodology',
            'Git': 'Git Version Control',
            'SVN': 'Subversion Version Control',
            'JIRA': 'JIRA Software',
            'TDD': 'Test-Driven Development',
            'BFS': 'Breadth-First Search',
            'DFS': 'Depth-First Search',
            'SVM': 'Support Vector Machines',
            'PCA': 'Principal Component Analysis',
            'LDA': 'Latent Dirichlet Allocation',
            'UUID': 'Universally Unique Identifier',
            'JWT': 'JSON Web Token',
            'OAuth': 'Open Authorization',
            'SAST': 'Static Application Security Testing',
            'DAST': 'Dynamic Application Security Testing',
            'XSS': 'Cross-Site Scripting',
            'CSRF': 'Cross-Site Request Forgery',
            'CI/CD': 'Continuous Integration and Continuous Delivery',
            'FaaS': 'Function as a Service',
            'BaaS': 'Backend as a Service',
            'MBaaS': 'Mobile Backend as a Service',
            'CaaS': 'Container as a Service',
            'DaaS': 'Data as a Service',
            'K8s': 'Kubernetes',
            'IaC': 'Infrastructure as Code',
            'PWA': 'Progressive Web App',
            'SPA': 'Single Page Application',
            'SSR': 'Server-Side Rendering',
            'CSR': 'Client-Side Rendering',
            'CDN': 'Content Delivery Network',
            'SEO': 'Search Engine Optimization',
            'SEM': 'Search Engine Marketing',
            'SRE': 'Site Reliability Engineering',
            'MLOps': 'Machine Learning Operations',
            'AIOps': 'Artificial Intelligence for IT Operations',
            'DataOps': 'Data Operations',
            'DevSecOps': 'Development, Security, and Operations',
            'GitOps': 'Operations by Pull Request',
            'FinTech': 'Financial Technology',
            'HealthTech': 'Health Technology',
            'EdTech': 'Education Technology',
            'AgriTech': 'Agriculture Technology',
            'RegTech': 'Regulatory Technology',
            'InsurTech': 'Insurance Technology',
            'LegalTech': 'Legal Technology',
            'Quant': 'Quantitative Analyst',
            'Big Data': 'Large Scale Data Processing',
            'DWH': 'Data Warehousing',
            'OLTP': 'Online Transaction Processing',
            'OLAP': 'Online Analytical Processing',
            'ETL': 'Extract, Transform, Load',
            'ELT': 'Extract, Load, Transform',
            'BI': 'Business Intelligence',
            'DA': 'Data Analysis/Data Analytics',
            'DS': 'Data Science',
            'DE': 'Data Engineering',
            'DL': 'Deep Learning',
            'AI': 'Artificial Intelligence',
            'ML': 'Machine Learning',
            'NLP': 'Natural Language Processing',
            'CV': 'Computer Vision',
            'AR': 'Augmented Reality',
            'VR': 'Virtual Reality',
            'MR': 'Mixed Reality',
            'XR': 'Extended Reality',
            'IoT': 'Internet of Things',
            'IIoT': 'Industrial Internet of Things',
            'UI': 'User Interface',
            'UX': 'User Experience',
            'CX': 'Customer Experience',
            'GA': 'Google Analytics',
            'CRO': 'Conversion Rate Optimization',
            'KPI': 'Key Performance Indicator',
            'ROI': 'Return on Investment',
            'COGS': 'Cost of Goods Sold',
            'CAC': 'Customer Acquisition Cost',
            'CLV': 'Customer Lifetime Value',
            'MRR': 'Monthly Recurring Revenue',
            'ARR': 'Annual Recurring Revenue',
            'LTV': 'Lifetime Value',
            'A/B Testing': 'Split Testing',
            'CI/CD': 'Continuous Integration and Continuous Delivery',
            'ERP': 'Enterprise Resource Planning',
            'CRM': 'Customer Relationship Management',
            'CMS': 'Content Management System',
            'LMS': 'Learning Management System',
            'VLE': 'Virtual Learning Environment',
            'MOOC': 'Massive Open Online Course',
            'Webinar': 'Web-Based Seminar',
            'Fintech': 'Financial Technology',
            'Insurtech': 'Insurance Technology',
            'Healthtech': 'Health Technology',
            'Edtech': 'Education Technology',
            'NFT': 'Non-Fungible Token',
            'DeFi': 'Decentralized Finance',
            'DAO': 'Decentralized Autonomous Organization',
            'DApp': 'Decentralized Application',
            'IPFS': 'InterPlanetary File System',
            'ICO': 'Initial Coin Offering',
            'FaaS': 'Function as a Service',
            # The list continues...
        }
        
        def normalize_text(text):
            for term, normal_form in normalization_dict.items():
                text = re.sub(r'\b{}\b'.format(term), normal_form, text, flags=re.IGNORECASE)
            return text
        df['cleaned_text'] = df['cleaned_text'].apply(normalize_text)
        logger.info("Technical terminologies and acronyms normalized.")
        return df
    
    def remove_stop_words(self, df):
        """Remove stop words and irrelevant words."""
        df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([word for word in str(x).split() if word.lower() not in self.stop_words]))
        logger.info("Stop words removed.")
        return df

    
    # def lemmatize_text(self, df):
    #     """Lemmatize the text to get base forms of words."""
    #     lemmatizer = WordNetLemmatizer()
    #     df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in str(x).split()]))
    #     logger.info("Text lemmatized.")
    #     return df
    
    def lemmatize_text(self, df):
        """
        Lemmatize the text using spaCy's lemmatizer added to the pipeline.
        Utilizes .pipe for efficient batch processing.
        """
        def lemmatize(texts):
            lemmatized_texts = []
            for doc in self.nlp.pipe(texts, batch_size=50):
                lemmatized_texts.append(' '.join([token.lemma_ for token in doc]))
            return lemmatized_texts
        
        df['cleaned_text'] = lemmatize(df['description'].tolist())
        logger.info("Text lemmatized using spaCy.")
        return df


    def stem_text(self, df):
        """Stem the text to get root forms of words."""
        stemmer = PorterStemmer()
        df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
        logger.info("Text stemmed.")
        return df
    
    def tokenize_text(self, df):
        """Tokenize the cleaned and normalized text."""
        # The tokenizer expects a list of texts, hence using `.tolist()`
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
        df['tokens'] = df['cleaned_text'].apply(lambda x: tokenizer.tokenize(str(x)))
        logger.info("Text tokenized.")
        return df

    def _initialize_nlp_pipeline(self):
        """Initialize the NLP pipeline for Named Entity Recognition (NER)."""
        
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
        self.nlp_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer)
        logger.info("NER pipeline initialized.")


    def apply_ner(self, df):
        """
        Applies Named Entity Recognition (NER) to the 'cleaned_text' column in batches,
        with improved error handling and efficiency.
        """
        batch_size = 100  # Adjust batch size based on memory and performance
        total_batches = (len(df) + batch_size - 1) // batch_size  # Calculate total number of batches

        # Initialize an empty list to store NER results
        ner_results = []

        # Process DataFrame in batches
        for batch_num in tqdm(range(total_batches), desc="Applying NER"):
            start_idx = batch_num * batch_size
            end_idx = min((batch_num + 1) * batch_size, len(df))
            batch_texts = df.iloc[start_idx:end_idx]['cleaned_text'].tolist()

            try:
                # Apply NER pipeline to the entire list of texts in the batch
                batch_ner_results = [self.nlp_pipeline(text) for text in batch_texts]
                ner_results.extend(batch_ner_results)
            except Exception as e:
                logger.error(f"NER application failed for batch {batch_num} (indices {start_idx}-{end_idx}): {e}")
                # Append None or a specific error indicator for each item in the failed batch
                ner_results.extend([None] * (end_idx - start_idx))

        # Assign the NER results back to the DataFrame
        df['ner_results'] = ner_results
        logger.info("Named Entity Recognition applied in batches.")
        return df

    
    def transform_data(self):
        """
        Applies all preprocessing steps to self.df, the internal DataFrame.
        """
        # Use self.df directly for transformations
        self.df = self.remove_noise(self.df)
        self.df = self.normalize_technical_terms(self.df)
        self.df = self.remove_stop_words(self.df)
        self.df = self.lemmatize_text(self.df)
        self.df = self.stem_text(self.df)
        self.df = self.tokenize_text(self.df)
        self.df = self.apply_ner(self.df)
    
    def split_data(self, test_size=0.2, val_size=0.1, random_state=None):
        """
        Splits the DataFrame into training, testing, and validation sets.

        Args:
            test_size (float): Fraction of the dataset to be used as test set.
            val_size (float): Fraction of the dataset to be used as validation set.
            random_state (int): Seed for random splitting for reproducibility.

        Returns:
            tuple: Three DataFrames for training, testing, and validation sets.
        """
        train_val_df, test_df = train_test_split(self.df, test_size=test_size, random_state=random_state)
        adjusted_val_size = val_size / (1 - test_size)
        train_df, val_df = train_test_split(train_val_df, test_size=adjusted_val_size, random_state=random_state)
        return train_df, test_df, val_df

    
    def save_transformed_data(self, df, filename):
        """Saves the transformed DataFrame to the specified file within root_dir.

        Args:
            df (pd.DataFrame): The DataFrame to save.
            filename (str): The filename to use for saving the DataFrame.
        """
        # Directly use self.config.root_dir without adding 'data_transformation'
        file_path = os.path.join(self.config.root_dir, filename)
        try:
            df.to_csv(file_path, index=False)
            logger.info(f"Transformed data saved to {file_path}")
        except Exception as e:
            logger.error(f"Failed to save transformed data: {e}")



  from .autonotebook import tqdm as notebook_tqdm


[2024-03-10 16:06:29,999: 58: datasets: INFO: config:  PyTorch version 2.1.2 available.]


# Build Pipeline

In [8]:
class DataTransformationPipeline:
    STAGE_NAME = "Data Transformation Pipeline"

    def __init__(self):
        """
        Initializes the DataTransformationPipeline with necessary configurations
        obtained from the ConfigurationManager.
        """
        self.config_manager = ConfigurationManager()
        transformation_config = self.config_manager.get_data_transformation_config()
        self.data_transformation = DataTransformation(transformation_config)

    def run_pipeline(self):
        logger.info(f"{self.STAGE_NAME}: Starting data transformation process.")

        # Process the loaded data
        self.data_transformation.transform_data()

        # After processing, self.df is updated and ready for splitting and conversion to datasets
        train_df, test_df, val_df = self.data_transformation.split_data()

        # Convert the split DataFrames to Dataset objects
        train_ds = Dataset.from_pandas(train_df)
        test_ds = Dataset.from_pandas(test_df)
        val_ds = Dataset.from_pandas(val_df)

        # Optionally, save the transformed datasets
        self.data_transformation.save_transformed_data(train_df, 'train_data.csv')
        self.data_transformation.save_transformed_data(test_df, 'test_data.csv')
        self.data_transformation.save_transformed_data(val_df, 'val_data.csv')

        logger.info(f"{self.STAGE_NAME}: Data transformation process completed.")
        return train_ds, test_ds, val_ds


if __name__ == '__main__':
    pipeline = DataTransformationPipeline()
    train_ds, test_ds, val_ds = pipeline.run_pipeline()
    print(f"Training dataset size: {len(train_ds)}")
    print(f"Testing dataset size: {len(test_ds)}")
    print(f"Validation dataset size: {len(val_ds)}")


[2024-03-10 16:06:31,252: 41: career_chief_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2024-03-10 16:06:31,255: 41: career_chief_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2024-03-10 16:06:31,262: 41: career_chief_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2024-03-10 16:06:31,263: 64: career_chief_logger: INFO: common:  Created directory at: artifacts]
[2024-03-10 16:06:31,263: 64: career_chief_logger: INFO: common:  Created directory at: artifacts/data_transformation]
[2024-03-10 16:06:33,927: 73: career_chief_logger: INFO: 2596414057:  Missing values handled in critical columns. Rows before: 37962, after: 37962.]
       Unnamed: 0  index                                              title  \
0               0      0                                       Data Analyst   
1               1      1                                       Data Analyst   
2               2      2                          Aeronauti

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[2024-03-10 16:06:43,223: 350: career_chief_logger: INFO: 2596414057:  NER pipeline initialized.]
[2024-03-10 16:06:43,225: 14: career_chief_logger: INFO: 1216590037:  Data Transformation Pipeline: Starting data transformation process.]
[2024-03-10 16:06:45,326: 81: career_chief_logger: INFO: 2596414057:  Noise removed from the text.]
[2024-03-10 16:13:37,001: 296: career_chief_logger: INFO: 2596414057:  Technical terminologies and acronyms normalized.]
[2024-03-10 16:13:40,282: 302: career_chief_logger: INFO: 2596414057:  Stop words removed.]
[2024-03-10 17:46:25,236: 325: career_chief_logger: INFO: 2596414057:  Text lemmatized using spaCy.]
[2024-03-10 17:49:41,731: 333: career_chief_logger: INFO: 2596414057:  Text stemmed.]


Token indices sequence length is longer than the specified maximum sequence length for this model (785 > 512). Running this sequence through the model will result in indexing errors


[2024-03-10 17:50:59,051: 341: career_chief_logger: INFO: 2596414057:  Text tokenized.]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Applying NER:   0%|          | 0/380 [00:00<?, ?it/s]

[2024-03-10 17:50:59,239: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 0 (indices 0-100): 'NoneType' object is not callable]
[2024-03-10 17:50:59,240: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 1 (indices 100-200): 'NoneType' object is not callable]
[2024-03-10 17:50:59,241: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 2 (indices 200-300): 'NoneType' object is not callable]
[2024-03-10 17:50:59,241: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 3 (indices 300-400): 'NoneType' object is not callable]
[2024-03-10 17:50:59,242: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 4 (indices 400-500): 'NoneType' object is not callable]
[2024-03-10 17:50:59,242: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 5 (indices 500-600): 'NoneType' object is not callable]
[2024-03-10 17:50:59,243: 375: caree

Applying NER:  27%|██▋       | 103/380 [00:00<00:00, 1027.67it/s]

[2024-03-10 17:50:59,322: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 103 (indices 10300-10400): 'NoneType' object is not callable]
[2024-03-10 17:50:59,323: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 104 (indices 10400-10500): 'NoneType' object is not callable]
[2024-03-10 17:50:59,323: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 105 (indices 10500-10600): 'NoneType' object is not callable]
[2024-03-10 17:50:59,324: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 106 (indices 10600-10700): 'NoneType' object is not callable]
[2024-03-10 17:50:59,324: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 107 (indices 10700-10800): 'NoneType' object is not callable]
[2024-03-10 17:50:59,325: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 108 (indices 10800-10900): 'NoneType' object is not callable

Applying NER:  54%|█████▍    | 206/380 [00:04<00:04, 42.13it/s]  

[2024-03-10 17:51:03,412: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 206 (indices 20600-20700): 'NoneType' object is not callable]
[2024-03-10 17:51:03,414: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 207 (indices 20700-20800): 'NoneType' object is not callable]
[2024-03-10 17:51:03,415: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 208 (indices 20800-20900): 'NoneType' object is not callable]
[2024-03-10 17:51:03,416: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 209 (indices 20900-21000): 'NoneType' object is not callable]
[2024-03-10 17:51:03,417: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 210 (indices 21000-21100): 'NoneType' object is not callable]
[2024-03-10 17:51:03,418: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 211 (indices 21100-21200): 'NoneType' object is not callable

Applying NER:  74%|███████▍  | 282/380 [00:04<00:01, 65.98it/s]

[2024-03-10 17:51:03,509: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 282 (indices 28200-28300): 'NoneType' object is not callable]
[2024-03-10 17:51:03,511: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 283 (indices 28300-28400): 'NoneType' object is not callable]
[2024-03-10 17:51:03,512: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 284 (indices 28400-28500): 'NoneType' object is not callable]
[2024-03-10 17:51:03,513: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 285 (indices 28500-28600): 'NoneType' object is not callable]
[2024-03-10 17:51:03,514: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 286 (indices 28600-28700): 'NoneType' object is not callable]
[2024-03-10 17:51:03,515: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 287 (indices 28700-28800): 'NoneType' object is not callable

Applying NER:  98%|█████████▊| 371/380 [00:04<00:00, 103.28it/s]

[2024-03-10 17:51:03,609: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 371 (indices 37100-37200): 'NoneType' object is not callable]
[2024-03-10 17:51:03,610: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 372 (indices 37200-37300): 'NoneType' object is not callable]
[2024-03-10 17:51:03,611: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 373 (indices 37300-37400): 'NoneType' object is not callable]
[2024-03-10 17:51:03,614: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 374 (indices 37400-37500): 'NoneType' object is not callable]
[2024-03-10 17:51:03,615: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 375 (indices 37500-37600): 'NoneType' object is not callable]
[2024-03-10 17:51:03,616: 375: career_chief_logger: ERROR: 2596414057:  NER application failed for batch 376 (indices 37600-37700): 'NoneType' object is not callable

Applying NER: 100%|██████████| 380/380 [00:04<00:00, 86.42it/s] 

[2024-03-10 17:51:03,625: 381: career_chief_logger: INFO: 2596414057:  Named Entity Recognition applied in batches.]





[2024-03-10 17:51:44,067: 427: career_chief_logger: INFO: 2596414057:  Transformed data saved to artifacts/data_transformation/train_data.csv]
[2024-03-10 17:51:47,598: 427: career_chief_logger: INFO: 2596414057:  Transformed data saved to artifacts/data_transformation/test_data.csv]
[2024-03-10 17:51:51,447: 427: career_chief_logger: INFO: 2596414057:  Transformed data saved to artifacts/data_transformation/val_data.csv]
[2024-03-10 17:51:51,450: 32: career_chief_logger: INFO: 1216590037:  Data Transformation Pipeline: Data transformation process completed.]
Training dataset size: 26572
Testing dataset size: 7593
Validation dataset size: 3797
