In [1]:
import os
# Move the kernel's working directory one level up so that relative paths used
# by later cells resolve from the parent folder (presumably the project root —
# confirm against the repository layout).
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
os.chdir("../")

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class PrepareBaseModelConfig:
    """Immutable settings for the "prepare base model" stage.

    Instances are built by ``ConfigurationManager.get_prepare_base_model_config``
    from the YAML config and params files; ``frozen=True`` prevents the values
    from being reassigned after construction.
    """
    # Working directory of this stage (created by ConfigurationManager).
    root_dir: Path
    # Destination file for the saved model state dict.
    base_model_path: Path
    # CSV file with the complaints data (taken from the data_ingestion section).
    file_path: Path
    # Number of output classes passed to the classifier (`num_labels`).
    params_num_labels: int
    # Fraction of samples held out by train_test_split.
    params_test_size: float
    # Learning rate from the params file; not read by the code in this notebook.
    params_learning_rate: float
    # Seed passed to train_test_split so the split is reproducible.
    params_random_state: int
    # Batch size used for both the train and validation DataLoader.
    params_batch_size: int

In [3]:
from Consumer_Complaint_Analysis.constants import *
from Consumer_Complaint_Analysis.utils import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Reads the YAML config/params files and hands out per-stage settings."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: location of the main configuration YAML.
            params_filepath: location of the hyper-parameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
        """Assemble the settings object for the "prepare base model" stage.

        The stage's root directory is created as a side effect.

        Returns:
            PrepareBaseModelConfig: paths from the config file combined with
            the hyper-parameters from the params file.
        """
        stage = self.config.prepare_base_model
        hyper_params = self.params

        create_directories([stage.root_dir])

        return PrepareBaseModelConfig(
            root_dir=Path(stage.root_dir),
            base_model_path=Path(stage.base_model_path),
            # The input CSV belongs to the data_ingestion section, not this stage.
            file_path=Path(self.config.data_ingestion.csv_file_path),
            params_num_labels=hyper_params.NUM_LABELS,
            params_test_size=hyper_params.TEST_SIZE,
            params_learning_rate=hyper_params.LEARNING_RATE,
            params_random_state=hyper_params.RANDOM_STATE,
            params_batch_size=hyper_params.BATCH_SIZE,
        )

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

class DataPreProcessing:
    """Loads the complaints CSV and cleans it for model preparation."""

    def __init__(self, config):
        """Keep a reference to the stage configuration.

        Args:
            config: stage settings object; stored but not read by the
                methods of this class.
        """
        self.config = config

    def pre_process_data(self, path):
        """Read the CSV at ``path`` and return a cleaned data frame.

        Missing values in every column are replaced with that column's most
        frequent value, and the ``'Consumer disputed?'`` target is encoded as
        0 (``'No'``) / 1 (``'Yes'``).

        Args:
            path (str or Path): path of the CSV file.

        Returns:
            pandas.DataFrame: the pre-processed data frame.

        Raises:
            ValueError: if the target column still holds a value other than
                ``'No'``/``'Yes'`` after imputation (the integer cast fails).
        """
        df = pd.read_csv(path)

        # Replace NaN values with the most frequent value of each column.
        for column in df.columns:
            if not df[column].isna().any():
                # Nothing to fill; skip the comparatively costly mode().
                continue
            modes = df[column].mode()
            if modes.empty:
                # Column is entirely NaN: there is no value to impute with.
                continue
            # Assign the result back instead of using inplace=True on the
            # selected column: the chained in-place form does not reliably
            # update `df` (it is a no-op under pandas Copy-on-Write).
            df[column] = df[column].fillna(modes.iloc[0])

        # Convert the target column to 0 or 1.
        df['Consumer disputed?'] = (
            df['Consumer disputed?'].map({'No': 0, 'Yes': 1}).astype(int)
        )

        return df

In [6]:
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


class PrepareBaseModel(nn.Module):
    """Prepares the DistilBERT base classifier and its data loaders.

    The class also offers a few descriptive aggregations over the complaints
    data frame (per company, per state and per ZIP code).
    """

    # Checkpoint used for both the tokenizer and the classifier weights.
    PRETRAINED_NAME = 'distilbert-base-uncased'
    # Length every encoded narrative is padded/truncated to.
    MAX_SEQUENCE_LENGTH = 512
    NARRATIVE_COLUMN = 'Consumer complaint narrative'
    TARGET_COLUMN = 'Consumer disputed?'

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: object exposing ``params_test_size``,
                ``params_random_state``, ``params_batch_size``,
                ``params_num_labels`` and ``base_model_path``.
        """
        super().__init__()
        self.config = config

    def analyze_firm_size_market_share(self, df):
        """Count narratives per company and each company's share of the total.

        Returns:
            pandas.DataFrame: columns ``Company``, ``Complaint Count`` and
            ``Market Share`` (count divided by the sum of all counts).
        """
        firm_size = (
            df.groupby('Company')[self.NARRATIVE_COLUMN]
            .count()
            .reset_index(name='Complaint Count')
        )
        firm_size['Market Share'] = (
            firm_size['Complaint Count'] / firm_size['Complaint Count'].sum()
        )
        return firm_size

    @staticmethod
    def _complaints_per_group(df, key, narrative_column='Consumer complaint narrative'):
        """Aggregate non-null narratives per ``key`` and relate them to row counts."""
        summary = (
            df.groupby(key)[narrative_column]
            .count()
            .reset_index()
            .rename(columns={narrative_column: 'Complaint Count'})
        )
        rows_per_group = df[key].value_counts()
        # Look the denominator up through the key column. Dividing by the
        # value_counts() Series directly aligns a 0..n-1 index with an index of
        # key values and yields NaN for every row.
        # NOTE(review): the denominator is the number of rows per group, not a
        # real population figure — no population data is available here.
        summary['Complaint per Capita'] = (
            summary['Complaint Count'] / summary[key].map(rows_per_group)
        )
        return summary

    def analyze_population_of_state(self, df):
        """Complaint counts and complaint-per-row ratio for every state.

        The result is also stored on ``self.state_population``.

        Returns:
            pandas.DataFrame: columns ``State``, ``Complaint Count`` and
            ``Complaint per Capita``.
        """
        state_population = self._complaints_per_group(df, 'State', self.NARRATIVE_COLUMN)
        self.state_population = state_population
        return state_population

    def analyze_population_of_ZIP_code(self, df):
        """Complaint counts and complaint-per-row ratio for every ZIP code.

        The result is also stored on ``self.ZIP_code_population``.

        Returns:
            pandas.DataFrame: columns ``ZIP code``, ``Complaint Count`` and
            ``Complaint per Capita``.
        """
        ZIP_code_population = self._complaints_per_group(df, 'ZIP code', self.NARRATIVE_COLUMN)
        self.ZIP_code_population = ZIP_code_population
        return ZIP_code_population

    def truncate_sequence(self, sequence, max_length=512):
        """Return ``sequence`` cut to at most ``max_length`` items.

        For a string this limits the number of characters, not tokens.
        """
        return sequence[:max_length]

    def get_base_model(self, df, max_rows=1000):
        """Tokenize the narratives, build the data loaders and save the model.

        Side effects: sets ``self.dataloaders`` (keys ``"train"`` and
        ``"val"``), ``self.model``, ``self.state_population`` and
        ``self.ZIP_code_population``, and writes the model state dict to
        ``self.config.base_model_path``.

        Args:
            df: pre-processed complaints data frame containing the narrative
                and target columns.
            max_rows: use only the first ``max_rows`` rows of ``df``; pass
                ``None`` to use every row. Defaults to 1000.
        """
        if max_rows is not None:
            df = df.head(max_rows)

        # Tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(self.PRETRAINED_NAME)
        narratives = [
            self.truncate_sequence(text, max_length=self.MAX_SEQUENCE_LENGTH)
            for text in df[self.NARRATIVE_COLUMN].tolist()
        ]
        # `truncation=True` guarantees equal-length rows: the character-level
        # cut above does not bound the token count, and ragged rows cannot be
        # converted to a tensor. `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`.
        encoded_data = tokenizer(
            narratives,
            padding='max_length',
            truncation=True,
            max_length=self.MAX_SEQUENCE_LENGTH,
            return_attention_mask=True,
        )
        input_ids = torch.tensor(encoded_data['input_ids'])
        attention_mask = torch.tensor(encoded_data['attention_mask'])
        labels = torch.tensor(df[self.TARGET_COLUMN].tolist())

        # Split ids, masks and labels in ONE call so the three stay row-aligned
        # by construction rather than by repeating the same seed.
        (train_inputs, test_inputs,
         train_masks, test_masks,
         train_labels, test_labels) = train_test_split(
            input_ids,
            attention_mask,
            labels,
            test_size=self.config.params_test_size,
            random_state=self.config.params_random_state,
        )

        # Descriptive statistics; the state and ZIP code results are kept on self.
        self.analyze_firm_size_market_share(df)
        self.analyze_population_of_state(df)
        self.analyze_population_of_ZIP_code(df)

        # Creating a TensorDataset
        train_data = TensorDataset(train_inputs, train_masks, train_labels)
        test_data = TensorDataset(test_inputs, test_masks, test_labels)

        # Data Loaders: only the training data is shuffled.
        batch_size = self.config.params_batch_size
        train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

        self.dataloaders = {"train": train_dataloader, "val": test_dataloader}

        # Model Configuration
        model = DistilBertForSequenceClassification.from_pretrained(
            self.PRETRAINED_NAME, num_labels=self.config.params_num_labels
        )
        self.model = model
        self.save_model(self.config.base_model_path, self.model)

    @staticmethod
    def save_model(path, model):
        """Write ``model``'s state dict to ``path`` with ``torch.save``."""
        torch.save(model.state_dict(), path)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Run the "prepare base model" stage end to end: load the settings, clean the
# CSV, then build and save the base model. Errors propagate unchanged; the
# previous `except Exception as e: raise e` wrapper re-raised the same
# exception and only added a frame to the traceback.
config = ConfigurationManager()
prepare_base_model_config = config.get_prepare_base_model_config()
Data = DataPreProcessing(config=prepare_base_model_config)
df = Data.pre_process_data(prepare_base_model_config.file_path)
prepare_base_model = PrepareBaseModel(config=prepare_base_model_config)
prepare_base_model.get_base_model(df)

  df = pd.read_csv(path)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_