In [21]:
import pandas as pd

# The first CSV column is the row index that was written out when the file was
# saved (it shows up as "Unnamed: 0" if read as data), so load it as the index
# instead of carrying a meaningless extra column through the notebook.
dataset = pd.read_csv('dataset.csv', index_col=0)
dataset.head()

Unnamed: 0,domain,sentence
0,restaurants,But the staff was so horrible to us .
1,restaurants,To be completely fair the only redeeming fact...
2,restaurants,The food is uniformly exceptional with a very...
3,restaurants,Not only was the food outstanding but the lit...
4,restaurants,Our agreed favorite is the orrechiete with sau...


In [22]:
# Library imports
import spacy

# Load the small English spaCy pipeline once; later cells reuse it as `nlp`.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [23]:
# Strip stop words and punctuation from a piece of text
def remove_stopwords_punctuation(text):
    """Return `text` without its stop-word and punctuation tokens.

    The text is run through the module-level spaCy pipeline `nlp`; every token
    that is neither a stop word nor punctuation is kept, and the kept token
    texts are joined with single spaces.

    Args:
        text (str): The raw text to filter.

    Returns:
        str: The remaining token texts separated by single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return ' '.join(kept)


In [24]:
# Pre-process the corpus: concatenate every sentence into one document, then
# strip stop words and punctuation from it.
# Missing values are dropped and everything is coerced to str so that
# ' '.join does not raise a TypeError on a NaN / non-string cell.
sentence_texts = dataset['sentence'].dropna().astype(str).tolist()
all_sentences_string = ' '.join(sentence_texts)
combined_documents = all_sentences_string
# spaCy rejects texts longer than nlp.max_length. Only ever raise the limit:
# assigning len(...) + 100 unconditionally could lower it for a short corpus.
nlp.max_length = max(nlp.max_length, len(combined_documents) + 100)
sentence = remove_stopwords_punctuation(combined_documents)


In [25]:
# Parse the filtered corpus and split it into spaCy sentence spans.
doc = nlp(sentence)
sentences = list(doc.sents)

nouns = []
noun_phrases = []

# Collect noun chunks and single NOUN tokens sentence by sentence.
# The loop variable is deliberately NOT called `sentence`: that name holds the
# pre-processed corpus string consumed by nlp(sentence) above, and overwriting
# it with a Span would make this cell fail when it is run a second time.
for sent in sentences:
    noun_phrases.extend(chunk.text for chunk in sent.noun_chunks)
    nouns.extend(token.text for token in sent if token.pos_ == "NOUN")

# NOTE(review): despite its name, `combined_nouns` only contains the noun
# phrases; `nouns` is collected but not merged in. Kept as-is -- confirm intent.
combined_nouns = noun_phrases

In [26]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sanity check: encode the first extracted noun phrase and show the tensors
sample_phrase = combined_nouns[0]
encoded_input = tokenizer(
    sample_phrase,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
encoded_input

{'input_ids': tensor([[ 101, 3095,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [27]:
from torch.utils.data import Dataset

class DataFrameDataset(Dataset):
    """A custom Dataset that serves tokenized sentences and integer domain labels
    from a pandas DataFrame.

    Each row's 'sentence' column is the model input and its 'domain' column is
    the classification target. Domain values are mapped to integer class ids,
    because a sequence-classification head needs class indices rather than raw
    strings such as "restaurants".
    """

    def __init__(self, dataframe, tokenizer, max_length, label2id=None):
        """
        Args:
            dataframe (pandas.DataFrame): The DataFrame containing the data.
                Must have 'sentence' and 'domain' columns.
            tokenizer: The tokenizer used to preprocess the text data.
            max_length (int): Maximum length of the tokenized output.
            label2id (dict, optional): Mapping from 'domain' value to integer
                class id. When omitted, ids are assigned in sorted order of the
                distinct non-null 'domain' values found in `dataframe`. Pass the
                same mapping to every split if the splits might not contain the
                same set of domains, otherwise their ids will not line up.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        if label2id is None:
            distinct_labels = sorted(dataframe['domain'].dropna().unique())
            label2id = {label: idx for idx, label in enumerate(distinct_labels)}
        self.label2id = dict(label2id)
        self.id2label = {idx: label for label, idx in self.label2id.items()}

    def __len__(self):
        """Returns the size of the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Generates one sample of data.

        Args:
            idx (int): The positional index of the item.

        Returns:
            A dictionary containing the input IDs, attention mask, and the
            integer class id of the row's domain under the key 'labels'.

        Raises:
            KeyError: If the row's 'domain' value is not present in `label2id`.
        """
        # Look the row up once; 'sentence' is the input text, 'domain' the target.
        row = self.dataframe.iloc[idx]
        text = row['sentence']
        label = row['domain']

        # Tokenize the text, padding/truncating to exactly max_length tokens
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # Integer class id, not the raw domain string
            'labels': int(self.label2id[label])
        }


In [28]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# The split is stratified on 'domain' (the classification target) so that both
# splits keep the same domain proportions and no domain ends up missing from
# one of them. random_state pins the shuffle for reproducibility.
train_df, val_df = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=dataset['domain'],
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Every sentence is padded/truncated to this many tokens
max_length = 128

train_dataset = DataFrameDataset(train_df, tokenizer, max_length)
val_dataset = DataFrameDataset(val_df, tokenizer, max_length)

In [29]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Size the classification head from the data rather than hard-coding the
# number of domains, so the model stays in sync with the contents of `dataset`.
num_labels = dataset['domain'].nunique()

# from_pretrained fetches the checkpoint from the Hugging Face Hub on first use.
# NOTE(review): the ReadTimeout recorded below this cell came from that download
# host; re-running the cell (or raising the HF_HUB_DOWNLOAD_TIMEOUT environment
# variable) presumably resolves it -- confirm on the target network.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints are written
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,                 # learning-rate warm-up steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,                 # log every 10 optimizer steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1be1a187-e5f7-4e05-890a-d782d22fa719)')