In [1]:
import os
import pandas as pd

def load_data_from_directory(directory):
    """Load labelled reviews from the ``pos`` and ``neg`` sub-folders of ``directory``.

    Every file whose name ends with ``.txt`` is read as UTF-8 text. Files
    under ``pos`` receive label 1 and files under ``neg`` receive label 0.
    Files are visited in sorted filename order so the resulting row order is
    the same on every platform and every run.

    Args:
        directory: Path to a folder that contains ``pos`` and ``neg`` sub-folders.

    Returns:
        A ``pandas.DataFrame`` with a ``review`` column (file contents) and a
        ``label`` column (1 or 0); all ``pos`` rows come before the ``neg`` rows.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``open`` when a sub-folder or
            file cannot be read (e.g. ``FileNotFoundError``).
    """
    reviews = []
    labels = []

    for label in ['pos', 'neg']:
        folder_path = os.path.join(directory, label)
        label_value = 1 if label == 'pos' else 0

        # os.listdir yields entries in arbitrary order; sorting keeps the
        # DataFrame rows reproducible across filesystems.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith('.txt'):  # Ensure we only read text files
                with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                    reviews.append(f.read())
                    labels.append(label_value)

    return pd.DataFrame({'review': reviews, 'label': labels})

# Load the training and testing data.
# Both paths are relative, so they resolve against the notebook's working directory.
train_df = load_data_from_directory('IMDB/train')
test_df = load_data_from_directory('IMDB/test')

# Display the first few rows of the training dataset (plain-text rendering via print)
print(train_df.head())


                                              review  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


In [2]:
import os
import pandas as pd
import re
import string
import torch
from transformers import BertTokenizer


In [3]:
def clean_text(text):
    """Normalise a raw review string into lowercase, punctuation-free text.

    Steps, applied in order: HTML tags are replaced by a space, ASCII
    punctuation is deleted, digit runs are deleted, the text is lowercased,
    and runs of whitespace are collapsed to a single space with the ends
    trimmed.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Substitute a space rather than nothing: markup such as "<br /><br />" often sits
    # directly between two words, and deleting it outright would glue them together.
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers (optional)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text


In [4]:
def load_data_from_directory(directory):
    """Read every ``.txt`` review under ``directory/pos`` and ``directory/neg``.

    Files are decoded as UTF-8. Reviews found under ``pos`` are labelled 1 and
    reviews found under ``neg`` are labelled 0. Filenames are processed in
    sorted order so that the returned rows have a reproducible order.

    Args:
        directory: Path to a folder that contains ``pos`` and ``neg`` sub-folders.

    Returns:
        A ``pandas.DataFrame`` with columns ``review`` (file contents) and
        ``label`` (1 or 0); ``pos`` rows precede ``neg`` rows.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``open`` when a sub-folder or
            file cannot be read (e.g. ``FileNotFoundError``).
    """
    reviews = []
    labels = []

    for label in ['pos', 'neg']:
        folder_path = os.path.join(directory, label)  # Using the directory parameter
        label_value = 1 if label == 'pos' else 0

        # Sort the listing: os.listdir makes no ordering guarantee, and an
        # unstable row order makes downstream results hard to reproduce.
        for filename in sorted(os.listdir(folder_path)):
            if filename.endswith('.txt'):
                with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                    reviews.append(f.read())
                    labels.append(label_value)

    return pd.DataFrame({'review': reviews, 'label': labels})


In [5]:
# Load both splits from the relative IMDB/ folders; this rebinds train_df and test_df
# to freshly read, uncleaned text.
train_df = load_data_from_directory('IMDB/train')
test_df = load_data_from_directory('IMDB/test')


In [6]:
# Clean every review with clean_text. The 'review' column is overwritten in place,
# so the raw text is no longer available from these frames after this cell runs.
train_df['review'] = train_df['review'].apply(clean_text)
test_df['review'] = test_df['review'].apply(clean_text)


In [7]:
# Preview the first rows of the training frame after cleaning.
train_df.head()

Unnamed: 0,review,label
0,bromwell high is a cartoon comedy it ran at th...,1
1,homelessness or houselessness as george carlin...,1
2,brilliant overacting by lesley ann warren best...,1
3,this is easily the most underrated film inn th...,1
4,this is not the typical mel brooks film it was...,1


In [8]:
# Preview the first rows of the test frame after cleaning.
test_df.head()

Unnamed: 0,review,label
0,i went and saw this movie last night after bei...,1
1,actor turned director bill paxton follows up h...,1
2,as a recreational golfer with some knowledge o...,1
3,i saw this film in a sneak preview and it is d...,1
4,bill paxton has taken the true story of the us...,1


In [9]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [10]:
# Tokenise all training reviews in one call: sequences are truncated to at most
# 128 tokens, padded to a common length, and returned as PyTorch tensors.
tokens = tokenizer.batch_encode_plus(
    train_df['review'].tolist(),
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='pt'
)


In [11]:
# Pull the two model inputs out of the tokenizer output.
train_input_ids = tokens['input_ids']
train_attention_mask = tokens['attention_mask']


In [12]:
# Convert the 0/1 label column into a tensor, in the same row order as the tokenised reviews.
train_labels = torch.tensor(train_df['label'].tolist())


In [13]:
class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset that bundles token ids, attention masks and labels.

    The three containers are indexed in parallel: item ``i`` is built from
    position ``i`` of each of them.
    """

    def __init__(self, input_ids, attention_mask, labels):
        """Store the three parallel, indexable containers without copying them."""
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        """Return the number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with the ``input_ids``, ``attention_mask`` and ``labels`` entry at ``idx``."""
        field_names = ('input_ids', 'attention_mask', 'labels')
        return {field: getattr(self, field)[idx] for field in field_names}


In [14]:
# Bundle the training ids, masks and labels into one indexable dataset.
train_dataset = IMDBDataset(train_input_ids, train_attention_mask, train_labels)


In [15]:
# Tokenise the test reviews with the same settings used for the training split
# (max 128 tokens, padding and truncation on, PyTorch tensors out).
test_tokens = tokenizer.batch_encode_plus(
    test_df['review'].tolist(),
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='pt'
)


In [16]:
# Pull the model inputs out of the tokenizer output and convert the test labels to a tensor.
test_input_ids = test_tokens['input_ids']
test_attention_mask = test_tokens['attention_mask']
test_labels = torch.tensor(test_df['label'].tolist())

# Create the testing dataset
test_dataset = IMDBDataset(test_input_ids, test_attention_mask, test_labels)


In [19]:
# Wrap both datasets in loaders that yield batches of 32 examples: the training
# loader shuffles, the test loader keeps the dataset order.
# NOTE(review): TensorDataset is imported here but not used in the visible cells.
from torch.utils.data import DataLoader, TensorDataset
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# The loaders above are what the training and evaluation code iterate over.

In [22]:
from transformers import BertForSequenceClassification
from tqdm import tqdm

# Number of full passes over the training data; used for both the loop bound
# and the progress message so the two cannot drift apart.
NUM_EPOCHS = 3

# Initialize the model with a fresh 2-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# transformers.AdamW is deprecated and no longer importable from recent
# transformers releases; torch.optim.AdamW is the supported replacement.
# eps and weight_decay are pinned to the defaults of the old transformers
# implementation so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set the model in training mode
model.train()

# Training Loop
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    total_loss = 0

    for batch in tqdm(train_loader):
        # Move data to GPU if available
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Clear previous gradients
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss alongside the logits
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f'Average Training Loss: {avg_loss}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


100%|██████████████████████████████████████████████████████████████████████████████| 782/782 [5:57:11<00:00, 27.41s/it]


Average Training Loss: 0.3356241104776597
Epoch 2/3


100%|██████████████████████████████████████████████████████████████████████████████| 782/782 [5:58:30<00:00, 27.51s/it]


Average Training Loss: 0.20139865181349276
Epoch 3/3


100%|██████████████████████████████████████████████████████████████████████████████| 782/782 [5:55:19<00:00, 27.26s/it]

Average Training Loss: 0.10194295275982593





In [26]:
# Output locations for the fine-tuned model and the tokenizer files
# (relative paths, so they resolve against the current working directory).
model_save_path = 'bert_model'
tokenizer_save_path = 'bert_tokenizer'

# Save the trained model
model.save_pretrained(model_save_path)

# Save the tokenizer object created earlier in the notebook, so it can be reloaded with the model
tokenizer.save_pretrained(tokenizer_save_path)

print(f'Model and tokenizer saved to {model_save_path} and {tokenizer_save_path}')

Model and tokenizer saved to bert_model and bert_tokenizer
