In [None]:
# Step 1: Data extraction from user-provided files (PDF, Word, Excel, PowerPoint)

In [None]:
# Install the dependencies from the Anaconda prompt before running this notebook:
#   pip install PyPDF2 python-docx openpyxl python-pptx tokenizers torch

In [None]:
import os
import PyPDF2
import docx
import openpyxl
from pptx import Presentation
import re
from tokenizers import ByteLevelBPETokenizer

# 1. Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. If reading fails, the error is printed and whatever was
        extracted before the failure is returned (possibly an empty string).
    """
    page_chunks = []
    try:
        with open(pdf_path, 'rb') as file:
            for page in PyPDF2.PdfReader(file).pages:
                page_chunks.append(page.extract_text() + "\n")
    except Exception as e:
        # Best effort: report the problem and keep the pages read so far.
        print(f"Error reading PDF file {pdf_path}: {e}")
    return ''.join(page_chunks)

# 2. Function to extract text from Word files
def extract_text_from_word(docx_path):
    """Extract the paragraph text of a Word (.docx) document.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        Each paragraph's text followed by a newline, concatenated in document
        order. If reading fails, the error is printed and whatever was
        collected before the failure is returned (possibly an empty string).
    """
    paragraph_chunks = []
    try:
        document = docx.Document(docx_path)
        for paragraph in document.paragraphs:
            paragraph_chunks.append(paragraph.text + "\n")
    except Exception as e:
        # Best effort: report the problem and keep the paragraphs read so far.
        print(f"Error reading Word file {docx_path}: {e}")
    return ''.join(paragraph_chunks)

# 3. Function to extract text from Excel files
def extract_text_from_excel(excel_path):
    """Extract cell contents from every sheet of an Excel (.xlsx) workbook.

    Args:
        excel_path: Path of the workbook to read.

    Returns:
        One line per row, across all sheets in workbook order. Each line is
        the row's non-empty cells converted with ``str`` and joined by single
        spaces; a row with no values contributes an empty line. If reading
        fails, the error is printed and the rows collected so far are
        returned (possibly an empty string).
    """
    row_lines = []
    try:
        workbook = openpyxl.load_workbook(excel_path)
        for sheet_name in workbook.sheetnames:
            for row in workbook[sheet_name].iter_rows(values_only=True):
                populated = (str(cell) for cell in row if cell is not None)
                row_lines.append(' '.join(populated) + "\n")
    except Exception as e:
        # Best effort: report the problem and keep the rows read so far.
        print(f"Error reading Excel file {excel_path}: {e}")
    return ''.join(row_lines)

# 4. Function to extract text from PowerPoint files
def extract_text_from_ppt(ppt_path):
    """Extract the text of every shape on every slide of a .pptx deck.

    Args:
        ppt_path: Path of the presentation to read.

    Returns:
        The ``text`` of each shape that exposes such an attribute, each
        followed by a newline, in slide and shape order. If reading fails,
        the error is printed and the text collected so far is returned
        (possibly an empty string).
    """
    shape_chunks = []
    try:
        presentation = Presentation(ppt_path)
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Shapes without a `text` attribute are skipped.
                if not hasattr(shape, "text"):
                    continue
                shape_chunks.append(shape.text + "\n")
    except Exception as e:
        # Best effort: report the problem and keep the shapes read so far.
        print(f"Error reading PowerPoint file {ppt_path}: {e}")
    return ''.join(shape_chunks)

# Unified function to handle different file types
def extract_text_from_file(file_path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.pdf``, ``.docx``, ``.xlsx`` and ``.pptx``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text returned by the matching ``extract_text_from_*`` function.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    extension = os.path.splitext(file_path)[1].lower()

    # Lambdas keep the lookup of each extractor lazy: only the selected one
    # is resolved and called.
    handlers = {
        '.pdf': lambda: extract_text_from_pdf(file_path),
        '.docx': lambda: extract_text_from_word(file_path),
        '.xlsx': lambda: extract_text_from_excel(file_path),
        '.pptx': lambda: extract_text_from_ppt(file_path),
    }
    handler = handlers.get(extension)
    if handler is None:
        raise ValueError(f"Unsupported file format: {extension}")
    return handler()

# Clean and preprocess the extracted text
def clean_text(text):
    """Normalise extracted text before tokenizer training.

    The substitutions are applied in this order:
      1. drop ``<...>`` spans (HTML-style tags),
      2. drop ``[...]`` spans including the brackets,
      3. drop every character that is not an ASCII letter, a digit, one of
         ``. , ! ? '`` or whitespace,
      4. collapse each run of whitespace into a single space.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'<.*?>', ''),                      # HTML tags
        (r'\[.*?\]', ''),                    # square-bracketed content
        (r'[^a-zA-Z0-9.,!?\'\s]', ''),       # special characters
        (r'\s+', ' '),                       # runs of whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Example usage: Handling multiple files
# NOTE(review): these look like placeholder paths — replace them with real files.
file_paths = [
    "path_to_file1.pdf",
    "path_to_file2.docx",
    "path_to_file3.xlsx",
    "path_to_file4.pptx",
]

# Extract each file in list order and concatenate the results into one corpus.
extracted_parts = []
for file_path in file_paths:
    extracted_parts.append(extract_text_from_file(file_path))
corpus = ''.join(extracted_parts)

cleaned_corpus = clean_text(corpus)

# Show only the first 500 characters as a preview.
preview = cleaned_corpus[:500]
print("Extracted and Cleaned Text:", preview)


In [None]:
# Step 2: Tokenization

In [None]:
# Train the tokenizer on the cleaned corpus
def train_tokenizer_from_corpus(corpus, vocab_size=30000, min_frequency=2, save_path="./tokenizer", corpus_path="corpus.txt"):
    """Train a byte-level BPE tokenizer on ``corpus`` and save it to disk.

    Args:
        corpus: The training text.
        vocab_size: Target vocabulary size passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
        save_path: Directory the trained model files are written to. It is
            created if it does not exist yet.
        corpus_path: File the corpus is written to so the trainer can read
            it. The file is left on disk after training.

    Returns:
        The trained ``ByteLevelBPETokenizer``.
    """
    # The trainer reads from files, so dump the corpus to disk first.
    with open(corpus_path, "w", encoding="utf-8") as f:
        f.write(corpus)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_path], vocab_size=vocab_size, min_frequency=min_frequency)

    # save_model writes into an existing directory and fails when it is
    # missing, so make sure the target directory is there.
    os.makedirs(save_path, exist_ok=True)
    tokenizer.save_model(save_path)
    return tokenizer

# Fit the tokenizer on the cleaned corpus (this also writes the model files to disk)
tokenizer = train_tokenizer_from_corpus(corpus=cleaned_corpus)

# Sanity check: encode a sample sentence and show the resulting tokens
sample_text = "Extracted data from multiple files."
encoded = tokenizer.encode(sample_text)
print(encoded.tokens)


In [None]:
# Step 3: Transformer-based language model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Positional encoding for the transformer model
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold ``sin(position * div_term)`` and odd columns
    hold ``cos(position * div_term)``. The table is stored as a non-trainable
    buffer ``pe`` of shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the embedding dimension (odd values are supported).
        max_len: Number of positions to precompute; inputs to ``forward``
            must not be longer than this along dimension 0.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (max_len, 1, d_model): the singleton axis broadcasts over dim 1 of the input.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Positions are taken along dimension 0 of ``x``; the last dimension of
        ``x`` must be ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return x

# Transformer Model definition
class TransformerModel(nn.Module):
    """Encoder-decoder transformer over token ids with a vocabulary-sized output.

    Source and target tokens share one embedding table. ``nn.Transformer`` is
    built without ``batch_first``, and the positional encoder indexes
    positions along dimension 0, so inputs are sequence-first:
    ``src`` is ``(src_len, batch)`` and ``tgt`` is ``(tgt_len, batch)``.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Embedding / model dimension.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        max_seq_length: Number of positions the positional encoder precomputes.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, tgt_mask=None):
        """Compute vocabulary logits for every target position.

        Args:
            src: Source token ids, ``(src_len, batch)``.
            tgt: Target-side input token ids, ``(tgt_len, batch)``.
            tgt_mask: Optional ``(tgt_len, tgt_len)`` attention mask for the
                decoder. When omitted, a causal mask is used so that a
                position cannot attend to later target positions.

        Returns:
            Logits of shape ``(tgt_len, batch, vocab_size)``.
        """
        # Scale embeddings by sqrt(d_model) before adding positional encodings.
        src = self.pos_encoder(self.embedding(src) * math.sqrt(self.d_model))
        tgt = self.pos_encoder(self.embedding(tgt) * math.sqrt(self.d_model))

        if tgt_mask is None:
            # Without a mask the decoder sees the whole target sequence,
            # including the tokens it is meant to predict.
            steps = tgt.size(0)
            tgt_mask = torch.triu(
                torch.full((steps, steps), float('-inf'), device=tgt.device, dtype=tgt.dtype),
                diagonal=1,
            )

        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        return self.fc_out(output)

# Training setup
def train_model(model, tokenizer, corpus, batch_size=32, epochs=10, max_seq_length=512):
    """Train ``model`` to continue text: encode a prefix, predict what follows.

    The corpus is tokenized once and cut into non-overlapping windows of at
    most ``2 * max_seq_length + 1`` tokens (trailing tokens that do not fill
    a window are dropped). Each window is split into

    * ``src``    – the first part, fed to the encoder,
    * decoder input – the remainder without its last token,
    * labels     – the remainder without its first token (shifted by one),

    so no tensor handed to the model is longer than ``max_seq_length`` and
    the encoder never sees the tokens being predicted.

    Args:
        model: Module called as ``model(src, tgt)`` with sequence-first
            ``(seq_len, batch)`` token-id tensors and returning logits of
            shape ``(tgt_len, batch, vocab_size)``.
        tokenizer: Object whose ``encode(text).ids`` is a list of token ids.
        corpus: Training text.
        batch_size: Number of windows per batch.
        epochs: Number of passes over the windows.
        max_seq_length: Maximum length of the encoder and decoder inputs.

    Raises:
        ValueError: If the corpus yields fewer than three tokens, which is
            too short to form a prefix plus one input/label pair.
    """
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss()

    token_ids = tokenizer.encode(corpus).ids

    # Shrink the window for short corpora so at least one example exists.
    window = min(2 * max_seq_length + 1, len(token_ids))
    if window < 3:
        raise ValueError(
            f"Corpus is too short to train on: got {len(token_ids)} tokens, need at least 3"
        )
    src_len = (window - 1) // 2
    num_windows = len(token_ids) // window

    windows = torch.tensor(token_ids[:num_windows * window], dtype=torch.long).view(num_windows, window)
    data_loader = torch.utils.data.DataLoader(windows, batch_size=batch_size, shuffle=True)

    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            # The loader yields (batch, window); the model is sequence-first.
            batch = batch.to(device).t()

            src = batch[:src_len]                 # Encoder input (prefix)
            decoder_input = batch[src_len:-1]     # Continuation without its last token
            labels = batch[src_len + 1:]          # Continuation shifted by 1

            # NOTE(review): no attention mask is passed here; the model is
            # expected to stop the decoder from attending to later positions.
            optimizer.zero_grad()
            output = model(src, decoder_input)
            # reshape (not view): the slices above are not contiguous.
            loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(data_loader)}")

# Example usage of model training
# Hyperparameters: the vocabulary size comes from the trained tokenizer.
vocab_size = tokenizer.get_vocab_size()
d_model, nhead = 512, 8
num_encoder_layers = num_decoder_layers = 6
dim_feedforward = 2048
max_seq_length = 512

model = TransformerModel(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    max_seq_length=max_seq_length,
)
train_model(model=model, tokenizer=tokenizer, corpus=cleaned_corpus)
