In [1]:
#Step 1: Data Extraction from User-Provided Files (PDF, Word, Excel, Powerpoint)

In [2]:
#pip install PyPDF2 python-docx openpyxl python-pptx in anaconda prompt

In [3]:
#!pip install python-pptx

In [4]:
#!pip install tokenizers


In [5]:
#!pip install torch

In [6]:
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118


In [7]:
import csv


In [8]:
#!pip install sentencepiece


In [9]:
#!pip install transformers


In [10]:
#!pip install tokenizers

In [11]:
import os
import PyPDF2
import docx
import openpyxl
from pptx import Presentation
import re


# 1. Function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    text = ''
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error reading PDF file {pdf_path}: {e}")
    return text

# 2. Function to extract text from Word files
def extract_text_from_word(docx_path):
    text = ''
    try:
        doc = docx.Document(docx_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print(f"Error reading Word file {docx_path}: {e}")
    return text

# 3. Function to extract text from Excel files
def extract_text_from_excel(excel_path):
    text = ''
    try:
        wb = openpyxl.load_workbook(excel_path)
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            for row in sheet.iter_rows(values_only=True):
                text += ' '.join([str(cell) for cell in row if cell is not None]) + "\n"
    except Exception as e:
        print(f"Error reading Excel file {excel_path}: {e}")
    return text

# 4. Function to extract text from PowerPoint files
def extract_text_from_ppt(ppt_path):
    text = ''
    try:
        prs = Presentation(ppt_path)
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
    except Exception as e:
        print(f"Error reading PowerPoint file {ppt_path}: {e}")
    return text

# Unified function to handle different file types
def extract_text_from_file(file_path):
    extension = os.path.splitext(file_path)[1].lower()
    
    if extension == '.pdf':
        return extract_text_from_pdf(file_path)
    elif extension == '.docx':
        return extract_text_from_word(file_path)
    elif extension == '.xlsx':
        return extract_text_from_excel(file_path)
    elif extension == '.pptx':
        return extract_text_from_ppt(file_path)
    else:
        raise ValueError(f"Unsupported file format: {extension}")

# Clean and preprocess the extracted text
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'\[.*?\]', '', text)  # Remove square brackets content
    text = re.sub(r'[^a-zA-Z0-9.,!?\'\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    return text.strip()

# Example usage: Handling multiple files
file_paths = [
    "C:/Users/grn61/OneDrive/Desktop/Applied Spectroscopy/Donald L. Pavia, Gary M. Lampman, George S. Kriz, James R. Vyvyan - Introduction to Spectroscopy-Cengage Learning (c2015).pdf", 
    ]

corpus = ''
for file_path in file_paths:
    corpus += extract_text_from_file(file_path)

cleaned_corpus = clean_text(corpus)

print("Extracted and Cleaned Text:", cleaned_corpus[:1000])  # Print the first 500 characters


Extracted and Cleaned Text: INTRODUCTION TO SPECTROSCOPY Donald L. Pavia Gary M. LampmanGeorge S. KrizJames R. Vyvyan Department of Chemistry Western Washington UniversityBellingham, WashingtonFIFTH EDITION Australia Brazil Mexico Singapore United Kingdom United States Copyright 2013 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. Copyright 2013 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. TOALL OF OUROS PEC STUDENTS Copyright 2013 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part. This is an electronic version of the print textbook. Due to electronic rights restrictions, some third party content may be suppressed. Editorial review has deemed that any suppressed content does not materially affect the overall learning experience. The publisher reserves the right to remove content from this title at any time if subsequent rig

In [12]:
# Step 2: Tokenization and transformer based language model

In [13]:
import os
import tempfile
import torch
import torch.nn as nn
import torch.optim as optim
import math
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
from tokenizers.normalizers import BertNormalizer

# Function to train and load a WordPiece tokenizer
def train_tokenizer_from_corpus(cleaned_corpus, save_path="./tokenizer"):
    """Train a WordPiece tokenizer on the cleaned corpus and save the model."""
    os.makedirs(save_path, exist_ok=True)

    # Save the cleaned corpus to a temporary file
    temp_file_path = os.path.join(save_path, "temp.txt")
    with open(temp_file_path, "w", encoding="utf-8") as f:
        f.write(cleaned_corpus)

    # Initialize the tokenizer
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = BertNormalizer()
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Create a trainer
    trainer = trainers.WordPieceTrainer(vocab_size=30000, min_frequency=2, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    
    # Train the tokenizer
    tokenizer.train([temp_file_path], trainer)

    # Save the tokenizer
    tokenizer.save(os.path.join(save_path, "tokenizer.json"))

    return tokenizer

# Function to encode sentences using the trained tokenizer
def encode_sentences(tokenizer, sentences):
    return [tokenizer.encode(sentence).ids for sentence in sentences]

# Dataset class for tokenized sequences with efficient padding
class TokenizedDataset(Dataset):
    def __init__(self, tokenized_sequences, max_seq_length):
        self.sequences = tokenized_sequences
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        seq = seq[:self.max_seq_length] + [0] * (self.max_seq_length - len(seq))  # Padding
        return torch.tensor(seq)

# Positional encoding for the transformer model
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

# Transformer Model definition
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src) * math.sqrt(self.embedding.weight.size(1))
        tgt = self.embedding(tgt) * math.sqrt(self.embedding.weight.size(1))
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        output = self.transformer(src, tgt)
        return self.fc_out(output)

# Training function
def train_model(model, tokenizer, cleaned_corpus, batch_size=32, epochs=10, max_seq_length=512, device='cuda'):
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Tokenizing the cleaned corpus
    tokenized_sequences = [tokenizer.encode(sentence).ids for sentence in cleaned_corpus.split('\n') if sentence]
    
    if not tokenized_sequences:
        print("No sentences to train on. Please check the cleaned corpus.")
        return

    dataset = TokenizedDataset(tokenized_sequences, max_seq_length)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            batch = batch.to(device)
            src = batch[:, :-1]
            tgt = batch[:, 1:]

            optimizer.zero_grad()
            output = model(src, tgt)

            loss = criterion(output.view(-1, output.size(-1)), tgt.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(data_loader):.4f}")

# Save the trained model
def save_model(model, filepath):
    torch.save(model.state_dict(), filepath)

# Load the trained model
def load_model(filepath, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length):
    model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length)
    model.load_state_dict(torch.load(filepath))
    model.eval()  # Set model to evaluation mode
    return model

# Example cleaned corpus string
#cleaned_corpus = """INTRODUCTION TO SPECTROSCOPY Donald L. Pavia Gary M. Lampman George S. Kriz James R. Vyvyan Department of Chemistry Western Washington University Bellingham, Washington FIFTH EDITION Australia Brazil Mexico Singapore United Kingdom United States Copyright 2013 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part."""

# Train the tokenizer
tokenizer = train_tokenizer_from_corpus(cleaned_corpus)  
if tokenizer is None:
    print("Tokenizer training failed.")
else:
    # Set model parameters
    vocab_size = tokenizer.get_vocab_size()
    d_model = 1024
    nhead = 16
    num_encoder_layers = 12
    num_decoder_layers = 12
    dim_feedforward = 4096
    max_seq_length = 512

    # Create and train the model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length)
    train_model(model, tokenizer, cleaned_corpus, batch_size=32, epochs=10, max_seq_length=max_seq_length, device=device)

    # Save the trained model
    save_model(model, "transformer_model.pth")

    # Load the model later if needed
    # model = load_model("transformer_model.pth", vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length)


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch [1/10], Loss: 9.5966
Epoch [2/10], Loss: 8.3256
Epoch [3/10], Loss: 7.9090
Epoch [4/10], Loss: 7.7104
Epoch [5/10], Loss: 7.4505
Epoch [6/10], Loss: 7.2340
Epoch [7/10], Loss: 7.0184
Epoch [8/10], Loss: 6.7732
Epoch [9/10], Loss: 6.5030
Epoch [10/10], Loss: 6.2421


In [14]:
import torch

# Function to find frequency and locations of user-defined words/phrases
def find_word_locations(cleaned_corpus, user_words):
    word_locations = {}
    for word in user_words:
        locations = []
        start = 0
        while True:
            start = cleaned_corpus.find(word, start)
            if start == -1:
                break
            locations.append(start)
            start += len(word)  # Move past the last found word
        word_locations[word] = {
            'frequency': len(locations),
            'locations': locations
        }
    return word_locations

# Function to predict the next word
def predict_next_word(model, tokenizer, input_sequence, device='cuda'):
    model.eval()
    # Tokenize the input sequence
    tokenized_input = tokenizer.encode(input_sequence).ids
    input_tensor = torch.tensor(tokenized_input).unsqueeze(0).to(device)

    # Prepare source and target for prediction
    src = input_tensor
    tgt = input_tensor  # Here, you can use the same tensor for simplicity

    with torch.no_grad():
        output = model(src, tgt)
    
    # Get the last predicted word
    last_word_logits = output[-1, -1, :]  # Last time step, last token
    predicted_word_index = last_word_logits.argmax().item()
    predicted_word = tokenizer.decode([predicted_word_index])

    return predicted_word

# Example cleaned corpus string (make sure it is set correctly)
# cleaned_corpus = "..." 

# Define your words/phrases to search
user_words = ["print", "title", "may"]

# Find frequency and locations
word_locations = find_word_locations(cleaned_corpus, user_words)
print("Word Locations and Frequencies:")
for word, info in word_locations.items():
    print(f"{word}: Frequency = {info['frequency']}, Locations = {info['locations']}")

# Example of predicting the next word
input_sequence = "mass spectroscopy"  # Define the sequence
predicted_word = predict_next_word(model, tokenizer, input_sequence, device)
print(f"Predicted next word: {predicted_word}")


Word Locations and Frequencies:
print: Frequency = 33, Locations = [661, 64406, 201600, 202178, 239572, 253029, 253691, 261473, 265511, 275482, 276724, 457338, 489791, 630878, 724966, 725096, 725176, 740082, 742239, 873447, 873855, 883791, 888419, 888898, 1033381, 1035243, 1055775, 1063158, 1072805, 1073875, 1074257, 1142996, 1157340]
title: Frequency = 8, Locations = [937, 1173, 8439, 8471, 8745, 10963, 10995, 1020518]
may: Frequency = 258, Locations = [741, 1452, 6143, 6540, 25964, 27300, 29582, 30179, 32057, 32079, 36625, 37668, 38008, 39691, 41091, 41141, 44153, 48846, 55903, 60117, 62019, 69237, 70684, 74332, 77883, 78238, 87966, 90552, 94095, 94569, 100953, 101230, 110282, 111337, 111603, 112282, 113099, 113150, 114556, 122594, 126757, 129699, 134997, 135720, 143237, 143486, 143859, 153530, 158433, 159942, 160098, 160378, 170413, 175753, 179212, 182327, 182894, 183312, 187574, 194085, 203788, 206610, 212990, 215016, 217772, 219494, 219664, 226641, 240018, 242451, 245785, 247329, 