In [1]:
import re
import pandas as pd

def clean_text(text):
  # Lowercase text
  text = text.lower()
  # Remove special characters
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text)
  return text

def preprocess_data(data):
  # Load data (replace with your data loading logic)
  data = pd.read_csv("data.csv")
  
  # Clean text data in a specific column
  data["text_column"] = data["text_column"].apply(clean_text)
  
  # Handle missing values (e.g., impute or remove rows)
  data.dropna(subset=["text_column"], inplace=True)
  
  # Tokenize text (consider NLTK or spaCy based on your needs)
  # ... (implementation using chosen library)
  
  return data


In [4]:
import nltk
import PyPDF2

# Download necessary NLTK resources (if not already downloaded)
nltk.download('punkt')

def tokenize_pdf(pdf_path):
  """
  Attempts to extract text from a PDF and perform tokenization using NLTK.

  **Limitations:**
  * This approach relies on PyPDF2 which might not handle complex PDFs well.
  * Consider using dedicated PDF extraction libraries for better results.

  Args:
      pdf_path (str): Path to the PDF file.

  Returns:
      list: List of tokens (words) extracted from the PDF.
  """
  try:
    # Open PDF with PyPDF2
    with open(pdf_path, 'rb') as pdf_file:
      pdf_reader = PyPDF2.PdfReader(pdf_file)

      # Extract text (might be inaccurate for complex PDFs)
      text = ""
      for page in pdf_reader.pages:
        text += page.extract_text()

      # Tokenize text using NLTK
      tokens = nltk.word_tokenize(text)
      return tokens
  except Exception as e:
    print(f"Error processing PDF: {e}")
    return []

# Example usage (replace with your PDF path)
pdf_path = "/home/haxck/Desktop/AI_Development_Platform/data"
tokens = tokenize_pdf(pdf_path)

if tokens:
  print("Extracted tokens:")
  print(tokens)
else:
  print("Failed to extract tokens from PDF.")


Error processing PDF: [Errno 21] Is a directory: '/home/haxck/Desktop/AI_Development_Platform/data'
Failed to extract tokens from PDF.


[nltk_data] Downloading package punkt to /home/haxck/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from transformers import TFBertModel, BertTokenizer  # Assuming you choose Bert-based LLM (Mistral-7B or Llama2))

# Load pre-trained model and tokenizer (replace with specific model names)
model_name = "bert-base-uncased"  # Replace with chosen LLM identifier
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

# Define your data pre-processing logic (using TensorFlow Text or other libraries)
# ... (your data cleaning and tokenization code)

# Define fine-tuning hyperparameters (learning rate, batch size, epochs)
learning_rate = ...
batch_size = ...
epochs = ...

# Define fine-tuning model (potentially with additional layers on top of the pre-trained model)
input_ids = Input(shape=(max_length,), dtype=tf.int32)
embeddings = model(input_ids)[0]  # Extract token embeddings
# Add additional layers if needed (e.g., for specific task)
output = Dense(num_classes, activation="softmax")(embeddings)  # Assuming classification task
model = tf.keras.Model(inputs=input_ids, outputs=output)

# Compile the model for training
model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

# Train the model on your pre-processed data
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

# Save the fine-tuned model (weights and configuration)
# ... (your model saving logic)


In [1]:
import os
import tensorflow as tf
from transformers import TFBertTokenizer, TFBertForSequenceClassification
from datasets import load_dataset
from fastapi import FastAPI, UploadFile, File
from sklearn.model_selection import train_test_split

# Initialize FastAPI
app = FastAPI()

# Function to load Mistral-7B model and tokenizer
def load_mistral_7b():
  """Loads Mistral-7B tokenizer and model from Hugging Face."""
  model_name = "allenai/mistral-base"  # Replace with actual Mistral-7B identifier (if different)
  tokenizer = TFBertTokenizer.from_pretrained(model_name)
  model = TFBertForSequenceClassification.from_pretrained(model_name)
  return tokenizer, model

# Initialize tokenizer and model (using the defined function)
tokenizer, model = load_mistral_7b()

# Data Upload and Preprocessing
@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
  # Save the uploaded file
  with open(file.filename, "wb") as buffer:
    buffer.write(await file.read())

  # Load and preprocess the data (assuming CSV format)
  data = pd.read_csv(file.filename)
  data = clean_data(data)
  data_tokenized = tokenize_data(data, tokenizer)

  return {"message": "Data uploaded and preprocessed successfully"}

def clean_data(data):
  # Implement data cleaning logic here
  return data

def tokenize_data(data, tokenizer):
  # Implement tokenization logic using the tokenizer
  # ... (your data tokenization code)
  return data_tokenized

# Training and Fine-tuning
@app.post("/train/")
async def train_model():
  # Load preprocessed data (assuming stored after upload)
  data = pd.read_csv("preprocessed_data.csv")  # Replace with your data path
  labels = data["label_column"]  # Assuming a label column exists
  inputs = data.drop("label_column", axis=1)  # Assuming label column separation

  # Data splitting
  train_inputs, temp_data, train_labels, temp_labels = train_test_split(
      inputs, labels, test_size=0.3, random_state=42
  )
  val_inputs, test_inputs, val_labels, test_labels = train_test_split(
      temp_data, temp_labels, test_size=0.5, random_state=42
  )

  # Prepare datasets for training (TensorFlow format)
  train_encodings = tokenizer(
      train_inputs.to_list(), return_tensors="tf", padding="max_length", truncation=True
  )
  val_encodings = tokenizer(
      val_inputs.to_list(), return_tensors="tf", padding="max_length", truncation=True
  )
  train_dataset = tf.data.Dataset.from_tensor_slices((train_encodings["input_ids"], train_labels))
  val_dataset = tf.data.Dataset.from_tensor_slices((val_encodings["input_ids"], val_labels))

  # Define training parameters (can be adjusted)
  learning_rate = 2e-5
  batch_size = 8
  epochs = 3

  # Fine-tuning model (potentially with additional layers on top)
  # ... (model definition similar to previous examples using TensorFlow)

  # Compile the model
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=["accuracy"],
  )

  # Early stopping (optional)
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3)

  # Model checkpointing (optional)
  model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
      filepath="best_model.h5", save_best_only=True, monitor="val_loss"
  )

  # Train the model
  model.fit(
      train_dataset.batch(batch_size),
      epochs=epochs,
      validation_data=val_dataset.batch(batch_size),)


2024-04-13 05:45:25.670167: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'datasets'

In [4]:
from transformers import pipeline
print(pipeline('sentiment-analysis')('I love programming!'))


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998701810836792}]
