In [1]:
!pip install transformers datasets torch mlflow flask streamlit apache-airflow pandas docker

Collecting mlflow
  Downloading mlflow-2.16.2-py3-none-any.whl (26.7 MB)
     ---------------------------------------- 0.0/26.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/26.7 MB 3.6 MB/s eta 0:00:08
      --------------------------------------- 0.4/26.7 MB 4.6 MB/s eta 0:00:06
      --------------------------------------- 0.5/26.7 MB 3.8 MB/s eta 0:00:07
     - -------------------------------------- 0.7/26.7 MB 4.0 MB/s eta 0:00:07
     - -------------------------------------- 0.8/26.7 MB 3.6 MB/s eta 0:00:08
     - -------------------------------------- 0.9/26.7 MB 3.3 MB/s eta 0:00:08
     - -------------------------------------- 1.0/26.7 MB 3.2 MB/s eta 0:00:08
     - -------------------------------------- 1.0/26.7 MB 2.7 MB/s eta 0:00:10
     - -------------------------------------- 1.1/26.7 MB 2.6 MB/s eta 0:00:10
     - -------------------------------------- 1.2/26.7 MB 2.7 MB/s eta 0:00:10
     -- ------------------------------------- 1.4/26.7 MB 2.8 MB/

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.41 requires requests_mock, which is not installed.
tensorflow 2.10.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.25.5 which is incompatible.
tensorflow-intel 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.10.0 which is incompatible.
tensorflow-intel 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.3 which is incompatible.
tensorflow-intel 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.10.1 which is incompatible.
tensorflow-intel 2.12.0 requires tensorflow-estimator<2.13,>=2.12.0, but you have tensorflow-estimator 2.10.0 which is incompatible.
tensorboard 2.10.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.25.5 which is incompatible.
s3fs 2023.4.0 requires fsspec==2023.4.0, but you have fsspec 2024.6.1 which is incompatible.
python-

In [2]:
%%writefile finetune.py

import mlflow
import torch
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

# Load dataset and model
dataset = load_dataset('yelp_review_full')
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch, padding/truncating to max length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)


def compute_metrics(eval_pred):
    """Return classification accuracy for a (logits, labels) evaluation pair.

    Without a ``compute_metrics`` callback the Trainer only reports the loss,
    so ``trainer.evaluate()`` would contain no ``eval_accuracy`` entry.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Fine-tuning settings
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# MLflow tracking
mlflow.set_experiment("LLM_Finetuning_Experiment")

# Trainer and training function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

with mlflow.start_run():
    trainer.train()
    # Trainer prefixes the keys returned by compute_metrics with "eval_".
    eval_metrics = trainer.evaluate()
    mlflow.log_metric("eval_accuracy", eval_metrics["eval_accuracy"])

# Write the final model and tokenizer to the root of ./results, which is the
# path the serving app passes to `pipeline(..., model="./results")`.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")


Writing finetune.py


In [3]:
%%writefile airflow_dag.py

from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, load_from_disk
import mlflow

# Default args for Airflow
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2024, 1, 1),
    'retries': 1
}

# catchup=False: with a start_date in the past and a daily schedule, Airflow
# would otherwise schedule one fine-tuning run for every day since start_date.
dag = DAG(
    'text_processing_pipeline',
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
)


# Tokenize function
def tokenize_data():
    """Download the Yelp dataset, tokenize it and save it to /tmp/tokenized_data."""
    dataset = load_dataset('yelp_review_full')
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk('/tmp/tokenized_data')


def _compute_accuracy(eval_pred):
    """Return accuracy for a (logits, labels) pair produced by the Trainer."""
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Fine-tune LLM
def fine_tune_model():
    """Fine-tune DistilBERT on the tokenized data and log accuracy to MLflow.

    Reads the dataset written by ``tokenize_data`` from /tmp/tokenized_data.
    """
    dataset = load_from_disk('/tmp/tokenized_data')
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    mlflow.set_experiment("LLM_Finetuning_Airflow")
    with mlflow.start_run():
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['test'],
            # Needed so that evaluate() reports "eval_accuracy" at all.
            compute_metrics=_compute_accuracy,
        )
        trainer.train()
        mlflow.log_param("epochs", 3)
        mlflow.log_metric("eval_accuracy", trainer.evaluate()["eval_accuracy"])


# Define Airflow tasks
tokenize_task = PythonOperator(
    task_id='tokenize_task',
    python_callable=tokenize_data,
    dag=dag
)

fine_tune_task = PythonOperator(
    task_id='fine_tune_task',
    python_callable=fine_tune_model,
    dag=dag
)

# Set task dependencies
tokenize_task >> fine_tune_task

Writing airflow_dag.py


In [6]:
%%writefile app.py

from flask import Flask, jsonify, request
from transformers import pipeline
import mlflow

app = Flask(__name__)

# Load the fine-tuned model once at import time so every request reuses it
model = pipeline("text-classification", model="./results")


@app.route("/predict", methods=["POST"])
def predict():
    """Classify the ``text`` field of the posted JSON body.

    Returns the pipeline output as JSON, or a 400 response when the body
    does not contain a ``text`` field.
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict) or "text" not in data:
        return jsonify({"error": "JSON body must contain a 'text' field"}), 400
    prediction = model(data["text"])
    return jsonify(prediction)


@app.route("/metrics", methods=["GET"])
def get_metrics():
    """Return run id and eval accuracy for each run of the MLflow experiment.

    Returns an empty list when the experiment does not exist yet; runs that
    never logged ``eval_accuracy`` are skipped.
    """
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name("LLM_Finetuning_Experiment")
    if experiment is None:
        return jsonify([])
    runs = client.search_runs([experiment.experiment_id])
    metrics = [
        {"run_id": run.info.run_id, "accuracy": run.data.metrics['eval_accuracy']}
        for run in runs
        if 'eval_accuracy' in run.data.metrics
    ]
    return jsonify(metrics)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)

Overwriting app.py


In [10]:
%%writefile Dockerfile

# Base image (already ships Python 3.9 together with pip)
FROM python:3.9

# Airflow keeps its configuration, metadata DB and DAG folder under AIRFLOW_HOME
ENV AIRFLOW_HOME=/usr/local/airflow

# Install dependencies with the image's own pip.
# The apt python3-pip package is not needed on this base image, and
# --no-cache-dir keeps the pip download cache out of the layer.
RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir apache-airflow \
                   mlflow \
                   flask \
                   transformers \
                   datasets \
                   torch

# Copy the necessary files
COPY finetune.py /usr/local/app/finetune.py
COPY airflow_dag.py /usr/local/airflow/dags/airflow_dag.py
COPY app.py /usr/local/app/app.py

# Set working directory
WORKDIR /usr/local/app

# Expose ports for Flask (5001) and the Airflow webserver (8081)
EXPOSE 5001 8081

# Initialise the Airflow metadata database at build time
RUN airflow db init

# Command to start Flask (background) and the Airflow webserver (foreground).
# NOTE(review): no `airflow scheduler` is started, so the copied DAG will not
# be executed by this container as written.
# NOTE(review): app.py loads ./results at import time, but nothing in this
# image creates that directory - confirm it is mounted or built beforehand.
CMD flask run --host=0.0.0.0 --port=5001 & \
    airflow webserver --port 8081

Overwriting Dockerfile


In [8]:
pip install apache-airflow==2.10.2




In [15]:
%%writefile Dockerfile
# Base image (already ships Python 3.9 together with pip)
FROM python:3.9

# Set working directory
WORKDIR /app

# Install dependencies with the image's own pip; the apt python3-pip package
# is not needed on this base image. scikit-learn is required by the
# 'accuracy' metric that finetune.py loads.
RUN pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir transformers torch mlflow accelerate flask datasets scikit-learn

# Copy the fine-tuning and API scripts
COPY finetune.py /app/finetune.py
COPY app.py /app/app.py

# Expose Flask port (app.py binds to 5001)
EXPOSE 5001

# Run the fine-tuning script first, then start Flask
CMD python3 finetune.py && python3 app.py

Overwriting Dockerfile


In [21]:
%%writefile Dockerfile

# Start from the official PyTorch runtime image (PyTorch 1.9.0, CUDA 10.2, cuDNN 7)
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime

# Every path below is resolved relative to /app
WORKDIR /app

# Upgrade pip, then add the libraries the scripts need on top of PyTorch
RUN pip install --upgrade pip && \
    pip install \
        transformers[torch] \
        accelerate \
        mlflow \
        flask \
        datasets

# scikit-learn is installed in a layer of its own
RUN pip install scikit-learn

# Bring in the training script and the Flask API
COPY finetune.py /app/finetune.py
COPY app.py /app/app.py

# Port the Flask API is published on
EXPOSE 5001

# Fine-tune first; the API starts only if fine-tuning exits successfully
CMD python3 finetune.py && python3 app.py

Overwriting Dockerfile


In [20]:
%%writefile finetune.py

import mlflow
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

NUM_SAMPLES = 100   # number of training reviews pulled from the dataset
EVAL_FRACTION = 0.2  # share of those samples held out for evaluation
SEED = 42            # makes the train/eval split reproducible

# Load dataset and limit to NUM_SAMPLES samples
dataset = load_dataset('yelp_review_full', split=f'train[:{NUM_SAMPLES}]')

# Load pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch, padding/truncating to max length."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out part of the data so the reported accuracy is not measured on the
# very rows the model was trained on.
split_dataset = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SEED)


# Compute accuracy metric
def compute_metrics(eval_pred):
    """Return classification accuracy for a (logits, labels) evaluation pair.

    Computed directly on the arrays, so neither the removed
    ``datasets.load_metric`` API nor scikit-learn is needed.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Limiting to 1 epoch
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    compute_metrics=compute_metrics  # Pass the accuracy computation function
)

# Start fine-tuning and log metrics to MLflow
mlflow.set_experiment("fine-tuning-small")
with mlflow.start_run():
    trainer.train()
    metrics = trainer.evaluate()
    # Trainer prefixes compute_metrics keys with "eval_"; the value is still
    # logged under "accuracy", which is the name the Flask app reads back.
    mlflow.log_metrics({"accuracy": metrics['eval_accuracy']})
    mlflow.log_param("num_samples", NUM_SAMPLES)

# Save the fine-tuned model together with its tokenizer; the serving pipeline
# loads both from ./results.
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

Overwriting finetune.py


In [26]:
!pip install protobuf==3.20.





In [19]:
%%writefile app.py

from flask import Flask, jsonify, request
from transformers import pipeline
import mlflow

app = Flask(__name__)

# Load the fine-tuned model once at import time so every request reuses it
model = pipeline("text-classification", model="./results")


@app.route("/predict", methods=["POST"])
def predict():
    """Classify the ``text`` field of the posted JSON body.

    Returns the pipeline output as JSON, or a 400 response when the body
    does not contain a ``text`` field.
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({"error": "JSON body must contain a 'text' field"}), 400
    prediction = model(data['text'])
    return jsonify(prediction)


@app.route("/metrics", methods=["GET"])
def get_metrics():
    """Return run id and accuracy for each run of the MLflow experiment.

    Returns an empty list when the experiment does not exist yet; runs that
    never logged ``accuracy`` (e.g. interrupted runs) are skipped.
    """
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name("fine-tuning-small")
    if experiment is None:
        return jsonify([])
    runs = client.search_runs([experiment.experiment_id])
    metrics = [
        {"run_id": run.info.run_id, "accuracy": run.data.metrics['accuracy']}
        for run in runs
        if 'accuracy' in run.data.metrics
    ]
    return jsonify(metrics)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)

Overwriting app.py


In [4]:
!python3 -m pip install pip --upgrade
!pip install pyopenssl --upgrade

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


Collecting pyopenssl
  Downloading pyOpenSSL-24.2.1-py3-none-any.whl (58 kB)
     ---------------------------------------- 0.0/58.4 kB ? eta -:--:--
     ---------------------------------------- 58.4/58.4 kB 3.0 MB/s eta 0:00:00
Installing collected packages: pyopenssl
  Attempting uninstall: pyopenssl
    Found existing installation: pyOpenSSL 23.0.0
    Uninstalling pyOpenSSL-23.0.0:
      Successfully uninstalled pyOpenSSL-23.0.0
Successfully installed pyopenssl-24.2.1




In [5]:
pip install -U accelerate

Collecting accelerate
  Using cached accelerate-1.0.0-py3-none-any.whl (330 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.0.0
Note: you may need to restart the kernel to use updated packages.




In [5]:
pip install -U datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
     ---------------------------------------- 0.0/471.6 kB ? eta -:--:--
      --------------------------------------- 10.2/471.6 kB ? eta -:--:--
     --- --------------------------------- 41.0/471.6 kB 330.3 kB/s eta 0:00:02
     -------------- ----------------------- 174.1/471.6 kB 1.3 MB/s eta 0:00:01
     ------------------ ------------------- 225.3/471.6 kB 1.5 MB/s eta 0:00:01
     ----------------------- -------------- 286.7/471.6 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 471.6/471.6 kB 1.7 MB/s eta 0:00:00
Collecting pyarrow>=15.0.0
  Downloading pyarrow-17.0.0-cp39-cp39-win_amd64.whl (25.1 MB)
     ---------------------------------------- 0.0/25.1 MB ? eta -:--:--
     ---------------------------------------- 0.2/25.1 MB 6.3 MB/s eta 0:00:04
     ---------------------------------------- 0.2/25.1 MB 6.9 MB/s eta 0:00:04
     ---------------------------------------- 0.2/25.1 MB 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 4.37.2 requires urllib3~=2.0, but you have urllib3 1.26.20 which is incompatible.


In [2]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

In [5]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
     ---------------------------------------- 0.0/84.0 kB ? eta -:--:--
     ---------------------------------------- 84.0/84.0 kB 2.4 MB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3




In [3]:
import mlflow

from datasets import load_dataset
from evaluate import load as load_metric  # metric loader taken from the `evaluate` package


# Checkpoint shared by the tokenizer and the 5-label classifier
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Only the first 100 training reviews are used
dataset = load_dataset('yelp_review_full', split='train[:100]')


def tokenize_function(examples):
    """Tokenize a batch of reviews, padded and truncated to the maximum length."""
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Accuracy scorer used by the Trainer callback below
accuracy_metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Score predicted classes (arg-max over the logits) against the labels."""
    logits, labels = eval_pred
    return accuracy_metric.compute(
        predictions=logits.argmax(axis=-1),
        references=labels,
    )


# Hyper-parameters: a single epoch, evaluated at the end of the epoch
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir='./logs',
    evaluation_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.we

In [6]:
# Display the tokenized dataset (bare last expression shows its repr)
tokenized_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [4]:


# Build the Trainer. The same tokenized dataset is passed as both the
# training set and the evaluation set.
trainer = Trainer(
    compute_metrics=compute_metrics,  # accuracy callback
    eval_dataset=tokenized_dataset,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# NOTE(review): this cell only constructs the Trainer; it does not start training.


2024/10/09 11:20:26 INFO mlflow.tracking.fluent: Experiment with name 'fine-tuning-small' does not exist. Creating a new experiment.


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [7]:
mlflow.set_experiment("fine-tuning-small")
with mlflow.start_run():
    # Train before evaluating; otherwise the model saved below as
    # "fine-tuned" would still be the untrained checkpoint.
    trainer.train()
    metrics = trainer.evaluate()
    # Trainer prefixes compute_metrics keys with "eval_", so the accuracy is
    # found under "eval_accuracy"; it is logged to MLflow as "accuracy".
    mlflow.log_metrics({"accuracy": metrics['eval_accuracy']})
    mlflow.log_param("num_samples", 100)

# Save the fine-tuned model together with its tokenizer so that
# pipeline("text-classification", model="./results") can load both.
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

KeyboardInterrupt: 

In [8]:
# Run evaluation with the Trainer and keep the returned metrics
metrics = trainer.evaluate()

AttributeError: 'NotebookTrainingTracker' object has no attribute 'value'

In [None]:
# Display the metrics returned by the last evaluation
metrics

In [None]:
pip install requests