# Imports

In [1]:
import os
import jsonlines
from uuid import uuid4
import pandas as pd

from datasets import load_dataset
from autotrain.dataset import AutoTrainDataset
from autotrain.project import AutoTrainProject

from tqdm.notebook import tqdm

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

True

## Datasets

In [2]:
dataset_name = 'ai-aerospace/ams_data_train_Llama-2-7B-Chat-GGUF-LLM-generic_100'
dataset=load_dataset(dataset_name)

In [3]:
dataset['train'][0]['text']

"### Human: What is the title of the symposium hosted by NASA Langley Research Center and Lockheed Martin Space Systems Company, as mentioned in the context?### Assistant:NASNCP-2006-2 14290  3 sth Aerospace Mechanisms Symposium  Compiled by  Edward A. Boesiger  Lockheed Martin Space Systems Company, Sunnyvale, California  Proceedings of a symposium hosted by  the NASA Langley Research Center and  Lockheed Martin Space Systems Company and  organized by the Mechanisms Education Association  held at the Williamsburg Maniott Hotel  Williamsburg, Virginia  May 17- 19,2006  May 2006 {'source': 'AMS_2006.pdf', 'page': 1}"

In [4]:
dataset['validation'][0]['text']

"### Human: What is the aerospace mechanisms symposia?### Assistant: An annual meeting of space mechanism experts. {'source': 'DM', 'page': 0}"

# Autotrain
https://github.com/huggingface/autotrain-advanced

In [6]:
username='ai-aerospace'
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
repo_name='ams_data_train-100_'+str(uuid4())

model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name='mistralai/Mistral-7B-v0.1'

## Using cli (more well documented)

I've taken this dataset and renamed into train.csv in the training folder. I can't figure out how to use autotrain with a different filename when using cli.

In [None]:
os.environ["project_name"] = project_name
os.environ["model_name"] = model_name
os.environ["repo_id"] = username+'/'+repo_name

In [None]:
!source ../.venv/bin/activate
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft

# The training dataset to be used must be called training.csv and be located in the data_path folder.
!autotrain llm --train \
    --project_name ${project_name} \
    --model ${model_name} \
    --data_path . \
    --use-peft \
    --learning_rate 2e-4 \
    --train_batch_size 6 \
    --num_train_epochs 3 \
    --trainer sft \
    --push_to_hub \
    --repo_id ${repo_id} \
    --token $HUGGINGFACE_TOKEN

## Now using python!

In [9]:
# Train
dataset_temp = dataset.copy()
train = dataset_temp['train']

# FILEPATH: /Users/danmueller/Documents/GitHub/aerospace_chatbot/training/train_llm_autotrain.ipynb
train_df = train.to_pandas()

# Validation. If empty, just creates an empty dataset. Needed to run autotrain.
try:
    validation = dataset_temp['validation']
    validation_df = validation.to_pandas()
except:
    validation_df = pd.DataFrame()
    validation_df['text'] = ''
    # Add validation item to the dataset
    dataset_temp['validation'] = validation_df


In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 101
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [11]:
# # prepare dataset for AutoTrain
# dset = AutoTrainDataset(
#     train_data=[train_df],
#     valid_data=[validation_df],
#     task=TASK,
#     token=HUGGINGFACE_TOKEN,
#     project_name=project_name,
#     username=USERNAME,
#     column_mapping={"text": "text", "label": "label"},
#     percent_valid=None,
# )

# prepare dataset for AutoTrain
task = "lm_training"
dset = AutoTrainDataset(
    train_data=[train_df],
    valid_data=[validation_df],
    task=task,
    username=username,
    project_name=repo_name,
    token=os.environ['HUGGINGFACE_TOKEN'],
    percent_valid=None,
    column_mapping={"text": "text"}
)

> [1mINFO    Dataset: ams_data_train-100_db50c43f-2753-4dfa-9c0b-dc151dc83a52 (lm_training)
Train data: [                                                  text
0    ### Human: What is the title of the symposium ...
1    ### Human: What is the purpose of the NASA Sci...
2    ### Human: What is the name of the symposium h...
3    ### Human: What is the contact information for...
4    ### Human: What is the name of the organizatio...
..                                                 ...
96   ### Human: What is the cause of the failure me...
97   ### Human: What was identified as a root cause...
98   ### Human: What was the primary challenge of a...
99   ### Human: What is the cause of the failure en...
100  ### Human: What is the aerospace mechanisms sy...

[101 rows x 1 columns]]
Valid data: [                                                text
0  ### Human: What is the aerospace mechanisms sy...]
Column mapping: {'text': 'text'}
[0m


[                                                text
0  ### Human: What is the aerospace mechanisms sy...]


In [12]:
dset.prepare()

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/504 [00:00<?, ?B/s]

In [30]:
from autotrain.params import Params
import pprint
params = Params(task=task, param_choice='manual',model_choice='hub_model').get()
pprint.pprint(params) # to get full list of params for the task

{'gradient_accumulation_steps': <class 'autotrain.params.GradientAccumulationSteps'>,
 'hub_model': <class 'autotrain.params.HubModel'>,
 'learning_rate': <class 'autotrain.params.LMLearningRate'>,
 'lora_alpha': <class 'autotrain.params.LoraAlpha'>,
 'lora_dropout': <class 'autotrain.params.LoraDropout'>,
 'lora_r': <class 'autotrain.params.LoraR'>,
 'num_train_epochs': <class 'autotrain.params.LMEpochs'>,
 'optimizer': <class 'autotrain.params.Optimizer'>,
 'percentage_warmup': <class 'autotrain.params.PercentageWarmup'>,
 'scheduler': <class 'autotrain.params.Scheduler'>,
 'train_batch_size': <class 'autotrain.params.LMTrainBatchSize'>,
 'training_type': <class 'autotrain.params.LMTrainingType'>,
 'weight_decay': <class 'autotrain.params.WeightDecay'>}


In [33]:
#
# How to get params for a task:
#
# from autotrain.params import Params
# params = Params(task=TASK, training_type="hub_model").get()
# print(params) to get full list of params for the task

# define params in proper format
job1 = {
    "hub_model": model_name,
    "model_choice":"",
    "task": task,
    "learning_rate": 1e-5,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "epochs": 5,
    "backend": 'CPU (Free)'
}

job2 = {
    "hub_model": model_name,
    "model_choice":"",
    "task": task,
    "learning_rate": 3e-5,
    "optimizer": "adamw_torch",
    "scheduler": "cosine",
    "epochs": 5,
    "backend": 'CPU (Free)'
}

job3 = {
    "hub_model": model_name,
    "model_choice":"",
    "task": task,
    "learning_rate": 5e-5,
    "optimizer": "sgd",
    "scheduler": "cosine",
    "epochs": 5,
    "backend": 'CPU (Free)'
}

jobs = pd.DataFrame([job1, job2, job3])

In [34]:
project = AutoTrainProject(dataset=dset, job_params=jobs)
project_id = project.create()
project.approve(project_id)

> [1mINFO    [{"hub_model":"mistralai\/Mistral-7B-v0.1","model_choice":"","task":"lm_training","learning_rate":0.00001,"optimizer":"adamw_torch","scheduler":"linear","epochs":5,"backend":"CPU (Free)"},{"hub_model":"mistralai\/Mistral-7B-v0.1","model_choice":"","task":"lm_training","learning_rate":0.00003,"optimizer":"adamw_torch","scheduler":"cosine","epochs":5,"backend":"CPU (Free)"},{"hub_model":"mistralai\/Mistral-7B-v0.1","model_choice":"","task":"lm_training","learning_rate":0.00005,"optimizer":"sgd","scheduler":"cosine","epochs":5,"backend":"CPU (Free)"}][0m
> [1mINFO    Creating Space for job: 0[0m
> [1mINFO    Using params: {'model': '', 'data_path': 'ai-aerospace/autotrain-data-ams_data_train-100_db50c43f-2753-4dfa-9c0b-dc151dc83a52', 'project_name': 'ams_data_train-100_db50c43f-2753-4dfa-9c0b-dc151dc83a52-0', 'train_split': 'train', 'valid_split': None, 'text_column': 'autotrain_text', 'rejected_text_column': 'autotrain_rejected_text', 'token': '*****', 'lr': 3e-05, 'epo

AttributeError: 'AutoTrainProject' object has no attribute 'approve'