In [2]:
from ollama import chat
import time
import pandas as pd

def generate_sentences(label, output_file, number, display=False): 
    """Stream sentences about booking `label` from the llama3.2 chat model.

    Parameters
    ----------
    label : str
        Thing being booked (e.g. 'hotel'); interpolated into the prompt.
    output_file : str
        Path of the text file the streamed response is appended to.
        Not opened or created when `display` is True.
    number : int
        Number of sentences requested in the prompt.
    display : bool, default False
        If True, echo the streamed text to stdout instead of writing it to
        `output_file`.

    Notes
    -----
    The file is opened in append mode, so repeated calls accumulate output.
    A trailing newline is added when a response does not already end in one;
    otherwise the last line of one response and the first line of the next
    would be fused into a single line in the file.
    """
    response = chat(
        model='llama3.2',
        messages=[
            {'role': 'user', 'content': f'You are generating {number} sentences about booking a {label}.'}],
        stream=True,
    )
    # Each streamed chunk carries one fragment of the reply text.
    pieces = (chunk['message']['content'] for chunk in response)

    if display:
        for piece in pieces:
            print(piece, end='', flush=True)
        return

    with open(output_file, 'a') as f:
        last_piece = ''
        for piece in pieces:
            f.write(piece)
            if piece:
                # Remember the last non-empty fragment to check how the response ended.
                last_piece = piece
        # Keep consecutive appended responses on separate lines.
        if last_piece and not last_piece.endswith('\n'):
            f.write('\n')


In [4]:
# Ten generation rounds, each prompting for 100 car-rental sentences and
# appending the response to car.txt.
for round_idx in range(10):
    generate_sentences(label='car rental', output_file='car.txt', number=100)

In [5]:
# Ten generation rounds, each prompting for 100 flight sentences and
# appending the response to flight.txt.
for round_idx in range(10):
    generate_sentences(label='flight', output_file='flight.txt', number=100)

In [6]:
# Ten generation rounds, each prompting for 100 hotel sentences and
# appending the response to hotel.txt.
for round_idx in range(10):
    generate_sentences(label='hotel', output_file='hotel.txt', number=100)

In [18]:
# Start from an empty frame holding the two columns the loading cells fill in.
column_names = ['sentence', 'label']
df = pd.DataFrame(columns=column_names)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  0 non-null      object
 1   label     0 non-null      object
dtypes: object(2)
memory usage: 132.0+ bytes


In [19]:
# Parse car.txt into CarRental rows and append them to df in one step.
car_sentences = []
with open('car.txt', 'r') as f:
    for line in f:
        # Lines are expected to look like "12. Some sentence."; split only on
        # the first '. ' so a sentence containing its own '. ' is kept whole.
        prefix, sep, text = line.partition('. ')
        text = text.strip()  # drop the trailing newline kept by file iteration
        if not sep or not text:
            # Blank lines and lines without the "N. " prefix carry no sentence.
            continue
        car_sentences.append(text)

# A single concat avoids re-copying df for every row; ignore_index gives each
# row a unique index instead of a repeated 0.
df = pd.concat(
    [df, pd.DataFrame({'sentence': car_sentences,
                       'label': ['CarRental'] * len(car_sentences)})],
    ignore_index=True,
)

In [20]:
# Show (rows, columns) of df after loading the car-rental sentences.
df.shape

(1910, 2)

In [21]:
# Parse flight.txt into Flight rows and append them to df in one step.
flight_sentences = []
with open('flight.txt', 'r') as f:
    for line in f:
        # Lines are expected to look like "12. Some sentence."; split only on
        # the first '. ' so a sentence containing its own '. ' is kept whole.
        prefix, sep, text = line.partition('. ')
        text = text.strip()  # drop the trailing newline kept by file iteration
        if not sep or not text:
            # Blank lines and lines without the "N. " prefix carry no sentence.
            continue
        flight_sentences.append(text)

# A single concat avoids re-copying df for every row; ignore_index gives each
# row a unique index instead of a repeated 0.
df = pd.concat(
    [df, pd.DataFrame({'sentence': flight_sentences,
                       'label': ['Flight'] * len(flight_sentences)})],
    ignore_index=True,
)

In [22]:
# Parse hotel.txt into Hotel rows and append them to df in one step.
hotel_sentences = []
with open('hotel.txt', 'r') as f:
    for line in f:
        # Lines are expected to look like "12. Some sentence."; split only on
        # the first '. ' so a sentence containing its own '. ' is kept whole.
        prefix, sep, text = line.partition('. ')
        text = text.strip()  # drop the trailing newline kept by file iteration
        if not sep or not text:
            # Blank lines and lines without the "N. " prefix carry no sentence.
            continue
        hotel_sentences.append(text)

# A single concat avoids re-copying df for every row; ignore_index gives each
# row a unique index instead of a repeated 0.
df = pd.concat(
    [df, pd.DataFrame({'sentence': hotel_sentences,
                       'label': ['Hotel'] * len(hotel_sentences)})],
    ignore_index=True,
)

In [23]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence encoder and embed every sentence in df,
# in row order.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = df['sentence'].tolist()
embeddings = model.encode(sentences)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [24]:
# Wrap the embeddings in a DataFrame (one row per encoded sentence).
df_embeddings = pd.DataFrame(embeddings)

In [29]:
# Show (rows, columns) of the labeled sentence frame.
df.shape

(5830, 2)

In [30]:
# Show (rows, columns) of the embedding frame; the row count should equal df's.
df_embeddings.shape

(5830, 384)

In [31]:
# Join each sentence/label row with its embedding vector column-wise.
# pd.concat defaults to axis=0, which would stack the embeddings *below* the
# sentences and leave NaN in half of every column. Both indexes are reset so
# the frames align by position (df's index is not guaranteed to be unique).
combined_df = pd.concat(
    [df.reset_index(drop=True), df_embeddings.reset_index(drop=True)],
    axis=1,
)
assert len(combined_df) == len(df) == len(df_embeddings), "sentences and embeddings are misaligned"

In [32]:
from sklearn.model_selection import train_test_split

# First hold out 20% of the rows, then split that hold-out evenly into
# validation and test sets. The fixed random_state keeps both splits
# reproducible.
train_df, temp_df = train_test_split(
    combined_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

In [33]:
# Write each split to its own CSV file in the working directory.
for frame, path in ((train_df, 'train.csv'), (test_df, 'test.csv'), (val_df, 'val.csv')):
    frame.to_csv(path)