In [1]:
import pandas as pd
import sqlite3


In [2]:
def preprocess_data(df):
    # Remove duplicate messages
    df = df.drop_duplicates()

    # Normalize text: lowercase
    df["text"] = df["text"].str.lower()

    # Remove special characters (optional)
    df["text"] = df["text"].str.replace(r"[^\w\s]", "", regex=True)

    df

    # Structure data for chatbot (input-response pairs)
    conversations = []
    for i in range(len(df) - 1):
        if df.iloc[i]["sender"] == "You" and df.iloc[i + 1]["sender"] == "Me":
            conversations.append({"input": df.iloc[i]["text"], "response": df.iloc[i + 1]["text"]})

    # Convert to DataFrame
    chatbot_data = pd.DataFrame(conversations)

    # Save processed data for chatbot training
    chatbot_data.to_csv("../data/chatbot_data.csv", index=False)

    print("Preprocessing complete! Saved as chatbot_data.csv")

In [3]:
with sqlite3.connect("../data/texts.db") as conn:
    query = '''
    SELECT m.text, m.date, h.id as sender
    FROM message m
    JOIN handle h ON m.handle_id = h.ROWID
    '''
    # df_texts = pd.read_sql_query(query, conn)
    # df_texts.to_csv('texts.csv')  # Export to CSV

    query_myself = '''
    SELECT 
    CASE 
        WHEN "is_from_me" = 0 THEN 'You'
        WHEN "is_from_me" = 1 THEN 'Me'
    END as sender,
    "text"
    FROM "main"."message"
    WHERE length(text) > 1
    AND ROWID > 100000
    AND handle_id IS NOT 0
    AND handle_id IN (
        SELECT handle_id
        FROM "main"."message"
        GROUP BY handle_id
        HAVING COUNT(*) >= 100
    )
    ORDER BY "handle_id", "ROWID" ASC
    LIMIT 0, 499999;
    '''

    df_my_texts = pd.read_sql_query(query_myself, conn)
    df_my_texts
    preprocess_data(df_my_texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["text"].str.replace(r"[^\w\s]", "", regex=True)


Preprocessing complete! Saved as chatbot_data.csv


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

df = pd.read_csv("../data/chatbot_data.csv")

def format_dialog(df):
    conversations = []
    for _, row in df.iterrows():
        conversations.append(f"User: {row['input']}\nBot: {row['response']}")
    return conversations

formatted_texts = format_dialog(df)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from sklearn.model_selection import train_test_split

# Split into train and eval datasets
train_texts, eval_texts = train_test_split(formatted_texts, test_size=0.1)  # 10% for evaluation

In [9]:
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Tokenize the training and evaluation datasets
train_inputs = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
eval_inputs = tokenizer(eval_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Create datasets
train_dataset = Dataset.from_dict(train_inputs)
eval_dataset = Dataset.from_dict(eval_inputs)

class ChatDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return len(self.encodings.input_ids)
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = ChatDataset(train_dataset)

training_args = TrainingArguments(
    output_dir="./models/charbot_model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="epoch"
)




In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
model.save_pretrained("models/charbot_model_small")
tokenizer.save_pretrained("models/charbot_model_small")

AttributeError: 'Dataset' object has no attribute 'input_ids'