In [67]:
import transformers
import datasets
import torch
import os #to do system stuff, like making folders
from transformers import Trainer, TrainingArguments
from accelerate import Accelerator

print(f"Transformers version: {transformers.__version__}")

print(f"Datasets version: {datasets.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"MPS Available: {torch.backends.mps.is_available()}")


Transformers version: 4.48.2
Datasets version: 3.2.0
Torch version: 2.6.0
MPS Available: True


In [68]:
# Make a folder, to hold .txt corpus
## only run this block one once! 
folder1 = "corpus"
# Create the folder in the current directory
os.makedirs(folder1, exist_ok=True)
print(f"Created folder: {folder1}")


Created folder: corpus


In [22]:
# if you want to use tinkyshakespeare text run this cell
import requests

# Download the Tiny Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
data = response.text

# Save to file
with open('corpus/tiny_shakespeare.txt', 'w') as f:
    f.write(data)


In [69]:
# if you want to use your own .txt files run this cell

# Define the path to your own text file
corpus = 'corpus/5Pages_roomsdescriptions.txt'  # Replace with the actual path to your text file

# Read the text file
with open(corpus, 'r') as file:
    data = file.read()

# Save to file (if needed)
output_txt = 'corpus/roomsD.txt'  # Replace with your desired output file path
with open(corpus, 'w') as f:
    f.write(data)

print("Text data read from file and saved successfully.")



Text data read from file and saved successfully.


In [71]:
# lets make sure our corpus was loaded and read successfully:
from datasets import Dataset

# Load data into a Hugging Face Dataset
raw_data = Dataset.from_dict({'text': [data]})

# Print the first 500 characters
print(raw_data['text'][0][:800])


Description for posting 3:
I'm looking for a roommate to share a 2b/2ba 1300 sqft condo near downtown Redmond. Pet rent is waived for cats!
Posted pictures are old. The interior furnishings look different now, but the walls/flooring are the same.
- Bedroom is 11'x12' and with sizable closet. Rent is $1600 a month (Lease would be up April 2026 with potential to renew)
- Rent includes water/sewer/garbage/wifi/renter's insurance
- 1 free parking spot for you
- 15 minute walk from Redmond Town Center, 7 minutes to a 545 stop, close to the 520.
- Very close to Microsoft/Google/Facebook
- Community hot tub and pool
- Recently renovated with wood floors, all new kitchen appliances, washer/dryer in unit, balcony
- I'm the only other person that will be living in the unit
- place is furnished, but 


In [72]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [73]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

tokenized_dataset = raw_data.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"]
)


Token indices sequence length is longer than the specified maximum sequence length for this model (67891 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 1/1 [00:00<00:00,  4.81 examples/s]


In [75]:
block_size = 128

def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated['input_ids'])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [concatenated[k][i:i + block_size] for i in range(0, total_length, block_size)]
        for k in concatenated.keys()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=1,
)


Map: 100%|██████████| 1/1 [00:00<00:00, 17.71 examples/s]


In [76]:
block_size = 128

def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated['input_ids'])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [concatenated[k][i:i + block_size] for i in range(0, total_length, block_size)]
        for k in concatenated.keys()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.
Map: 100%|██████████| 1/1 [00:00<00:00, 15.88 examples/s]


In [77]:
# Save the dataset to disk
lm_dataset.save_to_disk('lm_dataset')


Saving the dataset (1/1 shards): 100%|██████████| 530/530 [00:00<00:00, 27281.53 examples/s]


In [None]:
#Training the GPT model

In [78]:
from datasets import load_from_disk
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load the dataset
lm_dataset = load_from_disk('lm_dataset')

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [79]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend")
else:
    device = torch.device("cpu")
    print("Using CPU")

model.to(device)


Using MPS backend


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [88]:
training_args = TrainingArguments(
    output_dir='./model_weights',
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='steps',
    eval_steps=100,
    save_steps=500,
    logging_steps=50,
    learning_rate=5e-4,
    warmup_steps=100,
    save_total_limit=2,
    fp16=False,  # MPS backend currently doesn't support fp16 as of writing.
)




In [89]:
from transformers import DataCollatorForLanguageModeling

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    eval_dataset=lm_dataset,
    data_collator=data_collator,
)


In [None]:
##start training

In [90]:
### print("Starting training...")
trainer.train()
print("✅ Training complete. Saving model...")
trainer.save_model('./myLM_gpt2')
tokenizer.save_pretrained('./myLM')
print("✅ Model saved.")


Step,Training Loss,Validation Loss
100,0.3866,0.300419
200,0.62,0.431182
300,0.5137,0.391328
400,0.4447,0.331633
500,0.4419,0.248327
600,0.2189,0.196137
700,0.2323,0.159839
800,0.1917,0.104587
900,0.15,0.086019
1000,0.1107,0.063547


✅ Training complete. Saving model...
✅ Model saved.


In [91]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import sys

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('./myLM')
model = GPT2LMHeadModel.from_pretrained('./myLM_gpt2')

# Define the pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

# Update the model's configuration to recognize the pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Configure device
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend")
else:
    device = torch.device("cpu")
    print("Using CPU")

model.to(device)
model.eval()

def generate_text(prompt, max_length=100):
    # Encode the prompt and generate attention_mask
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        padding=False,  # No padding needed for single inputs
        truncation=False,
        max_length=500  # Adjust based on model's max input length
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Pass the attention_mask
            max_length=max_length,
            num_beams=1,
            no_repeat_ngram_size=0, #Minimum value is 0 (no restriction). Higher values prevent repetitions of n-grams of that size
            early_stopping=True,
            temperature=1.8, #between 0.1 and 2.0 = higher = more gibberish
            top_p=1.0, #Between 0.0 and 1.0. Lower values restrict the model \ higher values allow for more diversity
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id  # Ensure pad_token_id is set
        )
    return tokenizer.decode(output[0], skip_special_tokens=False)

def main():
    while True:
        prompt = input("Enter a prompt (or type 'exit' to quit): ")
        if prompt.lower() == 'exit':
            break
        print(f"Prompt: {prompt}")
        print(generate_text(prompt)[len(prompt):], sep="")
        print("\n" + "-"*50 + "\n")

if __name__ == "__main__":
    if "ipykernel_launcher" in sys.argv[0]:
        main()
    else:
        # Running as a standalone script
        parser = argparse.ArgumentParser(description="Generate text based on a prompt.")
        parser.add_argument("--prompt", nargs="?", default=None, help="The prompt to generate text from.")
        args = parser.parse_args()
        main()


Using MPS backend


Enter a prompt (or type 'exit' to quit):  this is the home


Prompt: this is the home




Oasis), Quiet spaces where everything is 420 friendly, We are really just looking for friendly roommates that are friendly and clean. We have a lot of longevity with our housemates and housemates, so a home of sorts is not available on the weekends by night.
BEDROOM OF LIFE: 6 bedrooms, three 4 bath, washer bathroom, kitchen and dining area.
NOTE: This is a no smoking house, no drugs home, no parties or

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  friendly and clean


Prompt: friendly and clean

- calm and reflective
- responsible and not show animals
- non-smoking 
- no recreational drugs 
- serious about clean relationships
- employed professionals say plenty of service work in the animal care industry
5) Abundant street parking
6) Double parked cars
7) Walking distance to Highland Sammamish Plateau, Phinney. There is free street parking and a covered deck for professional motorcycles, motorcycles, and schnbaskets

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  motorcycles, and schnbaskets


Prompt: motorcycles, and schnbaskets
. Lots of kitchen space is available on 3rd and staining the floor of the house. For all 4 bedrooms: 6 beds, 1 shower, 1 frame, foodbasket, dishwasher. Microwave, refrigerator and oven. Garage work from home so cleaning up after yourself before you use any dishes in the house.
One bedroom on the main floor (shares 1 wall of closet). The living room and bathrooms are fully equipped,

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):   The living room and bathrooms are fully equipped


Prompt:  The living room and bathrooms are fully equipped
 so please donUse of any washer/dryer in the house unless discussed with other housemates. Must be able to pass a background check.
Location: Small neighborhood with very well lit backyard (2 miles) close to Lake City College Total 4405,5453,66 MSFT, PLU, South Lake Union, Ravenna, Seattle & Evergreen Redmond hospitals, Spanaway park, miles of trails, water, sewer, garbage, floors

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  Spanaway park, miles of trails, water, sewer, garbage, floors


Prompt: Spanaway park, miles of trails, water, sewer, garbage, floors
, heat, insulation, triple-pane windows, siding, bathrooms, kitchen with quartz countertop and pretty much everything except the foundation and studs / structure)
*
Includes brand new high-end appliances (Washer/Dryer, dishwasher, double-oven gas stove with overhead convection microwave/oven, french-door refridgerator) + other energy efficient upgrades (unlimited

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  refridgerator


Prompt: refridgerator
) that will make you happy to find your new partner.
If you're interested in talking with me, please respond to this email with your last employment status and intended moving date as well as your contact info and possibly frame for your next roommate. If this sounds like it could be a good fit for you, please respond to this email with your email address.
Thanks!

Description for posting 224:
I am looking for a new roommate to share my 2 bedroom

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  share my 2 bedrooms


Prompt: share my 2 bedrooms
. Looking for someone who is clean and enjoys a peaceful atmosphere. Rent is 1100 per month and I'm flexible when moving out, I have a way to go higher when I need it, going on schedule time.
Other Amenities:
• Immediate access to i405 and its own private light rail/several transit stations
• 5 min drive/several bus lines
• Washer/dryer on site 
• Small street parking as well
・

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  Rent is 1100 per month


Prompt: Rent is 1100 per month
 (month-to-month lease)  However, utilities can be paid at installment if needed. Rental references: A required background check (LOCATION: Seattle, City of Seattle, Shoreline, Issaquah) and a $500 rent deposit (equal to 1 month rent) 
Note: the above information, no smoking, drug, no pet, no  couples. 
Please contact agent Lisa at    
show contact info


--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  Please contact agent Lisa


Prompt: Please contact agent Lisa
 at 
show contact info

Description for posting 128:
ASE STOP FLAGGING THIS POST! IT IS LEGAL TO REVE A LIVING ON PHONE WITH A HARD W FOR ONE TENANT-  PERSON WITH A HARDWOOD FLAGGED BATHROOM.
-VE IN REQUIREMENT:  (1) must pay deposit, 1st month's rent, (Last month's rent)
1.3

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  (1) must pay deposit, 1st month's rent, (Last month's rent)


Prompt: (1) must pay deposit, 1st month's rent, (Last month's rent)
 refundable. Move in may be available within 2 months of move-in, depending on tenant credit and employment.
Walk-score=90: This is one of the most walkable neighborhoods in Seattle. The walk score of most of the other locations in CL is only 30-60. This is a dream location for working people. Everything is nearby so most errands are accomplished while completing the search

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  most errands are accomplished


Prompt: most errands are accomplished
). 
One room will be available on the first of January 15th:
Private bathroom for extra privacy. Share the kitchen and laundry with one other person. 
No additional pets. 
Parking on the property. 
Off street parking with mature trees
Fenced back garden
New appliances
Owner occupied only
Share utilities WIFI
Off street parking for one car per station
Credit background check
5 minute walk to Safeway on the

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  Parking on the property.  Off street parking with mature trees Fenced back garden New appliances Owner occupied only Share utilities WIFI Off street parking for one car per station Credit background check 5 minute walk to Safeway on the


Prompt: Parking on the property.  Off street parking with mature trees Fenced back garden New appliances Owner occupied only Share utilities WIFI Off street parking for one car per station Credit background check 5 minute walk to Safeway on the
 property
Owner occupied only by his partner
Monthly rental includes all utilities, 80 SQFT
Private bath, 75 SQFT
Lawnwalk to Children's Hospital,ARD,MCA,N.V. , and a covered deck forooterOCOL.

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  covered deck forooterOCOL.


Prompt: covered deck forooterOCOL.
  We have double kayaks, a fire pit, a large off-street parking space with an off-street parking option. Conveniently located near the intersection to North Tacoma PCC, Tacoma General Hospital, Kent Tukwila, and the Ashway Park Road to downtown Seattle.
The town is filled with beautiful natural light and a muted south-facing porches. Large, fenced-equiped kitchen, dining room are all brand new,

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  The town is filled with beautiful natural light and a muted south-facing porches. Large, fenced-equiped kitchen, dining room are all brand new,


Prompt: The town is filled with beautiful natural light and a muted south-facing porches. Large, fenced-equiped kitchen, dining room are all brand new,
 available now. Laundry in unit, kitchen and bath are all up-and-street; dishes are available for use.
The vibe is somewhere between quirky and punk, with lots of writing and art on the walls. There's even a street art listing on the wall that features a little more furniture than a regular refrigerator,

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  The vibe is somewhere between quirky and punk


Prompt: The vibe is somewhere between quirky and punk
, with lots of writing and art on the walls.
There are bookshelves, pans, trash bin, pots, pans and a communal deck with some communal storage.
There is an also a private deck with some yardwork on the patio that could make some interesting reading for an inexpensive fee. There is an also a gal that is a super-friendly pit bull named Salem.
What we are looking for: We all enjoy paddling, biking

--------------------------------------------------



Enter a prompt (or type 'exit' to quit):  exit


In [94]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import sys
import os

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('./myLM')
model = GPT2LMHeadModel.from_pretrained('./myLM_gpt2')

# Define the pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

# Update the model's configuration to recognize the pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Configure device
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend")
else:
    device = torch.device("cpu")
    print("Using CPU")

model.to(device)
model.eval()

def generate_text(prompt, max_length=100, num_responses=10):
    responses = []
    encoded = tokenizer(
        prompt,
        return_tensors='pt',
        padding=False,
        truncation=False,
        max_length=500
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    for _ in range(num_responses):
        with torch.no_grad():
            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=1,
                no_repeat_ngram_size=0,
                early_stopping=True,
                temperature=1.8,
                top_p=1.0,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id
            )
        responses.append(tokenizer.decode(output[0], skip_special_tokens=True))
    
    return responses

def main():
    prompt = input("Enter a prompt: ")
    num_responses = 10  # Number of responses to generate
    responses = generate_text(prompt, num_responses=num_responses)
    
    # Specify the output file name
    output_file = 'responses/responses1.txt'
    
    # Append each response to .txt file
    with open(output_file, 'a') as file:
        for i, response in enumerate(responses):
            file.write(f"Response {i+1}:\n{response}\n\n")
            print(f"Response {i+1} appended to {output_file}")

if __name__ == "__main__":
    main()


Using MPS backend


Enter a prompt:  there is a siwming ppool with ample street parking by the shore


Response 1 appended to responses/responses1.txt
Response 2 appended to responses/responses1.txt
Response 3 appended to responses/responses1.txt
Response 4 appended to responses/responses1.txt
Response 5 appended to responses/responses1.txt
Response 6 appended to responses/responses1.txt
Response 7 appended to responses/responses1.txt
Response 8 appended to responses/responses1.txt
Response 9 appended to responses/responses1.txt
Response 10 appended to responses/responses1.txt
