<a href="https://colab.research.google.com/github/charannn/Machine-Learning-Algorithms/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
data_path = '/content/drive/My Drive/'

In [3]:
import pandas as pd
file1 = data_path + 'ifood_df.csv'
df = pd.read_csv(file1)
df.head()

Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,marital_Together,marital_Widow,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,MntTotal,MntRegularProds,AcceptedCmpOverall
0,58138.0,0,0,58,635,88,546,172,88,88,...,0,0,0,0,1,0,0,1529,1441,0
1,46344.0,1,1,38,11,1,6,2,1,6,...,0,0,0,0,1,0,0,21,15,0
2,71613.0,0,0,26,426,49,127,111,21,42,...,1,0,0,0,1,0,0,734,692,0
3,26646.0,1,0,26,11,4,20,10,3,5,...,1,0,0,0,1,0,0,48,43,0
4,58293.0,1,0,94,173,43,118,46,27,15,...,0,0,0,0,0,0,1,407,392,0


In [5]:
import pandas as pd

def create_textual_description(row):
    description = f"This customer has an income of {row['Income']:.0f}."
    if row['Kidhome'] > 0:
        description += f" They have {int(row['Kidhome'])} children at home."
    if row['Teenhome'] > 0:
        description += f" They have {int(row['Teenhome'])} teenagers at home."

    # Check if the columns exist before accessing them
    if 'marital_status' in row.index:
        marital_status = ''
        for col in row.index:
            if col.startswith('marital_') and row[col] == 1:
                marital_status = col[8:]  # Extract status after "marital_"
                break
        description += f" They are {marital_status}."
    else:
        description += " Marital status information is missing." # Handle the case where the column is missing


    if 'education' in row.index:
        education_level = ''
        for col in row.index:
            if col.startswith('education_') and row[col] == 1:
                education_level = col[10:]  # Extract level after "education_"
                break
        description += f" They have a {education_level} education."
    else:
        description += " Education information is missing." # Handle the case where the column is missing

    description += f" They have made web purchases {int(row['NumWebPurchases'])} times and store purchases {int(row['NumStorePurchases'])} times."

    description += f" Their recency is {int(row['Recency'])} days."

    return description


# Handle NaNs first (important!)
df = df.fillna(df.mean())  # Fill NaNs with column means

# One-hot encode marital status and education only if they exist
# Check if the columns exist before applying pd.get_dummies
if 'marital_status' in df.columns and 'education' in df.columns:
    df = pd.get_dummies(df, columns=['marital_status', 'education'])

df['text_description'] = df.apply(create_textual_description, axis=1)


print(df['text_description'].head().to_list()) #Check to see results

['This customer has an income of 58138. Marital status information is missing. Education information is missing. They have made web purchases 8 times and store purchases 4 times. Their recency is 58 days.', 'This customer has an income of 46344. They have 1 children at home. They have 1 teenagers at home. Marital status information is missing. Education information is missing. They have made web purchases 1 times and store purchases 2 times. Their recency is 38 days.', 'This customer has an income of 71613. Marital status information is missing. Education information is missing. They have made web purchases 8 times and store purchases 10 times. Their recency is 26 days.', 'This customer has an income of 26646. They have 1 children at home. Marital status information is missing. Education information is missing. They have made web purchases 2 times and store purchases 4 times. Their recency is 26 days.', 'This customer has an income of 58293. They have 1 children at home. Marital status

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load Data and Create Textual Descriptions (as above)
#    ... (Include code from the previous step)

# Prepare data for language model training
texts = df['text_description'].tolist()
train_texts, val_texts = train_test_split(texts, test_size=0.2, random_state=42)

# 2. Load Pre-trained Model and Tokenizer
model_name = "gpt2"  # Or "EleutherAI/gpt-neo-125M" for smaller model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:  # GPT2 doesn't have a pad token by default
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# 3. Tokenize the Data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# 4. Create PyTorch Datasets
import torch

class MarketingDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = MarketingDataset(train_encodings)
val_dataset = MarketingDataset(val_encodings)

# 5. Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=3,       # Number of training epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=100,
)

# 6. Create Trainer
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
    tokenizer=tokenizer,
    data_collator=lambda data: {
        'input_ids': torch.stack([f['input_ids'] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'] for f in data]),
        'labels': torch.stack([f['input_ids'] for f in data])  # Labels are same as input for LM
    }
)

# 7. Train the Model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwarehousefestive[0m ([33mwarehousefestive-rundata-ltd[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
50,3.0383,2.600626
100,0.5531,0.523082
150,0.4624,0.458367


Step,Training Loss,Validation Loss
50,3.0383,2.600626
100,0.5531,0.523082
150,0.4624,0.458367
200,0.4499,0.424818
250,0.4301,0.412378
300,0.4391,0.411738
350,0.4279,0.410635
400,0.4184,0.408938
450,0.4233,0.408699
500,0.4095,0.41235


TrainOutput(global_step=663, training_loss=0.7818904145990202, metrics={'train_runtime': 7225.9395, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.092, 'total_flos': 151239094272000.0, 'train_loss': 0.7818904145990202, 'epoch': 3.0})

In [12]:
# Example Inference
def generate_persona(customer_data):  # customer_data: a dictionary of customer characteristics
    # customer_data is already a dictionary; no need to convert again.
    description = create_textual_description(pd.Series(customer_data)) # Convert to pandas Series
    input_ids = tokenizer.encode(description, return_tensors='pt')  # Encode the input

    # Generate text
    output = model.generate(input_ids,
                              max_length=200,  # Max length of generated text
                              num_beams=5,       # Beam search for better generation
                              no_repeat_ngram_size=2, # avoid repeating phrases
                              early_stopping=True)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)  # Decode output
    return generated_text

# Example usage with one row from your dataframe
sample_customer = df.iloc[1900].to_dict()  #Convert to a dict, also ensures datatypes are correct.
persona = generate_persona(sample_customer)
print(persona)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This customer has an income of 55212. They have 1 teenagers at home. Marital status information is missing. Education information is missing. They have made web purchases 4 times and store purchases 11 times. Their recency is 65 days.
