In [1]:
pip install transformers datasets accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import pandas as pd

In [3]:
# Location of the training CSV. The MOVIES_TRAINING_CSV environment variable
# overrides the path so the notebook can run on other machines; when it is not
# set, the original local path is used unchanged.
file_path = os.environ.get(
    "MOVIES_TRAINING_CSV",
    '/Users/carolynyatco/decode_ai/get_weights/movies_training.csv',
)

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())

                                            Overview  Sentiment_Score  \
0  Two imprisoned men bond over a number of years...        -0.300000   
1  An organized crime dynasty's aging patriarch t...         0.000000   
2  When the menace known as the Joker wreaks havo...         0.333333   
3  The early life and career of Vito Corleone in ...         0.118182   
4  A jury holdout attempts to prevent a miscarria...         0.000000   

   Valence_Score  Arousal_Score  Dominance_Score     Tempo  
0       0.564455       0.362636         0.485636  0.018750  
1       0.534400       0.532500         0.667500  0.000000  
2       0.497643       0.602500         0.586929  0.066667  
3       0.684500       0.545200         0.602200  0.014336  
4       0.477333       0.558167         0.637167  0.000000  


In [4]:

from sklearn.preprocessing import MinMaxScaler

# Score columns that are rescaled here and weighted in the training prompts
features = ["Sentiment_Score", "Valence_Score", "Arousal_Score", "Dominance_Score", "Tempo"]

# Fit a MinMaxScaler on the score columns and overwrite them in df with the
# rescaled values
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Per-feature weights applied to the rescaled scores when building prompts
user_weights = dict(
    Sentiment_Score=0.25,
    Valence_Score=0.2,
    Arousal_Score=0.2,
    Dominance_Score=0.15,
    Tempo=0.2,
)

# Format training prompts
def format_training_prompt(row, user_weights):
    """Build the three-line training prompt for a single record.

    Args:
        row: Object indexable by key (e.g. a DataFrame row) that provides a
            numeric value for every key in ``user_weights`` plus an
            ``'Overview'`` entry.
        user_weights: Mapping of feature name to the numeric weight that the
            corresponding ``row`` value is multiplied by.

    Returns:
        A string with a "User Preferences" line listing each weighted feature
        to two decimals, a fixed instruction line, and a final line carrying
        ``row['Overview']`` as the training target.
    """
    # One "name: weighted value" entry per feature, in user_weights order
    entries = []
    for name, weight in user_weights.items():
        weighted_value = row[name] * weight
        entries.append(f"{name}: {weighted_value:.2f}")
    preference_text = ", ".join(entries)

    # Assemble the prompt line by line; Overview is the target during training
    prompt_lines = [
        f"User Preferences: {preference_text}",
        "Generate a creative commercial storyboard idea based on these preferences.",
        f"Storyboard (Training Target): {row['Overview']}",
    ]
    return "\n".join(prompt_lines)

# Build one training prompt per row; DataFrame.apply forwards the extra
# keyword argument to format_training_prompt for every row
df["prompt"] = df.apply(format_training_prompt, axis=1, user_weights=user_weights)


In [5]:
from torch.utils.data import Dataset

class MovieDataset(Dataset):
    """Dataset that tokenizes the ``prompt`` column of a DataFrame on demand.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    ``labels`` is a copy of ``input_ids`` in which every position whose
    attention mask is 0 (padding) is replaced by ``IGNORE_INDEX``, so padded
    positions do not contribute to a loss that skips that index.

    Args:
        df: DataFrame with a ``"prompt"`` column of strings.
        tokenizer: Callable accepting ``(text, padding=, truncation=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors. Assumed to add a
            leading batch axis of size 1 when ``return_tensors="pt"`` is
            given -- confirm for the tokenizer in use.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    # -100 is the default ``ignore_index`` of torch.nn.CrossEntropyLoss, so
    # losses using that default skip positions labelled with it.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_length=256):
        self.prompts = df["prompt"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Tokenize prompt ``idx`` and return its tensors with masked labels."""
        encoded = self.tokenizer(
            self.prompts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the sequence axis whenever max_length == 1.
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Clone so that masking the labels cannot modify input_ids, then
        # exclude padded positions from the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


: 