<a href="https://colab.research.google.com/github/dalalelamine/AISD/blob/main/phase1_TwoTowerV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only helper module; this cell will not import outside a Colab runtime.
from google.colab import files
# Prompt for file(s) to upload. The recorded output of this cell shows
# ratings.csv being saved under the same name.
# NOTE(review): the loading cell also reads movies.csv, which this recorded
# upload did not include -- presumably it was uploaded in an earlier run;
# confirm both files are present before running on a fresh runtime.
uploaded = files.upload()

Saving ratings.csv to ratings.csv


In [None]:
import pandas as pd

# Minimum number of ratings a movie must exceed to be kept
MIN_RATINGS_PER_MOVIE = 50

# Load datasets
movies = pd.read_csv("movies.csv")      # Contains movieId, title, genres
ratings = pd.read_csv("ratings.csv")    # Contains userId, movieId, rating, timestamp

# Count the number of ratings per movie
movie_counts = ratings['movieId'].value_counts()

# Keep movieIds with strictly more than MIN_RATINGS_PER_MOVIE ratings
popular_movie_ids = movie_counts[movie_counts > MIN_RATINGS_PER_MOVIE].index

# Restrict both tables to the popular movies.
# .copy() makes each result an independent frame rather than a slice of the
# original, so later cells can add columns (e.g. title_genre) without pandas
# emitting SettingWithCopyWarning.
filtered_movies = movies[movies['movieId'].isin(popular_movie_ids)].copy()
filtered_ratings = ratings[ratings['movieId'].isin(popular_movie_ids)].copy()

# Display some filtered rows
print(filtered_movies.head())
print(filtered_ratings.head())


   movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   
5        6              Heat (1995)   
6        7           Sabrina (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
5                        Action|Crime|Thriller  
6                               Comedy|Romance  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [None]:
# filtered_ratings was already restricted to popular movies in the previous
# step, so its distinct userId values are the users who rated those movies.
filtered_user_ids = filtered_ratings.loc[:, 'userId'].unique()

# With a separate user table, the same ids could be used to filter it:
# users = pd.read_csv("users.csv")
# filtered_users = users[users['userId'].isin(filtered_user_ids)]

# No user table is loaded here, so just report how many distinct users remain
print(f"Number of users who rated popular movies: {len(filtered_user_ids)}")

# Peek at the first rows of the filtered ratings
print(filtered_ratings.head(5))

Number of users who rated popular movies: 606
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd

# 1. Fixed User Tower with proper dimensions
def create_user_tower(filtered_ratings, filtered_movies, max_tokens=10000,
                      sequence_length=20, embedding_dim=64):
    """
    Build the user tower and the per-user text it consumes.

    Each user is represented by one string: the titles of all movies they
    rated 3 or higher, joined with spaces. A TextVectorization layer is
    adapted on those strings and feeds an embedding / average-pool / dense
    stack.

    Args:
        filtered_ratings: DataFrame with at least userId, movieId, rating.
        filtered_movies: DataFrame with at least movieId, title.
        max_tokens: Vocabulary cap for the vectorizer; also used as the
            embedding table size so the two can never drift apart.
        sequence_length: Number of token ids the vectorizer emits per string
            (passed as output_sequence_length).
        embedding_dim: Width of the token embeddings and of the final Dense
            layer, i.e. the size of the user vector.

    Returns:
        (user_model, user_movie_map) where user_movie_map maps str(userId) to
        that user's space-joined liked titles. Users with no rating >= 3 are
        absent from the map.
    """
    # Merge ratings with movies to get titles
    ratings_with_movies = filtered_ratings.merge(filtered_movies, on="movieId")

    # Keep only liked movies (rating of 3 or more)
    liked_movies = ratings_with_movies[ratings_with_movies["rating"] >= 3]

    # Map each user id (as a string) to the titles they liked, space-joined
    user_movie_map = {
        str(user_id): ' '.join(titles)
        for user_id, titles in liked_movies.groupby('userId')['title']
    }

    # Text vectorization layer with a fixed output length.
    # NOTE(review): with the default sequence_length only the first 20 tokens
    # of a user's concatenated titles reach the model -- confirm that is intended.
    text_vectorizer = layers.TextVectorization(
        max_tokens=max_tokens, output_sequence_length=sequence_length
    )

    # Adapt the vectorizer to the user texts; fall back to a placeholder so
    # adapt() still has something to consume when no user liked anything.
    user_texts = np.array(list(user_movie_map.values()))
    if len(user_texts) == 0:
        user_texts = np.array(["placeholder text"])
    text_vectorizer.adapt(user_texts)

    # Create the User Tower
    user_model = tf.keras.Sequential([
        text_vectorizer,  # Output shape: (batch_size, sequence_length)
        # input_dim is tied to max_tokens so every token id has a row
        layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim),  # (batch_size, sequence_length, embedding_dim)
        # Average over the sequence axis instead of flattening
        layers.GlobalAveragePooling1D(),  # Output shape: (batch_size, embedding_dim)
        layers.Dense(embedding_dim, activation='relu')  # Output shape: (batch_size, embedding_dim)
    ])

    print("User Tower:")
    user_model.summary()

    return user_model, user_movie_map

# 2. Fixed Movie Tower with correct dimensions
def create_movie_tower(filtered_movies, max_tokens=10000, sequence_length=20,
                       embedding_dim=64):
    """
    Build the movie tower from each movie's title and genres text.

    Side effect: adds (or overwrites) a 'title_genre' column on the passed
    frame in place, holding title + " " + genres. Other code in this notebook
    reads that column from the same frame, so the mutation is kept.

    Args:
        filtered_movies: DataFrame with at least title and genres columns.
        max_tokens: Vocabulary cap for the vectorizer; also used as the
            embedding table size so the two stay consistent.
        sequence_length: Number of token ids the vectorizer emits per string
            (passed as output_sequence_length).
        embedding_dim: Width of the token embeddings and of the final Dense
            layer, i.e. the size of the movie vector.

    Returns:
        The Keras Sequential movie model.
    """
    # Combine movie titles and genres (stored on the caller's frame, see docstring)
    filtered_movies['title_genre'] = filtered_movies['title'] + " " + filtered_movies['genres']

    # Text vectorization layer with a fixed output length
    text_vectorizer = layers.TextVectorization(
        max_tokens=max_tokens, output_sequence_length=sequence_length
    )

    # Adapt the text vectorizer to the movie data
    movie_texts = np.array(filtered_movies['title_genre'].values)
    text_vectorizer.adapt(movie_texts)

    # Create the Movie Tower
    movie_model = tf.keras.Sequential([
        text_vectorizer,  # Output shape: (batch_size, sequence_length)
        # input_dim is tied to max_tokens so every token id has a row
        layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim),  # (batch_size, sequence_length, embedding_dim)
        # Average over the sequence axis instead of flattening
        layers.GlobalAveragePooling1D(),  # Output shape: (batch_size, embedding_dim)
        layers.Dense(embedding_dim, activation='relu')  # Output shape: (batch_size, embedding_dim)
    ])

    print("Movie Tower:")
    movie_model.summary()

    return movie_model

In [None]:
# 3. Improved Two-Tower Model with proper handling of dimensions
class ImprovedMovieLensTwoTowerModel(tf.keras.Model):
    """
    Two-tower scorer: embeds a user text and a movie text with separate
    sub-models and turns their dot product into a value in (0, 1).

    similarity_metric == 'cosine' L2-normalizes both embeddings before the
    dot product; any other value leaves them unnormalized (plain dot product).

    NOTE(review): with 'cosine' the similarity lies in [-1, 1], so the sigmoid
    output is confined to roughly [0.27, 0.73] -- confirm this is acceptable
    for the binary cross-entropy objective used in training.
    """

    def __init__(self, user_model, movie_model, similarity_metric='dot'):
        super().__init__()
        self.user_model = user_model
        self.movie_model = movie_model
        self.similarity_metric = similarity_metric

    def call(self, inputs):
        """Return sigmoid(user . movie) with shape (batch_size, 1)."""
        # Embeddings arrive already normalized when the metric is 'cosine'
        user_vec, movie_vec = self.get_embeddings(inputs)

        # Row-wise dot product between the two towers' outputs
        scores = tf.reduce_sum(user_vec * movie_vec, axis=1)

        # Squash to (0, 1) and add a trailing axis
        return tf.reshape(tf.nn.sigmoid(scores), [-1, 1])

    # Exposed separately so recommendation code can score from raw embeddings
    def get_embeddings(self, inputs):
        """Return (user_embedding, movie_embedding) for an (user, movie) input pair."""
        user_input, movie_input = inputs

        # tf.cast to tf.string is applied to both inputs before the towers
        user_embedding = self.user_model(tf.cast(user_input, tf.string))
        movie_embedding = self.movie_model(tf.cast(movie_input, tf.string))

        if self.similarity_metric != 'cosine':
            return user_embedding, movie_embedding

        # Unit-length rows so the later dot product is a cosine similarity
        return (
            tf.nn.l2_normalize(user_embedding, axis=1),
            tf.nn.l2_normalize(movie_embedding, axis=1),
        )

# 4. Training function
def train_model(user_model, movie_model, train_dataset, val_dataset,
                similarity_metric='dot', epochs=5, learning_rate=0.001):
    """
    Create and train the two-tower model.

    Args:
        user_model: Sub-model that embeds the user text.
        movie_model: Sub-model that embeds the movie text.
        train_dataset: Dataset passed to fit() as the training data.
        val_dataset: Dataset passed to fit() as validation_data.
        similarity_metric: Forwarded to ImprovedMovieLensTwoTowerModel
            ('cosine' normalizes embeddings, anything else is a plain dot product).
        epochs: Number of training epochs (default keeps the previous value of 5).
        learning_rate: Adam learning rate (default keeps the previous 0.001).

    Returns:
        (model, history): the trained model and the object returned by fit().
    """
    # Create the model
    model = ImprovedMovieLensTwoTowerModel(user_model, movie_model, similarity_metric)

    # Binary like / not-like objective on the model's sigmoid output
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )

    # Train the model
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        verbose=1
    )

    return model, history

# 5. Recommendation function
def get_movie_recommendations(model, user_id, filtered_movies, user_movie_map, filtered_ratings, top_n=10):
    """
    Get personalized movie recommendations using similarity-based scoring.

    Args:
        model: Object exposing get_embeddings((user_texts, movie_texts)),
            returning a pair of 2-D embedding arrays/tensors of equal shape.
        user_id: Id of the user; looked up as str(user_id) in user_movie_map
            and compared as-is against filtered_ratings['userId'].
        filtered_movies: DataFrame with movieId, title and genres. A
            'title_genre' column is used when present; otherwise the same
            text (title + " " + genres) is built locally without touching
            the caller's frame.
        user_movie_map: Mapping str(userId) -> text describing the user.
        filtered_ratings: DataFrame with userId, movieId, rating.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame that always carries a 'score' column. For a known user it
        has movieId, title, genres, score, sorted by score descending and
        excluding movies the user rated 3 or higher. For an unknown user it
        is the first top_n rows of filtered_movies (sorted by 'popularity'
        when that column exists) with score set to NaN, so callers can
        format both cases the same way.
    """
    user_key = str(user_id)

    # Unknown user: fall back to non-personalized rows
    if user_key not in user_movie_map:
        print(f"No movie preferences found for user {user_id}. Using default recommendations.")
        if 'popularity' in filtered_movies.columns:
            fallback = filtered_movies.sort_values('popularity', ascending=False).head(top_n)
        else:
            # If popularity column doesn't exist, just return first N movies
            fallback = filtered_movies.head(top_n)
        # Keep the return schema consistent with the personalized path
        return fallback.assign(score=np.nan)

    # Single-element array holding the user's text
    user_input = np.array([user_movie_map[user_key]])

    # Movie text for every candidate; do not rely on another function having
    # added the column as a side effect
    if 'title_genre' in filtered_movies.columns:
        movie_inputs = filtered_movies['title_genre'].values
    else:
        movie_inputs = (filtered_movies['title'] + " " + filtered_movies['genres']).values

    # Process in batches to avoid memory issues
    batch_size = 500
    all_scores = []

    for start_idx in range(0, len(movie_inputs), batch_size):
        batch_movie_inputs = movie_inputs[start_idx:start_idx + batch_size]

        # The user text is repeated so both inputs have the same batch length
        user_embedding, movie_embeddings = model.get_embeddings((
            np.repeat(user_input, len(batch_movie_inputs), axis=0),
            np.array(batch_movie_inputs)
        ))

        # Row-wise dot product. The same formula serves both metrics because
        # get_embeddings already normalizes when the metric is 'cosine'.
        batch_scores = np.sum(
            np.asarray(user_embedding) * np.asarray(movie_embeddings), axis=1
        )
        all_scores.extend(batch_scores)

    # Create a dataframe with the predictions
    recommendations = pd.DataFrame({
        'movieId': filtered_movies['movieId'].values,
        'title': filtered_movies['title'].values,
        'genres': filtered_movies['genres'].values,
        'score': all_scores
    })

    # Movies this user already liked (rating of 3 or more)
    user_ratings = filtered_ratings[filtered_ratings['userId'] == user_id]
    user_liked_movies = set(user_ratings.loc[user_ratings['rating'] >= 3, 'movieId'])

    # Filter out movies the user has already liked
    new_recommendations = recommendations[~recommendations['movieId'].isin(user_liked_movies)]

    # Sort by score in descending order and get top_n recommendations
    return new_recommendations.sort_values('score', ascending=False).head(top_n)

# 6. Data preparation function
def prepare_training_data(filtered_ratings, filtered_movies, user_movie_map):
    """
    Build the (user text, movie text, label) training table.

    One row is produced per rating whose user has an entry in user_movie_map.
    The caller's frames are not modified.

    Args:
        filtered_ratings: DataFrame with userId, movieId, rating.
        filtered_movies: DataFrame with movieId, title, genres. An existing
            'title_genre' column is used as-is; otherwise title + " " + genres
            is computed on a local copy.
        user_movie_map: Mapping str(userId) -> text describing the user.

    Returns:
        DataFrame with columns user_input, movie_input, label (1 when the
        rating is 3 or more, else 0) and a fresh 0..n-1 index. The columns
        are present even when no rows qualify.

    NOTE(review): if user_movie_map holds every title a user liked, the
    target movie's own title is part of user_input for positive samples --
    check whether that label leakage is intended.
    """
    # Combine title and genres for the movie tower if not already done,
    # without writing to the caller's frame
    if 'title_genre' in filtered_movies.columns:
        movies_with_text = filtered_movies
    else:
        movies_with_text = filtered_movies.assign(
            title_genre=filtered_movies['title'] + " " + filtered_movies['genres']
        )

    # Merge ratings with movies to get title_genre
    ratings_with_movies = filtered_ratings.merge(movies_with_text, on="movieId")

    # Map keys are string user ids, so compare on the string form
    user_keys = ratings_with_movies['userId'].astype(str)

    # Only include users that have a movie map (liked at least one movie)
    has_profile = user_keys.isin(set(user_movie_map))
    known = ratings_with_movies[has_profile]

    return pd.DataFrame({
        'user_input': user_keys[has_profile].map(user_movie_map),  # Text describing the user
        'movie_input': known['title_genre'],                       # Current movie being evaluated
        'label': (known['rating'] >= 3).astype(int),               # Whether they liked this movie
    }).reset_index(drop=True)

In [None]:
# 7. Main function with additional error handling and check functions
def main():
    """
    Run the whole pipeline: build both towers, assemble the training table,
    train with the cosine metric, then print top-5 recommendations for two
    sample users.

    Reads filtered_ratings and filtered_movies from the enclosing (notebook)
    namespace. Any exception is reported with its traceback instead of being
    propagated.
    """

    def _as_dataset(frame):
        # ((user_text, movie_text), label) slices in the frame's row order
        features = (frame['user_input'].values, frame['movie_input'].values)
        return tf.data.Dataset.from_tensor_slices((features, frame['label'].values))

    try:
        # Data is expected to be loaded by earlier cells; to load here instead:
        # filtered_ratings = pd.read_csv('ratings.csv')
        # filtered_movies = pd.read_csv('movies.csv')

        # Print data info
        print(f"Ratings data shape: {filtered_ratings.shape}")
        print(f"Movies data shape: {filtered_movies.shape}")

        # Both tables must contain rows
        if not len(filtered_ratings) or not len(filtered_movies):
            raise ValueError("Empty dataset detected! Please provide valid data.")

        # Build the towers (the user tower also yields the per-user text map)
        user_model, user_movie_map = create_user_tower(filtered_ratings, filtered_movies)
        movie_model = create_movie_tower(filtered_movies)

        # At least one user must have liked something
        if not user_movie_map:
            raise ValueError("No users with liked movies found! Please check your ratings data.")

        # Assemble the training table
        training_df = prepare_training_data(filtered_ratings, filtered_movies, user_movie_map)

        print(f"Created training data with {len(training_df)} samples")

        if not len(training_df):
            raise ValueError("No training samples generated! Please check your data.")

        # 80/20 split with a fixed seed
        from sklearn.model_selection import train_test_split
        train_df, val_df = train_test_split(training_df, test_size=0.2, random_state=42)

        print(f"Training set: {len(train_df)} samples")
        print(f"Validation set: {len(val_df)} samples")

        # Show the first training sample for debugging
        print("\nSample user input:", train_df['user_input'].values[0])
        print("Sample movie input:", train_df['movie_input'].values[0])
        print("Sample label:", train_df['label'].values[0])

        # Training data is shuffled; both splits are batched and prefetched
        autotune = tf.data.experimental.AUTOTUNE
        train_dataset = _as_dataset(train_df).shuffle(10000).batch(32).prefetch(autotune)
        val_dataset = _as_dataset(val_df).batch(32).prefetch(autotune)

        # Train the model
        model, _history = train_model(
            user_model, movie_model, train_dataset, val_dataset, similarity_metric='cosine'
        )

        # Print recommendations for a couple of users
        for user_id in (1, 2):
            print(f"\nRecommendations for User {user_id}:")
            recommendations = get_movie_recommendations(
                model, user_id, filtered_movies, user_movie_map, filtered_ratings, top_n=5
            )
            for i, (_, movie) in enumerate(recommendations.iterrows(), start=1):
                print(f"{i}. {movie['title']} (Genres: {movie['genres']}) - Score: {movie['score']:.4f}")

    except Exception as e:
        # Report the failure and keep the notebook cell from raising
        print(f"Error encountered: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Ratings data shape: (40660, 4)
Movies data shape: (436, 4)
User Tower:


Movie Tower:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_movies['title_genre'] = filtered_movies['title'] + " " + filtered_movies['genres']


Created training data with 40653 samples
Training set: 32522 samples
Validation set: 8131 samples

Sample user input: Toy Story (1995) Twelve Monkeys (a.k.a. 12 Monkeys) (1995) Babe (1995) Clueless (1995) Usual Suspects, The (1995) Birdcage, The (1996) Apollo 13 (1995) Star Wars: Episode IV - A New Hope (1977) Star Trek: Generations (1994) Forrest Gump (1994) Lion King, The (1994) Mask, The (1994) Fugitive, The (1993) Philadelphia (1993) Schindler's List (1993) Blade Runner (1982) Terminator 2: Judgment Day (1991) Batman (1989) Silence of the Lambs, The (1991) Beauty and the Beast (1991) Godfather, The (1972) Rear Window (1954) Wizard of Oz, The (1939) 2001: A Space Odyssey (1968) Mary Poppins (1964) Sound of Music, The (1965) Die Hard (1988) E.T. the Extra-Terrestrial (1982) Monty Python and the Holy Grail (1975) Star Wars: Episode V - The Empire Strikes Back (1980) Princess Bride, The (1987) Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) Aliens (1986) 

In [None]:
    # Join the filtered ratings to the movie table, then display user 2's rows
    ratings_with_movies = pd.merge(filtered_ratings, filtered_movies, on="movieId")
    ratings_with_movies.loc[ratings_with_movies['userId'] == 2]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,title_genre
115,2,318,3.0,1445714835,"Shawshank Redemption, The (1994)",Crime|Drama,"Shawshank Redemption, The (1994) Crime|Drama"
116,2,1704,4.5,1445715228,Good Will Hunting (1997),Drama|Romance,Good Will Hunting (1997) Drama|Romance
117,2,3578,4.0,1445714885,Gladiator (2000),Action|Adventure|Drama,Gladiator (2000) Action|Adventure|Drama
118,2,6874,4.0,1445714952,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,Kill Bill: Vol. 1 (2003) Action|Crime|Thriller
119,2,48516,4.0,1445715064,"Departed, The (2006)",Crime|Drama|Thriller,"Departed, The (2006) Crime|Drama|Thriller"
120,2,58559,4.5,1445715141,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,"Dark Knight, The (2008) Action|Crime|Drama|IMAX"
121,2,68157,4.5,1445715154,Inglourious Basterds (2009),Action|Drama|War,Inglourious Basterds (2009) Action|Drama|War
122,2,71535,3.0,1445714974,Zombieland (2009),Action|Comedy|Horror,Zombieland (2009) Action|Comedy|Horror
123,2,74458,4.0,1445714926,Shutter Island (2010),Drama|Mystery|Thriller,Shutter Island (2010) Drama|Mystery|Thriller
124,2,79132,4.0,1445714841,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,Inception (2010) Action|Crime|Drama|Mystery|Sc...
