In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# Load ratings data
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/movielens/u.data', sep='\t', names=column_names)

# Load movie titles
movie_titles = pd.read_csv('../data/movielens/u.item', sep='|', encoding='latin-1',
                           usecols=[0, 1], names=['item_id', 'title'])

# Merge the datasets
data = pd.merge(ratings, movie_titles, on='item_id')


In [3]:
data['interaction'] = data['rating'].apply(lambda x: 1 if x >= 4 else 0)

In [4]:
user_ids = data['user_id'].unique().tolist()
item_ids = data['item_id'].unique().tolist()

user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
item_id_to_idx = {item_id: idx for idx, item_id in enumerate(item_ids)}

data['user_idx'] = data['user_id'].map(user_id_to_idx)
data['item_idx'] = data['item_id'].map(item_id_to_idx)


In [5]:
train_data, test_data = train_test_split(
    data[['user_idx', 'item_idx', 'interaction']], test_size=0.2, random_state=42)


In [6]:
num_users = len(user_ids)
num_items = len(item_ids)
embedding_size = 32  # Adjustable based on experimentation


In [7]:
# User input and embedding
user_input = keras.Input(shape=(1,), name='user_input')
user_embedding = layers.Embedding(num_users, embedding_size, name='user_embedding')(user_input)
user_embedding = layers.Flatten()(user_embedding)

# Item input and embedding
item_input = keras.Input(shape=(1,), name='item_input')
item_embedding = layers.Embedding(num_items, embedding_size, name='item_embedding')(item_input)
item_embedding = layers.Flatten()(item_embedding)

# Concatenate user and item embeddings
concat = layers.Concatenate()([user_embedding, item_embedding])

# MLP layers
dense = layers.Dense(128, activation='relu')(concat)
dense = layers.Dense(64, activation='relu')(dense)
dense = layers.Dense(32, activation='relu')(dense)

# Output layer
output = layers.Dense(1, activation='sigmoid')(dense)

# Define the model
ncf_model = keras.Model(inputs=[user_input, item_input], outputs=output)

# Compile the model
ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
def create_negative_samples(df, num_negatives=4):
    import random
    users, items, labels = [], [], []
    user_item_set = set(zip(df['user_idx'], df['item_idx']))
    all_items = set(range(num_items))
    for (u, i) in user_item_set:
        users.append(u)
        items.append(i)
        labels.append(1)
        for _ in range(num_negatives):
            negative_item = random.choice(list(all_items - set([i])))
            users.append(u)
            items.append(negative_item)
            labels.append(0)
    return pd.DataFrame({'user_idx': users, 'item_idx': items, 'interaction': labels})

# Create new train and test datasets with negative samples
train_data_neg = create_negative_samples(train_data, num_negatives=4)
test_data_neg = create_negative_samples(test_data, num_negatives=4)


In [8]:
from keras_tuner import HyperModel
from keras_tuner import RandomSearch

def build_model(hp):
    # User input and embedding
    user_input = keras.Input(shape=(1,), name='user_input')
    user_embedding = layers.Embedding(num_users, hp.Int('embedding_size', min_value=16, max_value=64, step=16))(user_input)
    user_embedding = layers.Flatten()(user_embedding)

    # Item input and embedding
    item_input = keras.Input(shape=(1,), name='item_input')
    item_embedding = layers.Embedding(num_items, hp.Int('embedding_size', min_value=16, max_value=64, step=16))(item_input)
    item_embedding = layers.Flatten()(item_embedding)

    # Concatenate user and item embeddings
    concat = layers.Concatenate()([user_embedding, item_embedding])

    # MLP layers with hyperparameters for dense units and dropout rate
    dense = layers.Dense(hp.Int('units_1', min_value=64, max_value=256, step=64), activation='relu')(concat)
    dense = layers.Dropout(hp.Float('dropout_1', 0.1, 0.5, step=0.1))(dense)
    dense = layers.Dense(hp.Int('units_2', min_value=32, max_value=128, step=32), activation='relu')(dense)
    dense = layers.Dropout(hp.Float('dropout_2', 0.1, 0.5, step=0.1))(dense)

    # Output layer
    output = layers.Dense(1, activation='sigmoid')(dense)

    model = keras.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


In [9]:
# Prepare inputs for training and testing
train_user = train_data['user_idx'].values
train_item = train_data['item_idx'].values
train_label = train_data['interaction'].values

test_user = test_data['user_idx'].values
test_item = test_data['item_idx'].values
test_label = test_data['interaction'].values


In [10]:
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_dir',
    project_name='movie_recommender'
)

# Run the tuner search
tuner.search(
    [train_user, train_item],
    train_label,
    validation_data=([test_user, test_item], test_label),
    batch_size=256,
    epochs=20,
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)


Reloading Tuner from tuner_dir\movie_recommender\tuner0.json


In [11]:
# Retrieve the best model from the tuner
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model with early stopping
history = best_model.fit(
    [train_user, train_item],
    train_label,
    batch_size=256,
    epochs=50,
    validation_data=([test_user, test_item], test_label),
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
)



Epoch 1/50


  saveable.load_own_variables(weights_store.get(inner_path))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7428 - loss: 0.5142 - val_accuracy: 0.7096 - val_loss: 0.5614
Epoch 2/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7505 - loss: 0.5058 - val_accuracy: 0.7128 - val_loss: 0.5628
Epoch 3/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7585 - loss: 0.4946 - val_accuracy: 0.7127 - val_loss: 0.5629
Epoch 4/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7608 - loss: 0.4906 - val_accuracy: 0.7113 - val_loss: 0.5668


In [12]:
loss, accuracy = best_model.evaluate([test_user, test_item], test_label)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 508us/step - accuracy: 0.7102 - loss: 0.5662
Test Loss: 0.5668, Test Accuracy: 0.7113


In [13]:
def get_liked_movies(user_id, num_movies=10):
    user_data = data[(data['user_id'] == user_id) & (data['rating'] >= 4)]
    liked_movies = user_data.sample(n=min(num_movies, len(user_data)))['title'].tolist()
    return liked_movies

def get_disliked_movies(user_id, num_movies=10):
    user_data = data[(data['user_id'] == user_id) & (data['rating'] <= 2)]
    disliked_movies = user_data.sample(n=min(num_movies, len(user_data)))['title'].tolist()
    return disliked_movies


In [14]:
def recommend_movies(user_id, num_recommendations=10):
    user_idx = user_id_to_idx.get(user_id)
    if user_idx is None:
        print("User ID not found.")
        return []
    
    # Items the user has interacted with
    user_data = data[data['user_idx'] == user_idx]
    interacted_items = set(user_data['item_idx'].tolist())
    
    # Items not yet interacted with
    all_items = set(range(num_items))
    items_to_predict = list(all_items - interacted_items)
    
    # Predict interaction scores
    user_array = np.full(len(items_to_predict), user_idx)
    item_array = np.array(items_to_predict)
    
    predictions = best_model.predict([user_array, item_array], batch_size=1024).flatten()
    
    # Get top N items
    top_indices = predictions.argsort()[-num_recommendations:][::-1]
    recommended_item_idxs = [items_to_predict[i] for i in top_indices]
    
    # Map item indices to titles
    recommended_item_ids = [item_ids[idx] for idx in recommended_item_idxs]
    recommended_titles = movie_titles[movie_titles['item_id'].isin(recommended_item_ids)]['title'].tolist()
    
    return recommended_titles


In [15]:
import random

# Choose a random user ID from the dataset
random_user_id = random.choice(user_ids)

# Get liked, disliked, and recommended movies
liked_movies = get_liked_movies(random_user_id, num_movies=10)
disliked_movies = get_disliked_movies(random_user_id, num_movies=10)
recommended_movies = recommend_movies(random_user_id, num_recommendations=10)

# Display the final report
print(f"Final Report for User {random_user_id}:")
print("\nMovies They Liked:")
for idx, title in enumerate(liked_movies, 1):
    print(f"{idx}. {title}")

print("\nMovies They Didn't Like:")
for idx, title in enumerate(disliked_movies, 1):
    print(f"{idx}. {title}")

print("\nRecommended Movies They Might Like:")
for idx, title in enumerate(recommended_movies, 1):
    print(f"{idx}. {title}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Final Report for User 376:

Movies They Liked:
1. Graduate, The (1967)
2. Full Monty, The (1997)
3. One Flew Over the Cuckoo's Nest (1975)
4. Seven (Se7en) (1995)
5. Monty Python's Life of Brian (1979)
6. Sense and Sensibility (1995)
7. Rear Window (1954)
8. Sling Blade (1996)
9. Enchanted April (1991)
10. Beautiful Girls (1996)

Movies They Didn't Like:

Recommended Movies They Might Like:
1. Whole Wide World, The (1996)
2. Crossfire (1947)
3. When We Were Kings (1996)
4. Fresh (1994)
5. Faust (1994)
6. Mina Tannenbaum (1994)
7. Anna (1996)
8. Pather Panchali (1955)
9. Cérémonie, La (1995)
10. Bitter Sugar (Azucar Amargo) (1996)


In [16]:
# Save the best model
best_model.save('../app/models/ncf_model.h5')




In [21]:
import pickle

# Save user_id_to_idx mapping
with open('../app/models/user_id_to_idx.pkl', 'wb') as f:
    pickle.dump(user_id_to_idx, f)

# Save item_ids list (index corresponds to item_idx)
with open('../app/models/item_ids.pkl', 'wb') as f:
    pickle.dump(item_ids, f)

# Save item_id_to_title mapping
item_id_to_title = dict(zip(movie_titles['item_id'], movie_titles['title']))
with open('../app/models/item_id_to_title.pkl', 'wb') as f:
    pickle.dump(item_id_to_title, f)

# Save data DataFrame (if needed for liked/disliked movies)
data.to_pickle('../app/models/data.pkl')


In [22]:
# Save the list of user IDs
with open('../app/models/user_ids.pkl', 'wb') as f:
    pickle.dump(user_ids, f)


In [26]:
# Print all layer names in the ncf_model to verify available layers
for layer in ncf_model.layers:
    print(layer.name)


user_input
item_input
embedding
embedding_1
flatten
flatten_1
concatenate
dense
dropout
dense_1
dropout_1
dense_2


In [35]:
import os
import requests

# Set up the API URL and API key (replace with your actual API key)
API_KEY = os.getenv("MOVIE_API_KEY", "16138e16f189133c632250017548d0db")
BASE_URL = "https://api.themoviedb.org/3"

# Define a function to get movie poster URLs for a given movie title
def get_movie_data(movie_title):
    search_url = f"{BASE_URL}/search/movie"
    params = {
        "api_key": API_KEY,
        "query": movie_title
    }
    response = requests.get(search_url, params=params)
    
    try:
        data = response.json()
        # Print the data to inspect the structure
        print(f"Response data for '{movie_title}':", data)

        # Check if any results were returned
        if "results" in data and data["results"]:
            movie_info = data["results"][0]  # Take the first result
            poster_path = movie_info.get("poster_path")
            if poster_path:
                poster_url = f"https://image.tmdb.org/t/p/w500{poster_path}"
            else:
                poster_url = "No poster found"
            return {"title": movie_info["title"], "poster_url": poster_url}
        else:
            return {"title": movie_title, "poster_url": "No poster found or error in response"}
    
    except Exception as e:
        print(f"Error processing movie '{movie_title}':", e)
        return {"title": movie_title, "poster_url": "Error occurred"}

# Test with a sample list of movie titles
movie_titles = ["Inception (2010)"]
movie_data = [get_movie_data(title) for title in movie_titles]

# Display the results
for movie in movie_data:
    print(f"Title: {movie['title']}, Poster URL: {movie['poster_url']}")


Response data for 'Inception (2010)': {'page': 1, 'results': [], 'total_pages': 1, 'total_results': 0}
Title: Inception (2010), Poster URL: No poster found or error in response


In [29]:
!pip install requests



