In [1]:
import pandas as pd
import os 

## Data Processing 

In [2]:
# Ratings: tab-separated file without a header row.
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Movies: pipe-separated, latin-1 encoded, no header row.
# NOTE(review): only 21 column names are supplied here. The movies preview
# further down shows three unnamed leading index levels, which suggests the
# file has more fields than names and pandas promoted the leading ones to the
# index. If so, the 'item_id'/'title' labels do not hold the id/title - confirm.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=['item_id', 'title', *(f'genre_{i}' for i in range(19))],
)

# Users: pipe-separated, no header row.
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Human-readable genre labels; a later cell assigns them positionally as the
# column names of the movies frame, so the order matters.
genres = [
    "Unknown", "Action", "Adventure", "Animation", "Children's",
    "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]

In [3]:
# Preview the user table read from ml-100k/u.user.
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [4]:
# Keep only heavy raters and turn the star rating into a binary like/dislike label.
# NOTE: this cell rebinds `ratings`; re-running it without reloading the data
# would apply the threshold to already-binarised values (everything becomes 0).
MIN_USER_RATINGS = 200  # a user must have MORE than this many ratings to be kept
LIKE_THRESHOLD = 2.5    # ratings strictly above this become 1, the rest 0

user_interactions = ratings.groupby('user_id').size()
user_indexes = user_interactions[user_interactions > MIN_USER_RATINGS].index

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
ratings = ratings[ratings.user_id.isin(user_indexes)].copy()
ratings['rating'] = (ratings['rating'] > LIKE_THRESHOLD).astype('int64')
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
3,244,51,0,880606923
8,305,451,1,886324817
9,6,86,1,883603013
10,62,257,0,879372434
11,286,1014,1,879781125
...,...,...,...,...
99994,378,78,1,880056976
99995,880,476,1,880175444
99996,716,204,1,879795543
99997,276,1090,0,874795795


In [5]:
# Restrict movies to those that appear in the filtered ratings and keep only
# the genre indicator columns, renamed to their human-readable labels.
# Index level 0 is used as the item id and level 1 as the title.
rated_mask = movies.index.get_level_values(0).isin(ratings.item_id)

# Select the genre columns explicitly instead of relying on dropna(axis=1)
# happening to remove exactly the non-genre columns (which only works when
# each of those columns contains at least one NaN).
genre_columns = [col for col in movies.columns if str(col).startswith('genre_')]
assert len(genre_columns) == len(genres), (
    "expected one 'genre_*' column per genre label; "
    "reload the movies data before re-running this cell"
)

movies = movies.loc[rated_mask, genre_columns].copy()
movies.columns = genres
movies["Title"] = movies.index.get_level_values(1)
movies

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Title
1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Toy Story (1995)
2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,GoldenEye (1995)
3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,Four Rooms (1995)
4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,Get Shorty (1995)
5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,Copycat (1995)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,"Sunchaser, The (1996)",25-Oct-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"Sunchaser, The (1996)"
1676,"War at Home, The (1996)",01-Jan-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"War at Home, The (1996)"
1677,Sweet Nothing (1995),20-Sep-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,Sweet Nothing (1995)
1681,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,You So Crazy (1994)


## Recommendation Evaluation

1. Add more (preferably more complex) recommendation strategies and models.

2. Extend the analysis to include recommendation metrics that offer a different perspective (consider whether accuracy is really the most important thing for us).

In [6]:
from sklearn.model_selection import train_test_split
import random 

# Pick one of the remaining users at random.
# NOTE(review): the draw is unseeded, and in the visible notebook test_user is
# drawn again in a later cell before it is ever used, so this value is discarded.
test_user = random.choice(ratings.user_id.unique())

def get_data_for_user(ratings, movies, user_id, test_size=0.3, random_state=42):
    """Build a train/test split of movie features and ratings for one user.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain the columns 'user_id', 'item_id', 'rating' and 'timestamp'.
    movies : pandas.DataFrame
        Movie feature frame whose index level 0 holds the item id; it is
        matched against ``ratings.item_id``.
    user_id
        Value compared against ``ratings.user_id`` to select the user's rows.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    The value returned by ``train_test_split(X, y, ...)``, where ``X`` is the
    merged frame without the 'rating' column and ``y`` is the 'rating' column.

    Raises
    ------
    ValueError
        If ``user_id`` has no rows in ``ratings``.
    """
    temp_ratings = ratings[ratings.user_id == user_id]
    if temp_ratings.empty:
        # Fail early with a clear message instead of an opaque error from the
        # splitter when it is handed zero samples.
        raise ValueError(f"user_id {user_id!r} has no ratings")

    temp_movies = movies[movies.index.get_level_values(0).isin(temp_ratings.item_id)]

    # Join on the item id: index level 0 on the movie side, 'item_id' on the
    # ratings side. Bookkeeping columns are dropped afterwards.
    temp_data = temp_movies.merge(
        temp_ratings,
        left_on=temp_movies.index.get_level_values(0),
        right_on='item_id',
    ).drop(['item_id', 'user_id', 'timestamp'], axis=1)

    X = temp_data.drop('rating', axis=1)
    y = temp_data['rating']
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

## Feedback 

In [34]:
from langchain_openai import ChatOpenAI
from feedback import *
from langchain_core.runnables import RunnableLambda

In [7]:
# Read the OpenAI key from a local text file and publish it through the
# OPENAI_API_KEY environment variable (presumably where the OpenAI client
# looks for it - confirm).
with open("openai_api_key.txt") as key_file:
    raw_key = key_file.read()

api_key = raw_key.strip()  # drop the trailing newline / surrounding whitespace
os.environ["OPENAI_API_KEY"] = api_key

In [9]:
# Map each movie title to the list of genre labels flagged with 1 for it.
# Built in one pass over the rows instead of re-filtering the whole frame for
# every (title, genre) pair.
movie_genres = {}

genre_rows = movies[genres].itertuples(index=False, name=None)
for title, flags in zip(movies.Title, genre_rows):
    # If a title occurs more than once, keep the genres of its first row,
    # which is what looking up `.values[0]` per title resolves to.
    if title not in movie_genres:
        movie_genres[title] = [genre for genre, flag in zip(genres, flags) if flag == 1]

movie_genres["Toy Story (1995)"]

['Animation', "Children's", 'Comedy']

In [10]:
# Draw the evaluation user from a dedicated, seeded generator so the notebook
# selects the same user on every Restart & Run All (and the global `random`
# state is left untouched).
test_user = random.Random(42).choice(ratings.user_id.unique())
X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, test_user)

In [11]:
# Titles from the training split, separated by the binary label
# (1 = liked, 0 = disliked).
enjoyed_movies = X_train[y_train == 1].Title
disliked_movies = X_train[y_train == 0].Title

# Take one held-out movie as the "recommendation" to be judged. The draw is
# seeded so the same movie is picked on every run.
recommended_movie = X_test.sample(1, random_state=42).Title.values[0]

In [16]:
# Chat model built with default constructor arguments (no model name or
# temperature is specified here).
model = ChatOpenAI()
# Wrap the model so that invoking it yields a FeatureBasedFeedback object
# (presumably defined in the `feedback` module pulled in by the star import - confirm).
llm = model.with_structured_output(FeatureBasedFeedback)

In [32]:
# Keys read from the input dict, in the positional order in which they are
# handed to get_feedback_prompt.
PROMPT_ARG_KEYS = (
    "enjoyed_movies",
    "disliked_movies",
    "movie_genres",
    "recommended_movie",
)

# Runnable that turns an input dict into a feedback prompt.
feedback_chain = RunnableLambda(
    lambda payload: get_feedback_prompt(*[payload[key] for key in PROMPT_ARG_KEYS])
)

# Example inputs
input_data = dict(
    enjoyed_movies=enjoyed_movies,
    disliked_movies=disliked_movies,
    movie_genres=movie_genres,
    recommended_movie=recommended_movie,
)

# Build the prompt, then ask the structured-output LLM for feedback.
prompt = feedback_chain.invoke(input_data)
response = llm.invoke(prompt)

In [33]:
# Display the structured feedback object returned by the LLM.
response

FeatureBasedFeedback(binary_rating=True, positive_genres=['Crime'], negative_genres=['Thriller'], feedback='The Usual Suspects aligns with my preferences because I like Crime genres, even though I am not a fan of Thriller movies.')

In [39]:
# Look up the user's actual binary rating for the recommended movie, for
# comparison with the LLM's binary_rating. X_test and y_test come from the same
# split call, so the mask built on X_test is used to select from y_test.
y_test.loc[X_test.Title == recommended_movie]

7    1
Name: rating, dtype: int64