diff --git a/app.py b/app.py
new file mode 100644
index 0000000..3bc8947
--- /dev/null
+++ b/app.py
@@ -0,0 +1,130 @@
+from flask import Flask, render_template, request, jsonify
+import sqlite3
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.neighbors import NearestNeighbors
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import random
+from joblib import Parallel, delayed
+
+app = Flask(__name__)
+
+# Connect to the SQLite database
+conn = sqlite3.connect('movie_ratings_db.sqlite')
+
+# Query a subset of the data to speed up development/testing
+# Adjust the LIMIT clause based on your dataset size
+query = """
+    SELECT r.userId, r.movieId, r.rating, m.title, m.genres
+    FROM ratings r
+    JOIN movies m ON r.movieId = m.movieId
+    LIMIT 10000 -- Adjust this limit based on your dataset size
+"""
+# Everything the app needs is read into memory here, so release the
+# connection straight away. Relying on the `__main__` guard to close it leaks
+# the handle whenever this module is imported by a server instead of run
+# as a script.
+try:
+    df = pd.read_sql_query(query, conn)
+finally:
+    conn.close()
+
+# Assuming the genres column is in the format "Genre1|Genre2|Genre3"
+# Convert genres into a list
+df['genres'] = df['genres'].str.split('|')
+
+# Content-Based Filtering (using Genres)
+df['genres_str'] = df['genres'].apply(lambda x: ' '.join(x))
+
+# `df` holds one row per rating, so a movie is repeated once per user who
+# rated it. Index one row per movie instead; otherwise the nearest neighbours
+# of a movie are just its own duplicate rows.
+movies_df = df.drop_duplicates(subset='movieId').reset_index(drop=True)
+tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Adjust max_features
+tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['genres_str'])
+
+# Nearest-neighbour index over the per-movie genre vectors (cosine distance)
+nn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')
+nn.fit(tfidf_matrix)
+
+# Collaborative Filtering (User-Item Interactions)
+# Rows are users, columns are titles; unrated cells are filled with 0.
+user_movie_ratings = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
+movie_user_ratings = user_movie_ratings.T
+# Square movie-by-movie matrix; row/column order follows movie_user_ratings.index.
+movie_similarity = cosine_similarity(movie_user_ratings)
+
+
+def get_similar_items_nn(movie_index, top_n=5):
+    """Return titles of the movies whose genres are closest to the given one.
+
+    Args:
+        movie_index: Positional row index into ``df`` (a rating row) that
+            identifies the query movie.
+        top_n: Maximum number of titles to return.
+
+    Returns:
+        A list of at most ``top_n`` distinct titles, nearest first. The query
+        movie itself is never included.
+    """
+    movie_id = df.iloc[movie_index]['movieId']
+    position = movies_df.index[movies_df['movieId'] == movie_id][0]
+
+    # Ask for one extra neighbour because the query movie is its own closest
+    # match, and never ask for more rows than the index contains.
+    k = min(top_n + 1, movies_df.shape[0])
+    _, indices = nn.kneighbors(tfidf_matrix[position], n_neighbors=k)
+
+    neighbours = movies_df.iloc[indices[0]]
+    titles = neighbours.loc[neighbours['movieId'] != movie_id, 'title']
+    # dict.fromkeys drops repeated titles while keeping nearest-first order.
+    return list(dict.fromkeys(titles))[:top_n]
+
+# Function 
to get similar items based on Collaborative Filtering
+def get_similar_items_cf(movie_title, top_n=5):
+    """Return the titles whose user-rating patterns best match ``movie_title``.
+
+    Args:
+        movie_title: Title to look up among the columns of
+            ``user_movie_ratings``.
+        top_n: Maximum number of titles to return.
+
+    Returns:
+        Up to ``top_n`` titles ordered by decreasing cosine similarity, never
+        including ``movie_title`` itself. Empty if the title has no ratings.
+    """
+    if movie_title not in user_movie_ratings.columns:
+        return []  # Return an empty list if the movie has no ratings
+
+    # movie_similarity is a square movie-by-movie matrix whose rows follow
+    # movie_user_ratings.index, so the scores for one movie are simply its
+    # row. Multiplying the per-user rating vector by that matrix is not
+    # valid: the shapes only agree when the number of users happens to equal
+    # the number of movies.
+    movie_position = movie_user_ratings.index.get_loc(movie_title)
+    similar_movies_df = pd.DataFrame({
+        'movie': movie_user_ratings.index,
+        'similarity': movie_similarity[movie_position],
+    })
+
+    # A movie is always most similar to itself; do not recommend it back.
+    similar_movies_df = similar_movies_df[similar_movies_df['movie'] != movie_title]
+    similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False).head(top_n)
+
+    return similar_movies_df['movie'].tolist()
+
+
+def _resolve_title(movie):
+    """Map a title or a movieId to a title present in ``df``; None if unknown.
+
+    The rating route stores the ``movie_id`` form value, which arrives as a
+    string, so ids are compared as strings. A matching title takes
+    precedence over a matching id.
+    """
+    if (df['title'] == movie).any():
+        return movie
+    by_id = df.loc[df['movieId'].astype(str) == str(movie), 'title']
+    return by_id.iloc[0] if not by_id.empty else None
+
+
+# Function to get hybrid recommendations (combining CF and CB)
+def get_hybrid_recommendations(selected_movies, top_n=5):
+    """Combine collaborative and genre-based suggestions for the given movies.
+
+    Args:
+        selected_movies: Titles (or movieIds) the user liked. Entries that
+            match nothing in ``df`` are ignored.
+        top_n: Maximum number of recommendations to return.
+
+    Returns:
+        Up to ``top_n`` distinct titles, alternating between the
+        collaborative and the genre-based rankings, excluding the selected
+        movies themselves. Empty if no selected movie is known.
+    """
+    titles = [title for title in map(_resolve_title, selected_movies) if title is not None]
+    if not titles:
+        return []
+
+    # Plain loops: each lookup is tiny, and starting a process pool per
+    # request costs far more than the work it would parallelise.
+    cf_recommendations = [item for title in titles for item in get_similar_items_cf(title, top_n=top_n)]
+    cb_recommendations = [
+        item
+        for title in titles
+        for item in get_similar_items_nn(df[df['title'] == title].index[0])
+    ]
+
+    # One genre list per title, so the filter below does not rescan df.
+    genres_by_title = df.drop_duplicates(subset='title').set_index('title')['genres']
+    genre_filter = set().union(*genres_by_title.loc[titles])
+    cb_recommendations_filtered = [
+        movie for movie in cb_recommendations
+        if genre_filter.intersection(genres_by_title.at[movie])
+    ]
+
+    # Interleave the two rankings and de-duplicate in order. Going through a
+    # set would throw the ranking away and vary from run to run.
+    seen = set(titles)
+    hybrid_recommendations = []
+    for rank in range(max(len(cf_recommendations), len(cb_recommendations_filtered))):
+        for ranked in (cf_recommendations, cb_recommendations_filtered):
+            if rank < len(ranked) and ranked[rank] not in seen:
+                seen.add(ranked[rank])
+                hybrid_recommendations.append(ranked[rank])
+
+    return hybrid_recommendations[:top_n]
+
+# Initialize variables to keep track of liked movies
+liked_movies = []
+max_likes = 5 # Set the maximum 
number of liked movies
+
+
+# Function to get random movie for rating
+def get_random_movie():
+    """Return one randomly sampled row of ``df`` as a pandas Series."""
+    return df.sample(1).iloc[0]
+
+
+# Routes
+@app.route('/')
+def index():
+    """Show a random movie to rate, or a notice once enough movies are liked."""
+    if len(liked_movies) < max_likes:
+        return render_template('index.html', random_movie=get_random_movie())
+    return "You have reached the maximum number of liked movies. Check your recommendations!"
+
+
+@app.route('/rate_movie', methods=['POST'])
+def rate_movie():
+    """Record feedback for one movie and answer with the next step as JSON.
+
+    Reads the ``movie_id`` and ``feedback`` form fields. Returns
+    ``{'random_movie': ...}`` while fewer than ``max_likes`` movies are
+    liked, and ``{'recommendations': [...]}`` afterwards.
+    """
+    # NOTE(review): liked_movies is a module-level list, so it is shared by
+    # every visitor of this process and never reset - confirm that is intended.
+    movie_id = request.form['movie_id']
+    feedback = request.form['feedback']
+
+    # Here you can save the user's feedback to the database if needed
+    # For simplicity, let's just print the movie ID and feedback
+    print(f"User provided feedback for Movie ID {movie_id}: {feedback}")
+
+    # Add the movie to the liked movies list if feedback is 'like'
+    if feedback == 'like':
+        # The recommender looks movies up by title, so translate the id
+        # before storing it. Form values are strings, hence the string
+        # comparison against the movieId column.
+        matches = df.loc[df['movieId'].astype(str) == str(movie_id), 'title']
+        if not matches.empty:
+            liked_title = matches.iloc[0]
+        elif (df['title'] == movie_id).any():
+            liked_title = movie_id  # the form already sent a title
+        else:
+            liked_title = None  # unknown movie: nothing to learn from it
+
+        # Liking the same movie twice must not count twice.
+        if liked_title is not None and liked_title not in liked_movies:
+            liked_movies.append(liked_title)
+
+    # Get another random movie for the user to provide feedback
+    if len(liked_movies) < max_likes:
+        random_movie = get_random_movie()
+        return jsonify({'random_movie': random_movie.to_dict()})
+
+    # Provide recommendations when the user reaches the maximum liked movies
+    hybrid_recommendations = get_hybrid_recommendations(liked_movies, top_n=5)
+    return jsonify({'recommendations': hybrid_recommendations})
+
+
+if __name__ == '__main__':
+    conn.close()
+    app.run(debug=True)
\ No newline at end of file
diff --git a/app2.py b/app2.py
new file mode 100644
index 0000000..b950f5e
--- /dev/null
+++ b/app2.py
@@ -0,0 +1,184 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import numpy as np
+import datetime as dt
+import os
+
+import sqlite3
+
+import sqlalchemy as db
+from sqlalchemy.ext.automap import automap_base
+from sqlalchemy.orm import Session
+from sqlalchemy import create_engine, func, inspect
+
+from flask import Flask, jsonify, render_template, request, session
+
+import pandas as pd
+
+from 
sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+import random
+from joblib import Parallel, delayed
+
+engine = create_engine("sqlite:///movie_ratings_db.sqlite")
+
+conn = sqlite3.connect('movie_ratings_db.sqlite')
+
+# Query a subset of the data to speed up development/testing
+# Adjust the LIMIT clause based on your dataset size
+query = """
+    SELECT r.userId, r.movieId, r.rating, m.title, m.genres
+    FROM ratings r
+    JOIN movies m ON r.movieId = m.movieId
+    LIMIT 10000 -- Adjust this limit based on your dataset size
+"""
+# The whole working set is loaded into memory here, so the connection can be
+# released as soon as the query has run.
+try:
+    df = pd.read_sql_query(query, conn)
+finally:
+    conn.close()
+
+# Assuming the genres column is in the format "Genre1|Genre2|Genre3"
+# Convert genres into a list
+df['genres'] = df['genres'].str.split('|')
+
+# Content-Based Filtering (using Genres)
+df['genres_str'] = df['genres'].apply(lambda x: ' '.join(x))
+tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Adjust max_features
+tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres_str'])
+
+# Approximate Nearest Neighbors with NearestNeighbors
+nn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')
+nn.fit(tfidf_matrix)
+
+# Collaborative Filtering (User-Item Interactions)
+# Built once at import time: the data does not change between requests, so
+# rebuilding the pivot table and similarity matrix per POST is wasted work.
+user_movie_ratings = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
+movie_user_ratings = user_movie_ratings.T
+movie_similarity = cosine_similarity(movie_user_ratings)
+
+# Number of likes after which recommendations are generated
+LIKES_FOR_RECOMMENDATIONS = 5
+
+# Most recent list of recommendations produced by the engine
+result = []
+
+app = Flask(__name__)
+# A key regenerated on every start invalidates all sessions on restart and
+# differs between worker processes, so prefer a configured key when present.
+app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(24)
+
+
+# Function to get similar items based on NearestNeighbors
+def get_similar_items_nn(movie_index):
+    """Return the titles of the rows of ``df`` nearest in genre to one row.
+
+    NOTE(review): the index is fitted on one row per rating, so the
+    neighbours are frequently other rating rows of the same movie.
+    """
+    distances, indices = nn.kneighbors(tfidf_matrix[movie_index])
+    return df.iloc[indices[0]]['title'].tolist()
+
+
+# Function to get similar items based on Collaborative Filtering
+def get_similar_items_cf(movie_title, top_n=5):
+    """Return up to ``top_n`` titles rated most similarly to ``movie_title``.
+
+    Returns an empty list when the title has no ratings.
+    """
+    if movie_title not in user_movie_ratings.columns:
+        return []
+
+    # Rows of movie_similarity follow movie_user_ratings.index.
+    movie_position = movie_user_ratings.index.get_loc(movie_title)
+    similar_movies_df = pd.DataFrame({
+        'movie': movie_user_ratings.index,
+        'similarity': movie_similarity[movie_position],
+    })
+
+    # Sort by similarity and get the top N
+    similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False).head(top_n)
+    return similar_movies_df['movie'].tolist()
+
+
+# Function to get hybrid recommendations (combining CF and CB)
+def get_hybrid_recommendations(selected_movies, top_n=5):
+    """Merge collaborative and genre-based suggestions for the given titles.
+
+    Titles that do not occur in ``df`` are ignored. Returns up to ``top_n``
+    distinct titles, excluding the selected movies themselves.
+    """
+    known_titles = set(df['title'])
+    titles = [title for title in selected_movies if title in known_titles]
+    if not titles:
+        return []
+
+    # Plain loops: a process pool per request costs more than these lookups.
+    cf_recommendations = [item for title in titles for item in get_similar_items_cf(title, top_n=top_n)]
+    cb_recommendations = [
+        item
+        for title in titles
+        for item in get_similar_items_nn(df[df['title'] == title].index[0])
+    ]
+
+    genres_by_title = df.drop_duplicates(subset='title').set_index('title')['genres']
+    genre_filter = set().union(*genres_by_title.loc[titles])
+    cb_recommendations_filtered = [
+        movie for movie in cb_recommendations
+        if genre_filter.intersection(genres_by_title.at[movie])
+    ]
+
+    # Interleave both rankings and de-duplicate in order; a set would drop
+    # the ranking and give a different answer from run to run.
+    seen = set(titles)
+    hybrid_recommendations = []
+    for rank in range(max(len(cf_recommendations), len(cb_recommendations_filtered))):
+        for ranked in (cf_recommendations, cb_recommendations_filtered):
+            if rank < len(ranked) and ranked[rank] not in seen:
+                seen.add(ranked[rank])
+                hybrid_recommendations.append(ranked[rank])
+
+    return hybrid_recommendations[:top_n]
+
+
+# Function to pick a random list of seed movies
+def choose_movies():
+    """Return up to 10 distinct, randomly chosen titles from ``df``."""
+    unique_titles = df['title'].drop_duplicates().tolist()
+    # random.sample raises if asked for more items than exist.
+    return random.sample(unique_titles, min(10, len(unique_titles)))
+
+
+# Function to (re)draw recommendations
+def redraw_recommendations(selected_movies=None):
+    """Compute hybrid recommendations, seeding with random movies if needed.
+
+    A web request cannot wait on console input, so this performs a single
+    draw; call it again to redraw.
+    """
+    if not selected_movies:
+        selected_movies = choose_movies()
+    hybrid_recommendations = get_hybrid_recommendations(selected_movies, top_n=5)
+    print(f'Hybrid recommendations based on user-selected movies:\n{hybrid_recommendations}')
+    return hybrid_recommendations
+
+
+@app.route("/")
+def welcome():
+    return render_template("index2.html")
+
+@app.route("/genre")
+def genre():
+    return render_template("reregenre.html")
+
+@app.route("/title")
+def title():
+    return render_template("reretitle.html")
+
+# Counter to keep track of the number of movie selections
+selection_counter = 0
+
+@app.route("/recommendation_engine", methods=['GET', 'POST'])
+def rec_engine():
+    """Serve one movie at a time and switch to recommendations after 5 likes.
+
+    GET renders a random movie. POST reads the ``action`` form field
+    (like / dislike / skip / unsure), counts likes in the session and, once
+    the threshold is reached, renders the first recommendation and resets
+    the counter.
+    """
+    global result
+    global selection_counter
+
+    if 'like_counter' not in session:
+        session['like_counter'] = 0
+
+    recommendations = []
+    likes_shown = session['like_counter']
+
+    if request.method == 'POST':
+        action = request.form.get('action')
+        if action in ['like', 'dislike', 'skip', 'unsure']:
+            if action == 'like':
+                session['like_counter'] += 1
+
+            # Increment the selection counter
+            selection_counter += 1
+            likes_shown = session['like_counter']
+
+            # If the user has liked enough movies, generate recommendations
+            if session['like_counter'] >= LIKES_FOR_RECOMMENDATIONS:
+                # Use the movies sent with the form when there are any;
+                # otherwise redraw_recommendations picks random seeds.
+                recommendations = redraw_recommendations(request.form.getlist('selected_movies'))
+                result = recommendations
+
+                # Reset the like counter
+                session['like_counter'] = 0
+
+    # If recommendations were just produced, display the first one
+    if recommendations:
+        return render_template("rec_engine_interactive.html", movie_title=recommendations[0], like_counter=likes_shown)
+
+    # Otherwise continue choosing movies
+    random_movie = df['title'].sample().iloc[0]
+    return render_template("rec_engine_interactive.html", movie_title=random_movie, like_counter=session['like_counter'])
+
+if __name__ == "__main__":
+    app.run(debug=True)
diff --git a/movie_ML.ipynb b/movie_ML.ipynb index 5c9b497..4768eaa 100644 --- a/movie_ML.ipynb +++ b/movie_ML.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 12, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -13,30 +13,40 @@ "from sklearn.neighbors import NearestNeighbors\n", "import numpy as np\n", "import random\n", - "from joblib import Parallel, delayed" + "from joblib import Parallel, delayed\n", + "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Connect to the SQLite database\n", - "conn = sqlite3.connect('movie_ratings_db.sqlite')\n", - "\n", - "# Query the data from the ratings table and join with the movies table\n", + "conn = sqlite3.connect('movie_ratings_db.sqlite')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Query a subset of the data to speed up development/testing\n", + "# Adjust the LIMIT clause based on your dataset size\n", "query = \"\"\"\n", " SELECT r.userId, r.movieId, r.rating, m.title, m.genres\n", " FROM ratings r\n", " JOIN movies m ON r.movieId = m.movieId\n", + " LIMIT 10000 -- Adjust this limit based on your dataset size\n", "\"\"\"\n", "df = 
pd.read_sql_query(query, conn)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -47,31 +57,31 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Content-Based Filtering (using Genres)\n", "df['genres_str'] = df['genres'].apply(lambda x: ' '.join(x))\n", - "tfidf_vectorizer = TfidfVectorizer(stop_words='english')\n", - "tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres_str'])\n" + "tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) # Adjust max_features\n", + "tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres_str'])" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
NearestNeighbors(metric='cosine')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
NearestNeighbors(metric='cosine')
NearestNeighbors(metric='cosine')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
NearestNeighbors(metric='cosine')
{{ movie_title }}
+Likes: {{ like_counter }}
+ + + + + + +