<a href="https://colab.research.google.com/github/fenyxrainbow/film-recommendation-system/blob/main/filmrecommendationsystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.neighbors import NearestNeighbors

In [8]:
import pandas as pd
import requests
import zipfile
import io

# URL do dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Baixando o arquivo ZIP
response = requests.get(url)
if response.status_code == 200:  # Verifica se o download foi bem-sucedido
    # Extraindo os arquivos do ZIP
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("ml-latest-small")  # Extrai para uma pasta chamada "ml-latest-small"

    # Carregando os arquivos CSV
    df_ratings = pd.read_csv("ml-latest-small/ml-latest-small/ratings.csv")
    df_movies = pd.read_csv("ml-latest-small/ml-latest-small/movies.csv")

    # Visualizando os dados
    print("Primeiras linhas de ratings.csv:")
    print(df_ratings.head())

    print("\nPrimeiras linhas de movies.csv:")
    print(df_movies.head())
else:
    print("Erro ao baixar o arquivo. Verifique a URL ou sua conexão com a internet.")

Primeiras linhas de ratings.csv:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Primeiras linhas de movies.csv:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.neighbors import NearestNeighbors
import urllib.request
import zipfile
import os

# Carregando o dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
zip_file_path = "ml-latest-small.zip" # Define the zip file path

# Download the zip file
if not os.path.exists(zip_file_path):
    urllib.request.urlretrieve(url, zip_file_path)

# Extract the csv files from the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    ratings_file = zip_ref.extract('ml-latest-small/ratings.csv')
    movies_file = zip_ref.extract('ml-latest-small/movies.csv')

# Read the CSVs
df_ratings = pd.read_csv(ratings_file)
df_movies = pd.read_csv(movies_file)


# Visualizando os dados
print(df_ratings.head())
print(df_movies.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [9]:
# Removendo usuários e filmes com poucas interações
min_user_ratings = 50
min_movie_ratings = 50

filtered_users = df_ratings['userId'].value_counts() > min_user_ratings
filtered_users = filtered_users[filtered_users].index.tolist()

filtered_movies = df_ratings['movieId'].value_counts() > min_movie_ratings
filtered_movies = filtered_movies[filtered_movies].index.tolist()

df_ratings_filtered = df_ratings[(df_ratings['userId'].isin(filtered_users)) & (df_ratings['movieId'].isin(filtered_movies))]

In [10]:
# Normalizando as notas para uma escala de 0 a 5
df_ratings_filtered['rating'] = df_ratings_filtered['rating'] / 5.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ratings_filtered['rating'] = df_ratings_filtered['rating'] / 5.0


In [11]:
# Codificando gêneros dos filmes para Filtragem Baseada em Conteúdo
df_movies['genres'] = df_movies['genres'].apply(lambda x: x.split('|'))
df_movies['genres'] = df_movies['genres'].apply(lambda x: ' '.join(x))

In [12]:
# Criando uma matriz de usuários e filmes
user_movie_matrix = df_ratings_filtered.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [13]:
# Calculando a similaridade de cosseno entre filmes
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Mapeando índices de filmes
indices = pd.Series(df_movies.index, index=df_movies['title']).drop_duplicates()

In [14]:
def content_based_recommendation(title, cosine_sim=cosine_sim, df_movies=df_movies, indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 recomendações
    movie_indices = [i[0] for i in sim_scores]
    return df_movies['title'].iloc[movie_indices]

# Exemplo de recomendação
print(content_based_recommendation('Toy Story (1995)'))

1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object


In [15]:
# Preparando os dados para o Surprise
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df_ratings_filtered[['userId', 'movieId', 'rating']], reader)

# Utilizando o algoritmo SVD
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1773  0.1802  0.1777  0.1803  0.1779  0.1787  0.0013  
MAE (testset)     0.1362  0.1378  0.1361  0.1376  0.1370  0.1370  0.0007  
Fit time          1.04    1.34    2.26    1.39    0.92    1.39    0.47    
Test time         0.09    0.17    0.17    0.10    0.11    0.13    0.03    


{'test_rmse': array([0.17729484, 0.18023407, 0.17770903, 0.18034284, 0.17794889]),
 'test_mae': array([0.13623677, 0.13783282, 0.13613305, 0.13760887, 0.13702984]),
 'fit_time': (1.0449409484863281,
  1.3391706943511963,
  2.262488842010498,
  1.3855838775634766,
  0.9242634773254395),
 'test_time': (0.0935525894165039,
  0.1687767505645752,
  0.16991209983825684,
  0.09722638130187988,
  0.10897016525268555)}

In [16]:
def collaborative_filtering_recommendation(user_id, algo=algo, df_ratings=df_ratings_filtered, df_movies=df_movies):
    user_movies = df_ratings[df_ratings['userId'] == user_id]['movieId'].unique()
    all_movies = df_ratings['movieId'].unique()
    movies_to_predict = np.setdiff1d(all_movies, user_movies)

    predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_movies = [pred.iid for pred in predictions[:10]]

    return df_movies[df_movies['movieId'].isin(top_movies)]['title']

# Exemplo de recomendação
print(collaborative_filtering_recommendation(1))

138             Die Hard: With a Vengeance (1995)
277              Shawshank Redemption, The (1994)
685                                Vertigo (1958)
690                     North by Northwest (1959)
819                   Fish Called Wanda, A (1988)
896        One Flew Over the Cuckoo's Nest (1975)
922                Godfather: Part II, The (1974)
1002    Butch Cassidy and the Sundance Kid (1969)
1645                     Untouchables, The (1987)
2036                        Eyes Wide Shut (1999)
Name: title, dtype: object


In [17]:
# Avaliação do modelo SVD
results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print(results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1783  0.1791  0.1773  0.1792  0.1790  0.1786  0.0007  
MAE (testset)     0.1360  0.1377  0.1365  0.1381  0.1376  0.1372  0.0008  
Fit time          0.43    0.41    0.42    0.41    0.41    0.42    0.01    
Test time         0.17    0.03    0.03    0.03    0.04    0.06    0.05    
{'test_rmse': array([0.17825042, 0.17909272, 0.17733162, 0.17917474, 0.17900869]), 'test_mae': array([0.1360368 , 0.13769367, 0.13649106, 0.13813975, 0.1375519 ]), 'fit_time': (0.4296126365661621, 0.414090633392334, 0.4249577522277832, 0.41079068183898926, 0.4057657718658447), 'test_time': (0.16728878021240234, 0.03129887580871582, 0.03198719024658203, 0.03174901008605957, 0.037305593490600586)}


In [18]:
import streamlit as st

st.title('Sistema de Recomendação de Filmes')

user_id = st.number_input('Digite o ID do usuário:', min_value=1, max_value=610, value=1)
if st.button('Recomendar'):
    recommendations = collaborative_filtering_recommendation(user_id)
    st.write('Recomendações:')
    st.write(recommendations)

2025-02-22 22:44:23.012 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-02-22 22:44:23.021 Session state does not function when running a script without `streamlit run`


In [22]:
import streamlit as st
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
import requests
import zipfile
import io

# URL do dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Baixando o arquivo ZIP
response = requests.get(url)
if response.status_code == 200:  # Verifica se o download foi bem-sucedido
    # Extraindo os arquivos do ZIP
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("ml-latest-small")  # Extrai para uma pasta chamada "ml-latest-small"

    # Carregando os arquivos CSV
    df_ratings = pd.read_csv("ml-latest-small/ml-latest-small/ratings.csv")
    df_movies = pd.read_csv("ml-latest-small/ml-latest-small/movies.csv")

    # Configurar o modelo SVD
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Interface do Streamlit
    st.title('Sistema de Recomendação de Filmes')

    user_id = st.number_input('Digite o ID do usuário:', min_value=1, max_value=610, value=1)
    if st.button('Recomendar'):
        user_movies = df_ratings[df_ratings['userId'] == user_id]['movieId'].unique()
        all_movies = df_ratings['movieId'].unique()
        movies_to_predict = np.setdiff1d(all_movies, user_movies)

        predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
        predictions.sort(key=lambda x: x.est, reverse=True)
        top_movies = [pred.iid for pred in predictions[:10]]

        st.write('Recomendações:')
        st.write(df_movies[df_movies['movieId'].isin(top_movies)]['title'])
else:
    st.error("Erro ao baixar o arquivo. Verifique a URL ou sua conexão com a internet.")



In [24]:
!pip install scikit-surprise



In [25]:
from surprise import SVD, Dataset, Reader  # A importação é a mesma

In [26]:
from surprise import SVD, Dataset, Reader
print("Surprise instalado com sucesso!")

Surprise instalado com sucesso!


In [27]:
import streamlit as st
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
import requests
import zipfile
import io

# URL do dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Baixando o arquivo ZIP
response = requests.get(url)
if response.status_code == 200:  # Verifica se o download foi bem-sucedido
    # Extraindo os arquivos do ZIP
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("ml-latest-small")  # Extrai para uma pasta chamada "ml-latest-small"

    # Carregando os arquivos CSV
    df_ratings = pd.read_csv("ml-latest-small/ml-latest-small/ratings.csv")
    df_movies = pd.read_csv("ml-latest-small/ml-latest-small/movies.csv")

    # Configurar o modelo SVD
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Interface do Streamlit
    st.title('Sistema de Recomendação de Filmes')

    user_id = st.number_input('Digite o ID do usuário:', min_value=1, max_value=610, value=1)
    if st.button('Recomendar'):
        user_movies = df_ratings[df_ratings['userId'] == user_id]['movieId'].unique()
        all_movies = df_ratings['movieId'].unique()
        movies_to_predict = np.setdiff1d(all_movies, user_movies)

        predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
        predictions.sort(key=lambda x: x.est, reverse=True)
        top_movies = [pred.iid for pred in predictions[:10]]

        st.write('Recomendações:')
        st.write(df_movies[df_movies['movieId'].isin(top_movies)]['title'])
else:
    st.error("Erro ao baixar o arquivo. Verifique a URL ou sua conexão com a internet.")



In [28]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [39]:
from pyngrok import ngrok

# Expor a interface do Streamlit na web
public_url = ngrok.connect(8501)
print(f"Acesse a interface do Streamlit em: {public_url}")

Acesse a interface do Streamlit em: NgrokTunnel: "https://9f94-35-227-37-121.ngrok-free.app" -> "http://localhost:8501"


In [43]:
from pyngrok import ngrok

# Kill any existing ngrok tunnels
ngrok.kill()

# Configurar o authtoken do ngrok
ngrok.set_auth_token("2tPyeXB7EPLrzfJ5AiqmGv7jRBu_3V1v9q4cfN93jpYxp4g8V")  # Substitua pelo seu authtoken

# Expor a interface do Streamlit na web
public_url = ngrok.connect(8501)
print(f"Acesse a interface do Streamlit em: {public_url}")

# Rodar o Streamlit
!streamlit run --server.port 8501 app.py

Acesse a interface do Streamlit em: NgrokTunnel: "https://e055-35-227-37-121.ngrok-free.app" -> "http://localhost:8501"
Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py


In [44]:
import streamlit as st
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
import requests
import zipfile
import io
from pyngrok import ngrok

# URL do dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Baixando o arquivo ZIP
response = requests.get(url)
if response.status_code == 200:  # Verifica se o download foi bem-sucedido
    # Extraindo os arquivos do ZIP
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("ml-latest-small")  # Extrai para uma pasta chamada "ml-latest-small"

    # Carregando os arquivos CSV
    df_ratings = pd.read_csv("ml-latest-small/ml-latest-small/ratings.csv")
    df_movies = pd.read_csv("ml-latest-small/ml-latest-small/movies.csv")

    # Configurar o modelo SVD
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Interface do Streamlit
    st.title('Sistema de Recomendação de Filmes')

    user_id = st.number_input('Digite o ID do usuário:', min_value=1, max_value=610, value=1)
    if st.button('Recomendar'):
        user_movies = df_ratings[df_ratings['userId'] == user_id]['movieId'].unique()
        all_movies = df_ratings['movieId'].unique()
        movies_to_predict = np.setdiff1d(all_movies, user_movies)

        predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
        predictions.sort(key=lambda x: x.est, reverse=True)
        top_movies = [pred.iid for pred in predictions[:10]]

        st.write('Recomendações:')
        st.write(df_movies[df_movies['movieId'].isin(top_movies)]['title'])
else:
    st.error("Erro ao baixar o arquivo. Verifique a URL ou sua conexão com a internet.")



In [45]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
import requests
import zipfile
import io

# URL do dataset MovieLens
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Baixando o arquivo ZIP
response = requests.get(url)
if response.status_code == 200:  # Verifica se o download foi bem-sucedido
    # Extraindo os arquivos do ZIP
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("ml-latest-small")  # Extrai para uma pasta chamada "ml-latest-small"

    # Carregando os arquivos CSV
    df_ratings = pd.read_csv("ml-latest-small/ml-latest-small/ratings.csv")
    df_movies = pd.read_csv("ml-latest-small/ml-latest-small/movies.csv")

    # Configurar o modelo SVD
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Interface do Streamlit
    st.title('Sistema de Recomendação de Filmes')

    user_id = st.number_input('Digite o ID do usuário:', min_value=1, max_value=610, value=1)
    if st.button('Recomendar'):
        user_movies = df_ratings[df_ratings['userId'] == user_id]['movieId'].unique()
        all_movies = df_ratings['movieId'].unique()
        movies_to_predict = np.setdiff1d(all_movies, user_movies)

        predictions = [algo.predict(user_id, movie_id) for movie_id in movies_to_predict]
        predictions.sort(key=lambda x: x.est, reverse=True)
        top_movies = [pred.iid for pred in predictions[:10]]

        st.write('Recomendações:')
        st.write(df_movies[df_movies['movieId'].isin(top_movies)]['title'])
else:
    st.error("Erro ao baixar o arquivo. Verifique a URL ou sua conexão com a internet.")

Writing app.py


In [46]:
from pyngrok import ngrok

# Configurar o authtoken do ngrok
ngrok.set_auth_token("2tPyeXB7EPLrzfJ5AiqmGv7jRBu_3V1v9q4cfN93jpYxp4g8V")  # Substitua pelo seu authtoken

# Expor a interface do Streamlit na web
public_url = ngrok.connect(8501)
print(f"Acesse a interface do Streamlit em: {public_url}")

# Rodar o Streamlit
!streamlit run --server.port 8501 app.py

Acesse a interface do Streamlit em: NgrokTunnel: "https://a6ab-35-227-37-121.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.227.37.121:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
