<a href="https://colab.research.google.com/github/gillilandim/Gilliard/blob/main/Prova_modulo_15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Prova do Módulo 15

In [None]:
# Importação das bibliotecas e definição da função getDF()
# Essa célula é responsável por importar as bibliotecas necessárias e definir a função que gera o conjunto de dados.

In [None]:
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [None]:
# Função para criar o DataFrame de exemplo
def getDF():
    """Build the example user-by-movie ratings table.

    Returns:
        A DataFrame with one row per user (``User_1`` .. ``User_15``) and one
        column per movie (``Filme_1`` .. ``Filme_8``). Cells without a rating
        hold NaN.
    """
    nan = np.nan
    ratings_by_user = {
        "User_1": [nan, nan, nan, 1, 7, 2, 3, 8],
        "User_2": [9, 10, 2, 2, 6, 5, 3, 8],
        "User_3": [4, 7, 9, 6, 6, 10, 10, 2],
        "User_4": [nan, 7, 9, 5, 5, 10, 9, 1],
        "User_5": [7.0, 6.0, 3.0, 8.0, 3, 4.0, 3.0, 2],
        "User_6": [nan, nan, 9, 9, 6, 8, 9, nan],
        "User_7": [3, 5, 4, 4, 3, 3, 9, nan],
        "User_8": [10, 10, 10, 10, 2, 2, 2, 2],
        "User_9": [9, 9, nan, 8, 3, 3, 1, nan],
        "User_10": [9, 8, 10, 9, 3, 4, 2, 1],
        "User_11": [4, 4, 3, 3, 9, 9, 8, 10],
        "User_12": [2, 2, 4, 1, 8, 10, 10, 9],
        "User_13": [1, 4, 1, 3, 7, 10, 7, 8],
        "User_14": [3, 3, 2, 1, 1, 10, nan, 10],
        "User_15": [9, 9, 8, 10, 4, 2, nan, 1],
    }
    # The dict is keyed by user, so users start out as columns; transpose to
    # get one row per user.
    ratings = pd.DataFrame(ratings_by_user).T
    ratings.columns = [f"Filme_{number}" for number in range(1, 9)]
    return ratings

In [None]:
# Build the example ratings table (rows are the User_* labels, columns the Filme_* labels)
df = getDF()
# Last expression of the cell: previews the first rows when run in a notebook
df.head()

In [None]:
# @title Filme_2

# Histogram (20 bins) of the values in the 'Filme_2' column
from matplotlib import pyplot as plt
df['Filme_2'].plot(kind='hist', bins=20, title='Filme_2')
# Hide the top and right borders of the current axes
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
### Divisão dos dados em treino, validação e teste
# Aqui, os dados são divididos em três subconjuntos: treino, validação e teste.

In [None]:
# Função para dividir o conjunto de dados
def split_train_validation_test(df, qtd=2, random_state=None):
    """Hold out one validation and one test rating per user.

    For every user (row) with enough ratings, one rated movie is drawn at
    random for validation and a different one for test. Both are blanked
    (set to NaN) in the training copy.

    Args:
        df: Ratings table, one row per user and one column per movie, with
            NaN where there is no rating. It is not modified.
        qtd: Minimum number of ratings a user must have to take part in the
            split. Values below 2 are treated as 2, because two distinct
            ratings are held out per user.
        random_state: Optional seed. When None, the global ``np.random``
            state is used, so a prior ``np.random.seed`` call still controls
            the draw.

    Returns:
        Tuple ``(train_df, validation_df, test_df)`` of frames sharing the
        index and columns of ``df``. ``validation_df`` and ``test_df`` are
        float frames that are NaN everywhere except at the held-out cells.
    """
    # Same draw sequence as the module-level functions when no seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_df = df.copy()
    # Explicit float dtype: an index/columns-only constructor would yield
    # object columns, which break vectorised NaN checks downstream.
    validation_df = pd.DataFrame(np.nan, index=df.index, columns=df.columns, dtype=float)
    test_df = pd.DataFrame(np.nan, index=df.index, columns=df.columns, dtype=float)

    # Two different movies are drawn, so fewer than two ratings can never work.
    min_rated = max(qtd, 2)

    for user in df.index:
        user_ratings = df.loc[user]
        rated_movies = user_ratings[user_ratings.notna()].index.tolist()
        if len(rated_movies) < min_rated:
            continue

        validation_movie = rng.choice(rated_movies, size=1, replace=False)[0]
        remaining_movies = [movie for movie in rated_movies if movie != validation_movie]
        test_movie = rng.choice(remaining_movies, size=1, replace=False)[0]

        validation_df.loc[user, validation_movie] = df.loc[user, validation_movie]
        test_df.loc[user, test_movie] = df.loc[user, test_movie]
        train_df.loc[user, [validation_movie, test_movie]] = np.nan

    return train_df, validation_df, test_df




In [None]:
# Split the ratings table into train / validation / test frames
train_df, validation_df, test_df = split_train_validation_test(df, qtd=2)

# Last expression of the cell: previews the first rows of the training frame
train_df.head()


In [None]:
### Definição da classe de Fatoração Matricial
### Essa célula contém a implementação da fatoração matricial, que será usada para ajustar o modelo.

In [None]:
# Definindo a classe de Fatoração Matricial
class MatrixFactorization():
    """Factorize a ratings table R (N x M) into P (N x K) and Q (K x M).

    The factors are fitted by sweeping over the non-NaN cells of R and
    applying a regularised gradient step per cell. ``P.dot(Q)`` is the dense
    matrix of predicted ratings.

    Args:
        dataframe: Ratings table (anything ``np.asarray`` can turn into a 2-D
            float array); NaN marks a missing rating.
        K: Number of latent factors.
        steps: Number of full sweeps over the observed cells.
        alpha: Learning rate.
        beta: Regularisation weight.
        random_state: Optional seed for the random initialisation. When None,
            the global ``np.random`` state is used.

    Attributes set by ``fit``:
        P, Q: The fitted factor matrices.
        lista_erro_step: One value per step, the SUM of squared errors over
            the observed cells (each error measured just before that cell's
            update). It is a sum, not a mean.
    """

    def __init__(self, dataframe, K, steps, alpha, beta, random_state=None):
        self.df = dataframe
        self.K = K
        self.steps = steps
        self.alpha = alpha
        self.beta = beta
        self.random_state = random_state

    def fit(self):
        """Fit P and Q in place; stores ``P``, ``Q`` and ``lista_erro_step``."""
        R = np.asarray(self.df, dtype=float)
        N, M = R.shape

        # Random start; same draw order (P, then Q) whichever generator is used.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        P = rng.rand(N, self.K)
        Q = rng.rand(self.K, M)

        # The set of rated cells never changes, so locate it once instead of
        # re-testing every cell for NaN on every step. Row-major order.
        observed = list(zip(*np.nonzero(~np.isnan(R))))

        lista_erro_step = []
        for _ in range(self.steps):
            sse_step = 0
            for i, j in observed:
                eij = R[i, j] - np.dot(P[i, :], Q[:, j])
                sse_step += eij ** 2
                # All K factors are updated at once. As in a per-factor loop,
                # the Q update uses the already-updated P row.
                p_new = P[i, :] + self.alpha * (2 * eij * Q[:, j] - self.beta * P[i, :])
                Q[:, j] = Q[:, j] + self.alpha * (2 * eij * p_new - self.beta * Q[:, j])
                P[i, :] = p_new
            lista_erro_step.append(sse_step)

        self.P = P
        self.Q = Q
        self.lista_erro_step = lista_erro_step

    def predict(self):
        """Return the dense N x M matrix of predicted ratings, ``P.dot(Q)``."""
        return self.P.dot(self.Q)


In [None]:
# Hyper-parameter grid to search
from itertools import product

param_grid = {
    'K': [2, 3, 4],
    'steps': [1000, 2000],
    'alpha': [0.001, 0.002],
    'beta': [0.01, 0.02]
}

# Held-out validation ratings as floats, and a mask of the cells holding one.
# Both are the same for every candidate, so compute them once.
validation_values = validation_df.astype(float)
validation_mask = validation_values.notna().to_numpy()
if not validation_mask.any():
    # Without validation ratings every candidate would score an RMSE of 0 and
    # the first one would be declared "best"; fail loudly instead.
    raise ValueError("validation_df não contém avaliações; não é possível selecionar hiperparâmetros")

best_rmse = float('inf')
best_params = {}

# product() walks the grid in the same order as nested loops over
# K -> steps -> alpha -> beta.
for K, steps, alpha, beta in product(param_grid['K'], param_grid['steps'],
                                     param_grid['alpha'], param_grid['beta']):
    fat = MatrixFactorization(dataframe=train_df, K=K, steps=steps, alpha=alpha, beta=beta)
    fat.fit()
    predictions = pd.DataFrame(fat.predict(), columns=df.columns, index=df.index)

    # RMSE over the validation cells only (label-aligned subtraction)
    errors = (validation_values - predictions).to_numpy()[validation_mask]
    rmse = np.sqrt(np.mean(errors ** 2))

    if rmse < best_rmse:
        best_rmse = rmse
        best_params = {'K': K, 'steps': steps, 'alpha': alpha, 'beta': beta}

print("Melhores parâmetros:", best_params)
print("Melhor RMSE:", best_rmse)


In [None]:
### Avaliação no conjunto de teste
### Aqui, usamos os melhores parâmetros encontrados para ajustar o modelo final e calcular o erro no conjunto de teste.

In [None]:
# Refit with the best hyper-parameters on every rating EXCEPT the held-out
# test ratings (i.e. train + validation). Fitting on the full `df` would show
# the model the very ratings it is scored on, turning the test RMSE into an
# optimistic training error.
final_train_df = df.where(test_df.isna())
fat_final = MatrixFactorization(dataframe=final_train_df, K=best_params['K'], steps=best_params['steps'],
                                alpha=best_params['alpha'], beta=best_params['beta'])
fat_final.fit()
predictions_final = pd.DataFrame(fat_final.predict(), columns=df.columns, index=df.index)

In [None]:
# Score the final model on the held-out test ratings.
# rmse_test first accumulates the sum of squared errors and is turned into
# the root-mean-square at the end; count_test counts the rated test cells.
rmse_test = 0
count_test = 0
for user in test_df.index:
    for movie in test_df.columns:
        actual = test_df.loc[user, movie]
        if np.isnan(actual):
            # No test rating held out in this cell
            continue
        residual = actual - predictions_final.loc[user, movie]
        rmse_test += residual ** 2
        count_test += 1

# Leave the value at 0 when there was nothing to score (avoids dividing by zero)
if count_test > 0:
    mean_squared_error = rmse_test / count_test
    rmse_test = np.sqrt(mean_squared_error)

print("RMSE no conjunto de teste:", rmse_test)
