In [1]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import numpy as np


In [2]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
movies_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

users_path = '../datasets/ml-100k/u.user'
movies_path = '../datasets/ml-100k/u.item'
ratings_path = '../datasets/ml-100k/u.data'

In [3]:

users = pd.read_csv(users_path, sep = '|', names = users_cols,encoding = 'latin-1')
movie = pd.read_csv(movies_path, sep = '|', names = movies_cols, encoding = 'latin-1')
ratings = pd.read_csv(ratings_path, sep = '\t', names = ratings_cols , encoding = 'latin-1')

In [30]:
movie.head(5)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]

In [5]:
data_matrix = ratings.pivot_table(values="rating",index="user_id",columns="movie_id",fill_value= 0)
userMovie_ratings = data_matrix
movieUser_ratings = data_matrix.T



In [6]:
print(userMovie_ratings)

movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1            5     3     4     3     3     5     4     1     5     3  ...   
2            4     0     0     0     0     0     0     0     0     2  ...   
3            0     0     0     0     0     0     0     0     0     0  ...   
4            0     0     0     0     0     0     0     0     0     0  ...   
5            4     3     0     0     0     0     0     0     0     0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939          0     0     0     0     0     0     0     0     5     0  ...   
940          0     0     0     2     0     0     4     5     3     0  ...   
941          5     0     0     0     0     0     4     0     0     0  ...   
942          0     0     0     0     0     0     0     0     0     0  ...   
943          0     5     0     0     0     0     0     0     3     0  ...   

In [7]:
print(movieUser_ratings)

user_id   1    2    3    4    5    6    7    8    9    10   ...  934  935  \
movie_id                                                    ...             
1           5    4    0    0    4    4    0    0    0    4  ...    2    3   
2           3    0    0    0    3    0    0    0    0    0  ...    4    0   
3           4    0    0    0    0    0    0    0    0    0  ...    0    0   
4           3    0    0    0    0    0    5    0    0    4  ...    5    0   
5           3    0    0    0    0    0    0    0    0    0  ...    0    0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
1678        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1679        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1680        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1681        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1682        0    0    0    0    0    0    0    0    0    0  ...    0    0   

In [8]:
print(f"n_users:{n_users} \nn_movies:{n_movies}")
print(f"userMovie_ratings:(f:{userMovie_ratings.shape[0]} c:{userMovie_ratings.shape[1]})")
print(f"movieUser_ratings:(f:{movieUser_ratings.shape[0]} c:{movieUser_ratings.shape[1]})")

n_users:943 
n_movies:1682
userMovie_ratings:(f:943 c:1682)
movieUser_ratings:(f:1682 c:943)


In [22]:

num_sv = 10
SVD = TruncatedSVD(n_components=num_sv, random_state=42)

SVD.fit(movieUser_ratings)
print(movieUser_ratings)
print('Cantidad de información simplificada con los primeros %d vectores singulares:' % num_sv)
print('%.1f%%' % (
    100 * (1 - (SVD.singular_values_[0:num_sv]).sum() / (SVD.singular_values_).sum())))

user_id   1    2    3    4    5    6    7    8    9    10   ...  934  935  \
movie_id                                                    ...             
1           5    4    0    0    4    4    0    0    0    4  ...    2    3   
2           3    0    0    0    3    0    0    0    0    0  ...    4    0   
3           4    0    0    0    0    0    0    0    0    0  ...    0    0   
4           3    0    0    0    0    0    5    0    0    4  ...    5    0   
5           3    0    0    0    0    0    0    0    0    0  ...    0    0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
1678        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1679        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1680        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1681        0    0    0    0    0    0    0    0    0    0  ...    0    0   
1682        0    0    0    0    0    0    0    0    0    0  ...    0    0   

In [10]:
num_sv = 10
SVD = TruncatedSVD(n_components=num_sv, random_state=42)

resultant_matrix = SVD.fit_transform(movieUser_ratings)
resultant_matrix.shape



(1682, 10)

In [11]:
# Pearson correlation matrix
corrMtx = np.corrcoef(resultant_matrix)
 
corrMtx[0:5,0:5]



array([[1.        , 0.84269117, 0.83022272, 0.82197731, 0.65316188],
       [0.84269117, 1.        , 0.68689277, 0.89499142, 0.72415104],
       [0.83022272, 0.68689277, 1.        , 0.75913436, 0.74550952],
       [0.82197731, 0.89499142, 0.75913436, 1.        , 0.74512727],
       [0.65316188, 0.72415104, 0.74550952, 0.74512727, 1.        ]])

In [12]:
# Look for Taco Bell index
liked = 1

names = userMovie_ratings.columns 
names_list = list(names)
id_liked = names_list.index(liked)

id_liked

0

In [13]:
corr_recom = corrMtx[id_liked]

print('Recomendaciones: ')
# select names with a correletion between .97 and .99
idMovies = list(names[(corr_recom > 0.90) & (corr_recom < 0.99)])

movie_row = movie[movie['movie id'] == 1]

# Obtener el nombre de la película
movie_title = movie_row['movie title'].iloc[0]

print("Nombre de la película con ID 1:", movie_title)
filtered_movies = movie[movie['movie id'].isin(idMovies)]
movie_titles = filtered_movies['movie title'].tolist()

print("Nombres de las películas con IDs especificados:")
print(movie_titles)

Recomendaciones: 
Nombre de la película con ID 1: Toy Story (1995)
Nombres de las películas con IDs especificados:
['Rumble in the Bronx (1995)', 'Star Wars (1977)', 'Rock, The (1996)', 'Independence Day (ID4) (1996)', 'Willy Wonka and the Chocolate Factory (1971)', 'Return of the Jedi (1983)', 'Star Trek: First Contact (1996)', 'Mars Attacks! (1996)', 'Lost World: Jurassic Park, The (1997)', 'Men in Black (1997)', 'Fierce Creatures (1997)', 'Mission: Impossible (1996)', "Jackie Chan's First Strike (1996)", 'Broken Arrow (1996)', 'Space Jam (1996)', 'Live Nude Girls (1995)']


In [14]:
# Suponiendo que 'movie' es tu DataFrame de películas y 'names' es tu lista de nombres de películas
corr_recom = corrMtx[29]

print('Recomendaciones: ')

# Seleccionar nombres con una correlación entre .97 y .99
idMovies = list(names[(corr_recom > 0.95) & (corr_recom < 0.99)])

# Filtrar películas basadas en IDs
filtered_movies = movie[movie['movie id'].isin(idMovies)]

# Imprimir películas en el formato "movie id: movie title"
for index, row in filtered_movies.iterrows():
    print(f"movie id {row['movie id']}: {row['movie title']}")


Recomendaciones: 
movie id 488: Sunset Blvd. (1950)
movie id 513: Third Man, The (1949)
movie id 529: My Life as a Dog (Mitt liv som hund) (1985)
movie id 639: Tin Drum, The (Blechtrommel, Die) (1979)
movie id 647: Ran (1985)
movie id 664: Paris, Texas (1984)
movie id 811: Thirty-Two Short Films About Glenn Gould (1993)
movie id 855: Diva (1981)
movie id 1021: 8 1/2 (1963)
movie id 1266: Bread and Chocolate (Pane e cioccolata) (1973)
movie id 1404: Withnail and I (1987)


In [15]:
userMovie_ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
num_sv = 10

SVDu = TruncatedSVD(n_components=num_sv, random_state=42)

resultant_umatx = SVDu.fit_transform(userMovie_ratings)
resultant_umatx.shape

(943, 10)

In [17]:
corrUmtx = np.corrcoef(resultant_umatx)

In [18]:
# Choose a random user
user = 1

userids = userMovie_ratings.index 
users_list = list(userids)
user_id = users_list.index(user)
corr_urecom = corrUmtx[user_id]

uids = (corr_urecom > .95) & (corr_urecom < .99)
tmp = list()

for i in range(len(userids[uids])):
    tmp.append((corr_urecom[uids][i].round(2), userids[uids][i]))

print('Usuarios relacionados: ')
sorted(tmp, key=lambda x:x[0], reverse=True)

Usuarios relacionados: 


[(0.97, 249), (0.96, 62), (0.95, 715), (0.95, 916)]

In [19]:
from itertools import product
vector_user1 = userMovie_ratings[1]
vector_user249 = userMovie_ratings[249]

def  filter_vector(vector_user,expr,n):
    tranf = lambda enum : (enum[0],enum[1]) if expr(enum[1],n) else None

    result = list(map(tranf,enumerate(vector_user)))
    result_filter = list(filter(lambda x: x is not None, result))
    return result_filter

vector1 = filter_vector(vector_user1,lambda x,y : True if(x>y) else False,3)
vector2 = filter_vector(vector_user249,lambda x,y : True if(x>y) else False,3)

interseccion = [(x,y) for x, y in list(product(vector1,vector2)) if x[0] == y[0]]
print(f"len v1 {len(vector1)}")
print(f"len v2 {len(vector2)}")
print(f"Interseccion: \n{interseccion}\nlen_iterseccion:\n {len(interseccion)}")

dif = [x for x, y in list(product(vector1,vector2)) if x[0] != y[0]]

print(dif)

len v1 321
len v2 54
Interseccion: 
[((0, 5), (0, 4)), ((43, 4), (43, 4)), ((56, 5), (56, 5)), ((57, 5), (57, 4)), ((116, 4), (116, 4)), ((129, 5), (129, 5)), ((243, 4), (243, 4)), ((248, 4), (248, 4)), ((250, 4), (250, 5)), ((275, 5), (275, 4)), ((286, 5), (286, 5)), ((290, 5), (290, 4)), ((293, 5), (293, 5)), ((302, 5), (302, 4)), ((346, 4), (346, 5)), ((434, 5), (434, 4)), ((496, 4), (496, 5)), ((541, 4), (541, 4)), ((559, 4), (559, 5)), ((591, 4), (591, 4)), ((641, 5), (641, 5)), ((653, 4), (653, 5)), ((664, 4), (664, 5)), ((714, 5), (714, 4)), ((805, 4), (805, 4)), ((837, 5), (837, 4)), ((879, 4), (879, 4)), ((935, 4), (935, 5))]
len_iterseccion:
 28
[(0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), (0, 5),

In [20]:
#vector1 contine todos las movies que tienen mas de 5 puntos dados por user 1

vector_empty = filter_vector(vector_user1,lambda x,y : True if(x == y) else False,0)

inters_vector = [(x,y) for x, y in list(product(vector2,vector_empty)) if x[0] == y[0]]

print(inters_vector)
print(len(inters_vector))

[((23, 4), (23, 0)), ((31, 4), (31, 0)), ((158, 4), (158, 0)), ((232, 5), (232, 0)), ((254, 5), (254, 0)), ((436, 5), (436, 0)), ((554, 4), (554, 0)), ((583, 4), (583, 0)), ((639, 4), (639, 0)), ((757, 4), (757, 0)), ((858, 5), (858, 0)), ((868, 4), (868, 0))]
12


In [21]:
import numpy as np
from itertools import product

# Definir dos arrays
array1 = np.array([(2,3),(3,3)])
array2 = np.array([(4,5),(5,2),(2,2)])

# Obtener la intersección de los elementos de los arrays
interseccion = [(x,y) for x, y in list(product(array1, array2)) if x[0] == y[0]]

print(interseccion)  # Salida: [3, 4, 5]


[(array([2, 3]), array([2, 2]))]


In [28]:
numeros = [True,True,True,False]

# Verificar si todos los números en la lista son mayores que cero
resultado = all(numeros)
print(resultado)

False
