In [1]:
import tarfile
import pandas as pd
import numpy as np
import time

In [2]:
# Leer el archivo comprimido con datos de reseñas de libros.
with tarfile.open("./files/lthing_data.tar.gz") as tar:
    # Listar los archivos contenidos en el archivo tar.
    print("Files in tar archive:")
    tar.list()
    # Extraer y mostrar algunos registros de ejemplo de las reseñas.
    print("\nSample records:")
    with tar.extractfile("lthing_data/reviews.txt") as file:
        count = 0
        for line in file:
            print(line)
            count += 1
            if count > 5:
                break

Files in tar archive:
?rwxr-xr-x jmcauley/users          0 2014-01-02 12:57:42 lthing_data/ 
?rw-r--r-- jmcauley/users    4824989 2014-01-02 12:55:12 lthing_data/edges.txt 
?rw-r--r-- jmcauley/users 1665980007 2014-01-02 12:55:09 lthing_data/reviews.txt 

Sample records:
b'reviews = {}\n'
b'reviews[(\'73960\', \'Elizabeth.Wong98\')] = {\'comment\': "Every evening, the brave queen of Persia, Shahrazad, goes into the Sultan\'s rooms and begins a weave of words, hoping to entice the Sultan to let her live another night so she can continue her story. But Shahrazad has a big problem: She is running out of stories. With a delicate stroke of luck, a cripple girl, Marjan, ventures into the harem with her Aunt to sell various wares. As her Aunt is selling things, Marjan entertains some of the children with a story. Unbeknownst to Marjan, Shahrazad\'s sister, Dunyazad, has heard her telling the story. Marjan is taken directly to Shahrazad and asked to recount the story. The tale keeps the queen 

In [3]:
# Crea una lista vacía para almacenar los datos
data = []

# inicializamos un timer
start_time = time.time()

# Abre el archivo reviews.txt y léelo línea por línea
with open('./files/lthing_data/reviews.txt', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            # Divide la línea en partes usando '=' como separador
            parts = line.strip().split('=')
            if len(parts) == 2:
                # Extrae el usuario, la obra y las estrellas de cada revisión
                review_info = eval(parts[1])            
                user = review_info['user'] if 'user' in review_info else None
                work = review_info['work'] if 'work' in review_info else None
                stars = review_info['stars'] if 'stars' in review_info else None
                if user and work and stars:
                    # Almacena la información en un diccionario
                    data.append({'user': user, 'work': work, 'stars': stars})                
        except Exception as e:
            print(e)
            
end_time = time.time()            
# mostrar el tiempo de ejcucion
print("Tiempo de ejecución: ", end_time - start_time)

Tiempo de ejecución:  149.9880747795105


In [4]:
# Convertir la lista de reseñas en un DataFrame y mostrar algunos ejemplos.
reviews = pd.DataFrame(data, columns=["user", "work", "stars"])

# Muestra el DataFrame
print(reviews.head())

# Muestra la cantidad total de reseñas
print(f"\nTotal reviews: {len(reviews):,}")

               user      work  stars
0  Elizabeth.Wong98     73960    4.5
1            rivkat     69413    3.0
2      suz.haugland   9523995    4.0
3        amoskovacs    368228    4.0
4            CandyH  11243828    4.0

Total reviews: 1,377,255


In [5]:
# Análisis de usuarios y libros populares.
# Contar cuántos libros ha reseñado cada usuario.
usercount = reviews[["work","user"]].groupby("user").count()
usercount = usercount[usercount["work"] >= 50]

# Mostar los usuarios que han reseñado más de 50 libros
print(usercount.head())

              work
user              
-Eva-          600
06nwingert     370
1983mk          63
1dragones      193
1morechapter   271


In [6]:
# Contar cuántos usuarios han reseñado cada libro.
workcount = reviews[["work","user"]].groupby("work").count()
workcount = workcount[workcount["user"] >= 50]

# Mostar los libros que han sido reseñados por más de 50 usuarios
print(workcount.head())

          user
work          
10000      106
10001       53
1000167    185
10001797    53
10005525   132


In [7]:
# Filtrar para mantener solo usuarios y libros populares.
reviews = reviews[reviews["user"].isin(usercount.index) & reviews["work"].isin(workcount.index)]
print("\nSubset of data:")
print(reviews)


Subset of data:
                  user     work  stars
5             miyurose  9071901    2.0
8           Mamajeanne  1110874    5.0
11           funkendub     5852    5.0
12       notmyrealname  3620689    2.0
13           bluetyson     1472    5.0
...                ...      ...    ...
1377230          Jim53    30888    3.5
1377234      lucybrown     3253    3.5
1377235      ElizaJane  7874593    4.5
1377247      heidijane  2129329    4.0
1377253      tamaranth  4873693    4.5

[202485 rows x 3 columns]


In [8]:
# Convertir reseñas en una matriz usuario-libro, llenando valores faltantes con cero.
reviewmatrix = reviews.pivot(index="user", columns="work", values="stars").fillna(0)
matrix = reviewmatrix.values

In [9]:
# Realizar descomposición en valores singulares (SVD) de la matriz de reseñas.
u, s, vh = np.linalg.svd(matrix, full_matrices=False)

In [10]:
# mostrar los valores singulares
print("\nSingular values:")
print(s)


Singular values:
[4.17827635e+02 2.46003247e+02 2.17548010e+02 ... 3.99001819e-15
 3.55187476e-15 3.03051879e-15]


In [11]:
#mostarr los vectores singulares y la matriz diagonal
print("\nSingular vectors:")
print(u)

# mostramos vh que es la matriz diagonal
print("\nDiagonal matrix:")
print(vh)


Singular vectors:
[[-4.41602907e-02  8.77565152e-03  2.76910130e-03 ... -1.28926813e-02
   2.95628700e-05 -1.22760021e-02]
 [-3.54134459e-02  1.73998446e-03 -5.50287679e-02 ...  4.96025108e-03
  -1.23295744e-03  3.11218058e-03]
 [-2.28831320e-03 -6.53862608e-03 -1.15336944e-03 ... -1.71825438e-02
  -1.46216503e-02  6.84758759e-03]
 ...
 [-5.11077396e-04 -4.28113362e-04  7.32949533e-04 ...  4.41406912e-03
  -1.63736720e-03  3.99637521e-03]
 [-1.10253072e-02  8.11922075e-03  1.39688291e-02 ...  2.96531923e-03
  -4.19559738e-03 -1.90772256e-02]
 [-2.26013814e-03 -4.36757596e-03  8.29281424e-04 ...  1.09429257e-02
  -2.14697882e-03  1.47059090e-02]]

Diagonal matrix:
[[-1.45958321e-02 -8.20993943e-03 -1.68208176e-02 ... -1.20821515e-02
  -7.28876193e-03 -9.05405288e-03]
 [-4.54502036e-03 -3.98875605e-03  1.40629407e-02 ... -8.13569186e-03
   5.73117953e-03 -9.10962178e-03]
 [-1.25633639e-03 -4.96339098e-03 -2.74167206e-02 ... -1.31448140e-02
   1.09499221e-02 -6.96069230e-03]
 ...
 [ 0.00

In [12]:
# Función para calcular la similitud de coseno entre dos vectores.
def cosine_similarity(v, u):
    return (v @ u) / (np.linalg.norm(v) * np.linalg.norm(u))

In [13]:
# Encontrar la mayor similitud de coseno entre libros.
highest_similarity = -np.inf
highest_sim_col = -1
for col in range(1, vh.shape[1]):
    similarity = cosine_similarity(vh[:,0], vh[:,col])
    if similarity > highest_similarity:
        highest_similarity = similarity
        highest_sim_col = col

# Imprimir el resultado mostrando qué libro es más similar al primero.
print("Column %d (book id %s) is most similar to column 0 (book id %s)" %
      (highest_sim_col, reviewmatrix.columns[col], reviewmatrix.columns[0])
)

# imprimir el valor de la similitud con 10 decimales
print("Similarity: %.40f" % highest_similarity)


Column 817 (book id 9998) is most similar to column 0 (book id 10000)
Similarity: 0.0000000000000004068468652251858346841361
