### Laboratorio 6 - Deep learning

- Alejandra Guzmán 20562
- Jorge Caballeros 20009



In [2]:
import pandas as pd

users_df = pd.read_csv('Users.csv')
books_df = pd.read_csv('Books.csv')
ratings_df = pd.read_csv('Ratings.csv')

users_head = users_df.head()
books_head = books_df.head()
ratings_head = ratings_df.head()

(users_head, books_head, ratings_head)


  books_df = pd.read_csv('Books.csv')


(   User-ID                            Location   Age
 0        1                  nyc, new york, usa   NaN
 1        2           stockton, california, usa  18.0
 2        3     moscow, yukon territory, russia   NaN
 3        4           porto, v.n.gaia, portugal  17.0
 4        5  farnborough, hants, united kingdom   NaN,
          ISBN                                         Book-Title  \
 0  0195153448                                Classical Mythology   
 1  0002005018                                       Clara Callan   
 2  0060973129                               Decision in Normandy   
 3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
 4  0393045218                             The Mummies of Urumchi   
 
             Book-Author Year-Of-Publication                   Publisher  \
 0    Mark P. O. Morford                2002     Oxford University Press   
 1  Richard Bruce Wright                2001       HarperFlamingo Canada   
 2          Carlo D'Este       

In [3]:
# EDA - Users Dataset
users_info = {
    "Users Summary": users_df.describe(include='all'),
    "Users Missing Values": users_df.isnull().sum(),
    "Unique Locations": users_df['Location'].nunique(),
    "Age Distribution": users_df['Age'].describe()
}

# EDA - Books Dataset
books_info = {
    "Books Summary": books_df.describe(include='all'),
    "Books Missing Values": books_df.isnull().sum(),
    "Unique Authors": books_df['Book-Author'].nunique(),
    "Publication Year Distribution": books_df['Year-Of-Publication'].describe()
}

# EDA - Ratings Dataset
ratings_info = {
    "Ratings Summary": ratings_df.describe(),
    "Ratings Missing Values": ratings_df.isnull().sum(),
    "Rating Distribution": ratings_df['Book-Rating'].value_counts()
}

(users_info, books_info, ratings_info)


({'Users Summary':              User-ID                         Location            Age
  count   278858.00000                           278858  168096.000000
  unique           NaN                            57339            NaN
  top              NaN  london, england, united kingdom            NaN
  freq             NaN                             2506            NaN
  mean    139429.50000                              NaN      34.751434
  std      80499.51502                              NaN      14.428097
  min          1.00000                              NaN       0.000000
  25%      69715.25000                              NaN      24.000000
  50%     139429.50000                              NaN      32.000000
  75%     209143.75000                              NaN      44.000000
  max     278858.00000                              NaN     244.000000,
  'Users Missing Values': User-ID          0
  Location         0
  Age         110762
  dtype: int64,
  'Unique Locations': 57339

### PreProcesamiento de la data

In [4]:
# Users Dataset Preprocessing
# Remove unrealistic ages
users_df_cleaned = users_df[(users_df.Age >= 5) & (users_df.Age <= 100)]

# Fill missing ages with median value of the cleaned dataset
median_age = users_df_cleaned.Age.median()
users_df_cleaned.Age.fillna(median_age, inplace=True)

# Books Dataset Preprocessing
# Convert 'Year-Of-Publication' to numeric, coerce errors to NaN, and then convert NaNs to the median year
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
median_year = books_df['Year-Of-Publication'].median()
books_df['Year-Of-Publication'].fillna(median_year, inplace=True)
books_df['Year-Of-Publication'] = books_df['Year-Of-Publication'].astype(int)

# Replace missing values in 'Book-Author' and 'Publisher' with 'Unknown'
books_df['Book-Author'].fillna('Unknown', inplace=True)
books_df['Publisher'].fillna('Unknown', inplace=True)

# Ratings Dataset Preprocessing
# For now, let's keep the ratings as they are, including the 0 ratings, and decide on the approach when building the recommendation system

# Checking the results of the preprocessing
preprocessed_data = {
    "Users Cleaned": users_df_cleaned.head(),
    "Users Missing Ages After Cleaning": users_df_cleaned['Age'].isnull().sum(),
    "Books Cleaned": books_df.head(),
    "Books Missing Values After Cleaning": books_df.isnull().sum()
}

preprocessed_data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_df_cleaned.Age.fillna(median_age, inplace=True)


{'Users Cleaned':     User-ID                        Location   Age
 1         2       stockton, california, usa  18.0
 3         4       porto, v.n.gaia, portugal  17.0
 5         6   santa monica, california, usa  61.0
 9        10      albacete, wisconsin, spain  26.0
 10       11  melbourne, victoria, australia  14.0,
 'Users Missing Ages After Cleaning': 0,
 'Books Cleaned':          ISBN                                         Book-Title  \
 0  0195153448                                Classical Mythology   
 1  0002005018                                       Clara Callan   
 2  0060973129                               Decision in Normandy   
 3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
 4  0393045218                             The Mummies of Urumchi   
 
             Book-Author  Year-Of-Publication                   Publisher  \
 0    Mark P. O. Morford                 2002     Oxford University Press   
 1  Richard Bruce Wright                 2001   

### Sistema de recomendacion basado en contenido

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load a smaller subset of the Books dataset for content-based recommendation system
# Due to memory constraints, we will work with the first 1000 books
books_df_subset = pd.read_csv('Books.csv', nrows=1000)

# Use TF-IDF to convert the 'Book-Title' into a matrix of TF-IDF features
tfidf_vectorizer_subset = TfidfVectorizer(stop_words='english')
tfidf_matrix_subset = tfidf_vectorizer_subset.fit_transform(books_df_subset['Book-Title'].fillna(''))

# Compute the cosine similarity matrix based on the tfidf_matrix
cosine_sim_matrix_subset = linear_kernel(tfidf_matrix_subset, tfidf_matrix_subset)

# Function to get recommendations based on the cosine similarity score of book titles
def get_content_based_recommendations_subset(title, cosine_sim=cosine_sim_matrix_subset):
    # Get the index of the book that matches the title
    idx = books_df_subset.index[books_df_subset['Book-Title'] == title].tolist()[0]

    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar books
    sim_scores = sim_scores[1:11]

    # Get the book indices
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return books_df_subset['Book-Title'].iloc[book_indices]

# Testing the content-based recommendation function with a book title from the subset dataset
test_book_title_subset = books_df_subset['Book-Title'].iloc[0]  # Get the first book title for testing
content_based_recommendations_subset = get_content_based_recommendations_subset(test_book_title_subset)

content_based_recommendations_subset


1                                          Clara Callan
2                                  Decision in Normandy
3     Flu: The Story of the Great Influenza Pandemic...
4                                The Mummies of Urumchi
5                                The Kitchen God's Wife
6     What If?: The World's Foremost Military Histor...
7                                       PLEADING GUILTY
8     Under the Black Flag: The Romance and the Real...
9               Where You'll Find Me: And Other Stories
10                          Nights Below Station Street
Name: Book-Title, dtype: object

In [6]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Load a smaller subset of the Ratings dataset for collaborative filtering recommendation system
# We will work with the first 1000 ratings to fit within memory constraints
ratings_df_subset = pd.read_csv('Ratings.csv', nrows=1000)

# Create the user-item interaction matrix
interaction_matrix = ratings_df_subset.pivot(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)

# Convert the user-item interaction matrix to a sparse matrix format to save memory
sparse_interaction_matrix = csr_matrix(interaction_matrix.values)

# Initialize and fit the NearestNeighbors model (which is a memory-based collaborative filtering model)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(sparse_interaction_matrix)

# Function to get book recommendations for a user using the collaborative filtering model
def get_collaborative_filtering_recommendations(user_id, interaction_matrix, model_knn):
    # Get the index of the user for which the recommendations are to be made
    user_idx = interaction_matrix.index.tolist().index(user_id)

    # Get the user interaction vector for the user
    user_interaction_vector = interaction_matrix.values[user_idx, :].reshape(1, -1)

    # Get the nearest neighbors (i.e., similar users)
    distances, indices = model_knn.kneighbors(user_interaction_vector, n_neighbors=10)

    # Initialize a list to store the ISBNs of the recommended books
    recommended_books = []

    # Loop through the indices of the neighbors
    for idx in indices.flatten():
        # Find books that the neighbor has rated 5 stars
        high_rated_books = interaction_matrix.columns[interaction_matrix.iloc[idx] == 5].tolist()
        recommended_books.extend(high_rated_books)

    # Remove books that the user has already rated
    rated_books = interaction_matrix.columns[interaction_matrix.iloc[user_idx] > 0].tolist()
    recommended_books = list(set(recommended_books) - set(rated_books))

    # Return the recommended books
    return recommended_books

# Testing the collaborative filtering recommendation function with a user from the subset dataset
test_user_id_subset = ratings_df_subset['User-ID'].iloc[0]  # Get the first user ID for testing
collaborative_filtering_recommendations_subset = get_collaborative_filtering_recommendations(
    test_user_id_subset, interaction_matrix, model_knn
)

collaborative_filtering_recommendations_subset


['342677609X', '0805047379']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine the 'Book-Author' and 'Publisher' columns into a single string for each book
# We'll fill any NaN values with empty strings to ensure the vectorizer works correctly
books_df['content'] = books_df['Book-Author'].fillna('') + ' ' + books_df['Publisher'].fillna('')

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Create the TF-IDF feature matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(books_df['content'])

# Display the shape of the TF-IDF matrix to understand its size
tfidf_matrix.shape


(271360, 54504)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Select a random subset of the books to avoid MemoryError
# We'll take a subset of 5000 books for the demonstration
subset_size = 5000
books_subset = books_df.sample(n=subset_size, random_state=42).reset_index(drop=True)

# Create the TF-IDF feature matrix for the subset
tfidf_matrix_subset = tfidf_vectorizer.transform(books_subset['content'])

# Compute the cosine similarity matrix from the TF-IDF matrix for the subset
cosine_sim_matrix_subset = cosine_similarity(tfidf_matrix_subset, tfidf_matrix_subset)

# Function to get book recommendations based on content similarity for the subset
def get_content_based_recommendations_subset(book_idx, books_subset, cosine_sim_matrix, top_n=10):
    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim_matrix[book_idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top-n most similar books
    sim_scores = sim_scores[1:top_n+1]

    # Get the book indices
    book_indices = [i[0] for i in sim_scores]

    # Return the top-n most similar books
    return books_subset.iloc[book_indices]

# Test the function with an example book index from our subset
example_index = 0  # Using the first book in the subset for testing
recommended_books_subset = get_content_based_recommendations_subset(example_index, books_subset, cosine_sim_matrix_subset)
recommended_books_subset[['ISBN', 'Book-Title', 'Book-Author', 'Publisher']]

Unnamed: 0,ISBN,Book-Title,Book-Author,Publisher
1599,671898213,Baedeker London (Baedekers City Guides),SONS,Macmillan General Reference
1302,394826256,Walt Disney's Snow White and the Seven Dwarfs ...,Random House Editors,Random House Childrens Books
1307,439148790,Can You Hear a Shout in Space?: Questions and ...,Melvin Berger,Scholastic Reference
4675,553502662,Disobedience,Jane Hamilton,Random House Audio Publishing Group
2349,517035820,Agatha Christie: Five Classic Murder Mysteries,Agatha Christie,Random House Value Publishing
429,679431152,Disclosure,Michael Crichton,Random House Audio Publishing Group
1294,553473425,Bbc Presents: Middlemarch,George Eliot,Random House Audio Publishing Group
2159,671850245,The Complete Vampire Companion,Rosemary Ellen Guiley,Macmillan General Reference
3541,679412093,The Trail Home: Essays,John Daniel,Random House Inc
4282,679458182,Personal History,Katharine Graham,Random House Audio Publishing Group
