In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Generate a synthetic library of books
Define the list of possible values for each column. I do not agree with this genre list, but I also can't be bothered to make a better one.

In [5]:
# Candidate values for each categorical feature shared by books and users.
genres = [
    'Classical Literature',
    'Mystery',
    'Thriller',
    'Horror',
    'Historical',
    'Romance',
    'Western',
    'Bildungsroman',
    'Science Fiction',
    'Fantasy',
    'Dystopian',
    'Magical Realism',
    'Realism',
]
lengths = [
    'Short',
    'Long',
    'Very Long',
]
periods = [
    'Old',
    'Modern',
    'Very Modern',
]
endings = [
    'Happy',
    'Sad',
    'Confusing',
]

## Create dataframe of books
Make N random books. Create a dictionary with column names as keys and empty lists as values. Loop through N rows and randomly select a feature for each column.

In [46]:
def _books(N, X):
    """Build a DataFrame of N synthetic books with random features.

    Parameters
    ----------
    N : int
        Number of books to generate; book IDs run from 1 to N.
    X : int
        Seed passed to ``np.random.seed`` (reseeds NumPy's global RNG).

    Returns
    -------
    pandas.DataFrame
        Columns 'Book', 'Genre', 'Length', 'Period', 'Ending'. Feature values
        are drawn from the module-level lists genres, lengths, periods, endings.
    """
    np.random.seed(X)
    pools = {'Genre': genres, 'Length': lengths, 'Period': periods, 'Ending': endings}
    columns = {'Book': [], **{name: [] for name in pools}}
    for book_id in range(1, N + 1):
        columns['Book'].append(book_id)
        # Draw the features row by row, in column order, so the random stream
        # is consumed in a fixed sequence for a given seed.
        for name, pool in pools.items():
            columns[name].append(np.random.choice(pool))
    return pd.DataFrame(columns)

In [47]:
# Quick check: build five books from seed 1 and peek at the first one.
test_books = _books(N=5, X=1)
test_books.head(1)

Unnamed: 0,Book,Genre,Length,Period,Ending
0,1,Romance,Short,Old,Sad


## Create dataframe of users
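Loop through N users and randomly select a preferred genre for each. For each of the other features (length, period, ending), randomly select either a preference or `None`, meaning the user has no preference for that feature.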

In [48]:
def _users(N, X):
    """Build a DataFrame of N synthetic users with random preferences.

    Every user gets a preferred genre. For length, period and ending the draw
    also includes ``None``, which stands for "no preference".

    Parameters
    ----------
    N : int
        Number of users to generate; user IDs run from 1 to N.
    X : int
        Seed passed to ``np.random.seed`` (reseeds NumPy's global RNG).

    Returns
    -------
    pandas.DataFrame
        Columns 'User', 'Genre', 'Length', 'Period', 'Ending'. Values are drawn
        from the module-level lists genres, lengths, periods, endings.
    """
    np.random.seed(X)
    pools = {
        'Genre': genres,
        'Length': lengths + [None],
        'Period': periods + [None],
        'Ending': endings + [None],
    }
    columns = {'User': [], **{name: [] for name in pools}}
    for user_id in range(1, N + 1):
        columns['User'].append(user_id)
        # Draw the preferences user by user, in column order, so the random
        # stream is consumed in a fixed sequence for a given seed.
        for name, pool in pools.items():
            columns[name].append(np.random.choice(pool))
    return pd.DataFrame(columns)

In [49]:
# Quick check: build five users from seed 1 and peek at the first one.
test_users = _users(N=5, X=1)
test_users.head(1)

Unnamed: 0,User,Genre,Length,Period,Ending
0,1,Romance,,Old,Happy


## Who's reading what?
Everyone is reading all the books in their preferred genre, as well as some random books.

In [50]:
# Populations used for the rest of the notebook: 30 users and 50 books, both from seed 1.
main_users = _users(N=30, X=1)
main_books = _books(N=50, X=1)

In [51]:
def _reading(N, X):
    np.random.seed(X)
    merger = pd.merge(main_users, main_books, on='Genre')
    random_df = pd.DataFrame({
        'User': [np.random.randint(1, len(main_users)+1) for i in range(N)],
        'Book': [np.random.randint(1, len(main_books)+1) for i in range(N)]})
    df = pd.concat([merger[['User', 'Book']], random_df], ignore_index=True)
    df = df.drop_duplicates()
    secondmerger = pd.merge(df, main_users, on='User')
    df = pd.merge(secondmerger, main_books, on='Book')
    return df

In [52]:
# Genre-matched pairs plus 500 random draws, seed 1.
test_reading = _reading(N=500, X=1)
test_reading

Unnamed: 0,User,Book,Genre_x,Length_x,Period_x,Ending_x,Genre_y,Length_y,Period_y,Ending_y
0,1,1,Romance,,Old,Happy,Romance,Short,Old,Sad
1,6,1,Romance,Very Long,Old,,Romance,Short,Old,Sad
2,25,1,Romance,Long,,Confusing,Romance,Short,Old,Sad
3,2,1,Fantasy,,Modern,,Romance,Short,Old,Sad
4,28,1,Fantasy,,,Happy,Romance,Short,Old,Sad
...,...,...,...,...,...,...,...,...,...,...
524,28,48,Fantasy,,,Happy,Magical Realism,Long,Modern,Sad
525,27,48,Bildungsroman,,,,Magical Realism,Long,Modern,Sad
526,23,48,Western,Long,,,Magical Realism,Long,Modern,Sad
527,26,48,Science Fiction,Short,Very Modern,Confusing,Magical Realism,Long,Modern,Sad


## Set up probabilities for how much the user liked the book
"You're a filthy frequentist!"

In [13]:
def _sim(df):
    columns = ['Genre', 'Length', 'Period', 'Ending']
    for col in columns:
        xx = df[col + '_x']
        yy = df[col + '_y']
        df[col] = np.where(pd.isna(xx) | pd.isna(yy), 0, np.where(xx == yy, 1, -1))
    df['sim'] = df[columns].sum(axis=1)
    return df

In [14]:
# Score every (user, book) pair and preview the first two rows.
test_sim = _sim(df=test_reading)
test_sim.head(2)

Unnamed: 0,User,Book,Genre_x,Length_x,Period_x,Ending_x,Genre_y,Length_y,Period_y,Ending_y,Genre,Length,Period,Ending,sim
0,1,1,Romance,,Old,Happy,Romance,Short,Old,Sad,1,0,1,-1,1
1,6,1,Romance,Very Long,Old,,Romance,Short,Old,Sad,1,-1,1,0,1


In [15]:
def _rating(df, X):
    np.random.seed(X)
    likely = 0.5+df['sim']*0.1
    mean = likely
    std_dev = likely * (1 - likely)   ## last minute add
    rating = np.random.normal(mean, std_dev)
    rating = np.maximum(0, np.minimum(1, rating)).round(2)
    df['rating'] = rating*100
    return df

In [16]:
# Draw ratings for every scored pair (seed 1) and preview the first three rows.
test_rating = _rating(df=test_sim, X=1)
test_rating.head(3)

Unnamed: 0,User,Book,Genre_x,Length_x,Period_x,Ending_x,Genre_y,Length_y,Period_y,Ending_y,Genre,Length,Period,Ending,sim,rating
0,1,1,Romance,,Old,Happy,Romance,Short,Old,Sad,1,0,1,-1,1,99.0
1,6,1,Romance,Very Long,Old,,Romance,Short,Old,Sad,1,-1,1,0,1,45.0
2,25,1,Romance,Long,,Confusing,Romance,Short,Old,Sad,1,-1,0,-1,-1,27.0


In [17]:
# Keep only the columns needed for the rating matrix.
test = test_rating.loc[:, ['User', 'Book', 'rating']]
test.head(3)

Unnamed: 0,User,Book,rating
0,1,1,99.0
1,6,1,45.0
2,25,1,27.0


In [18]:
# Book-by-user rating matrix; NaN marks books the user has not read.
matrix = test.pivot(values='rating', index='Book', columns='User')
matrix

User,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,99.0,7.0,,67.0,,45.0,,,,0.0,...,,,,,27.0,,,48.0,,
2,,,,34.0,11.0,,23.0,,,16.0,...,,,25.0,,,15.0,26.0,29.0,,
3,47.0,,,20.0,,,,,19.0,,...,,,,,,,,,,
4,,,,,,,,29.0,,,...,,44.0,53.0,,100.0,36.0,,,,
5,100.0,,7.0,,40.0,47.0,28.0,,,26.0,...,,,,,32.0,0.0,,,,
6,35.0,,,,,,,72.0,,,...,36.0,,,,,43.0,,,28.0,
7,,78.0,,,,,0.0,65.0,,58.0,...,,,22.0,,,73.0,,,,
8,,,,99.0,,39.0,,,19.0,,...,0.0,30.0,,49.0,,,62.0,56.0,,
9,,76.0,,,57.0,39.0,,,84.0,,...,71.0,,,58.0,,,,89.0,,
10,,40.0,,,96.0,,,,74.0,,...,30.0,,,,,,,33.0,48.0,


# Cosine similarity
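Because users may be differently harsh: cosine similarity compares the direction of two rating vectors and ignores their overall magnitude.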

In [70]:
def _cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

# Traditional Recommender
Take `preferred_book`, the ID of the user's preferred book. Get the ratings for the preferred book and for all other books. Calculate the cosine similarity between the preferred book and every other book. Sort the similarities in descending order. Get the top N most similar books.

In [76]:
def _similar(df, preferred_book, N=3):
    user_ratings = df.pivot(index='Book', columns='User', values='rating')
    user_ratings = user_ratings.fillna(0)
    preferred_book_ratings = user_ratings[preferred_book].values.reshape(1, -1)
    book_ratings = user_ratings.drop(columns=[preferred_book]).values
    
    similarities = _cosine_similarity(preferred_book_ratings, book_ratings)[0]
    
    sorted_indices = np.argsort(similarities)[::1]
    sorted_similarities = similarities[sorted_indices]
    
    top_books = user_ratings.drop(columns=[preferred_book]).columns[sorted_indices][:N]
    top_similarities = sorted_similarities[:N]
    list1 = [preferred_book]
    list2 = top_books.tolist()
    list3 = list1 + list2
    return list3

In [83]:
# Reference book 10 followed by its default three recommendations.
topbooks = _similar(df=test_rating, preferred_book=10)
topbooks

[10, 12, 13, 23]

In [84]:
# Look up the features of the returned books to sanity-check the recommendation.
backcheck = main_books.loc[main_books['Book'].isin(topbooks)].reset_index(drop=True)
backcheck

Unnamed: 0,Book,Genre,Length,Period,Ending
0,10,Fantasy,Long,Old,Happy
1,12,Horror,Very Long,Modern,Happy
2,13,Bildungsroman,Very Long,Modern,Sad
3,23,Classical Literature,Short,Modern,Sad


# Anti-Recommender
Take `preferred_book`, the ID of the user's preferred book. Get the ratings for the preferred book and for all other books. Calculate the cosine similarity between the preferred book and every other book. Sort the similarities in ascending order. Get the N least similar books.

In [85]:
def _dissimilar(df, preferred_book, N=3):
    user_ratings = df.pivot(index='Book', columns='User', values='rating')
    user_ratings = user_ratings.fillna(0)
    preferred_book_ratings = user_ratings[preferred_book].values.reshape(1, -1)
    book_ratings = user_ratings.drop(columns=[preferred_book]).values
    
    similarities = _cosine_similarity(preferred_book_ratings, book_ratings)[0]
    
    sorted_indices = np.argsort(similarities)
    sorted_similarities = similarities[sorted_indices]

    top_books = user_ratings.drop(columns=[preferred_book]).columns[sorted_indices][:N]
    top_similarities = sorted_similarities[:N]
    list1 = [preferred_book]
    list2 = top_books.tolist()
    list3 = list1 + list2
    return list3

In [86]:
# Reference book 10 followed by its default three anti-recommendations.
bottombooks = _dissimilar(df=test_rating, preferred_book=10)
bottombooks

[10, 12, 13, 23]

In [87]:
# Look up the features of the returned books to sanity-check the anti-recommendation.
backcheck2 = main_books.loc[main_books['Book'].isin(bottombooks)].reset_index(drop=True)
backcheck2

Unnamed: 0,Book,Genre,Length,Period,Ending
0,10,Fantasy,Long,Old,Happy
1,12,Horror,Very Long,Modern,Happy
2,13,Bildungsroman,Very Long,Modern,Sad
3,23,Classical Literature,Short,Modern,Sad
