## TF-IDF recommend systems
### Playground for intensive, day 1

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances, cosine_distances


## Local strings vectorizing and making similarity matrix

In [46]:
local_strings = [
    'hello world',
    'goodbye world',
    'hello universe',
    'goodbye universe',
    'hello goodbye',
    'hey hey hey',
    'hello hello hello',
]

local_tfidf = TfidfVectorizer(stop_words='english')
local_tfidf_matrix = local_tfidf.fit_transform(local_strings)

local_df = pd.DataFrame(local_tfidf_matrix.toarray(), columns=local_tfidf.get_feature_names_out(), index=local_strings)
local_df

Unnamed: 0,goodbye,hello,hey,universe,world
hello world,0.0,0.59594,0.0,0.0,0.803029
goodbye world,0.64975,0.0,0.0,0.0,0.760148
hello universe,0.0,0.59594,0.0,0.803029,0.0
goodbye universe,0.64975,0.0,0.0,0.760148,0.0
hello goodbye,0.755113,0.655595,0.0,0.0,0.0
hey hey hey,0.0,0.0,1.0,0.0,0.0
hello hello hello,0.0,1.0,0.0,0.0,0.0


In [47]:
cosine_sim = cosine_similarity(local_tfidf_matrix)

cosine_sim_df = pd.DataFrame(cosine_sim, columns=local_strings, index=local_strings)
cosine_sim_df

Unnamed: 0,hello world,goodbye world,hello universe,goodbye universe,hello goodbye,hey hey hey,hello hello hello
hello world,1.0,0.610421,0.355145,0.0,0.390695,0.0,0.59594
goodbye world,0.610421,1.0,0.0,0.422175,0.490634,0.0,0.0
hello universe,0.355145,0.0,1.0,0.610421,0.390695,0.0,0.59594
goodbye universe,0.0,0.422175,0.610421,1.0,0.490634,0.0,0.0
hello goodbye,0.390695,0.490634,0.390695,0.490634,1.0,0.0,0.655595
hey hey hey,0.0,0.0,0.0,0.0,0.0,1.0,0.0
hello hello hello,0.59594,0.0,0.59594,0.0,0.655595,0.0,1.0


## Books dataset vectorization and similarity
### Getting recommendations by book id

In [19]:
df = pd.read_csv('datasets/book-crossing/Books.csv', sep=';', on_bad_lines='skip')
df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [73]:
user_ratings = pd.read_csv('datasets/book-crossing/users-ratings.csv', on_bad_lines='skip')

df = df[df['ISBN'].isin(user_ratings['ISBN'])]
user_ratings = user_ratings[user_ratings['ISBN'].isin(df['ISBN'])]

df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
0,440234743,the testament,john grisham,1999,dell,the testament john grisham dell
1,452264464,beloved (plume contemporary fiction),toni morrison,1994,plume,beloved (plume contemporary fiction) toni morr...
2,971880107,wild animus,rich shapero,2004,too far,wild animus rich shapero too far
3,345402871,airframe,michael crichton,1997,ballantine books,airframe michael crichton ballantine books
4,345417623,timeline,michael crichton,2000,ballantine books,timeline michael crichton ballantine books


In [74]:
# Drop rows with missing values
df.dropna(subset=['Title', 'Author'], inplace=True)

# To lower case title and author
df['Title'] = df['Title'].str.lower()
df['Author'] = df['Author'].apply(lambda x: x.lower())
df['Publisher'] = df['Publisher'].apply(lambda x: x.lower())

# Create a new column with title and author concatenated, and empty values filled with empty string
df['text'] = df['Title'].fillna('') + ' ' + df['Author'].fillna('') + ' ' + df['Publisher'].fillna('')

# Drop duplicates
df.drop_duplicates(subset=['text'], inplace=True)

df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
0,440234743,the testament,john grisham,1999,dell,the testament john grisham dell
1,452264464,beloved (plume contemporary fiction),toni morrison,1994,plume,beloved (plume contemporary fiction) toni morr...
2,971880107,wild animus,rich shapero,2004,too far,wild animus rich shapero too far
3,345402871,airframe,michael crichton,1997,ballantine books,airframe michael crichton ballantine books
4,345417623,timeline,michael crichton,2000,ballantine books,timeline michael crichton ballantine books


In [75]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

# Cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

# Other similarity metrics

euclidean_sim = euclidean_distances(tfidf_matrix)

manhattan_sim = manhattan_distances(tfidf_matrix)

In [144]:
def get_recommendations(isbn, cosine_sim=cosine_sim, top_n=10):
    idx = df[df['ISBN'] == isbn].index
    item = df[df['ISBN'] == isbn]['text']
    print(f'Looking recommendations for {item} book')
   
    if len(idx) == 0:
        return None
    
    idx = idx[0]

    # Get similarity scores for the book, enumerate to keep track of the index
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top n most similar books
    sim_scores = sim_scores[:top_n]
    # sim_scores = sim_scores[1:top_n+1]
    
    book_indices = [i[0] for i in sim_scores]

    new_df = df.iloc[book_indices].copy()
    new_df['similarity'] = [i[1] for i in sim_scores]

    return new_df

In [77]:
get_recommendations('0452264464')

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text,similarity
1,0452264464,beloved (plume contemporary fiction),toni morrison,1994,plume,beloved (plume contemporary fiction) toni morr...,1.0
1892,0452269652,jazz (plume contemporary fiction),toni morrison,1993,plume books,jazz (plume contemporary fiction) toni morriso...,0.835105
978,0452280621,beloved,toni morrison,1998,plume books,beloved toni morrison plume books,0.815703
1614,0452282195,the bluest eye,toni morrison,2000,plume books,the bluest eye toni morrison plume books,0.527401
730,0452268060,how the garcia girls lost their accents (plume...,julia alvarez,1992,plume books,how the garcia girls lost their accents (plume...,0.496464
583,0452260116,song of solomon (oprah's book club (paperback)),toni morrison,1987,plume books,song of solomon (oprah's book club (paperback)...,0.453765
1716,0452284295,cheet (plume books),anna davis,2003,plume books,cheet (plume books) anna davis plume books,0.383541
1272,0679433740,paradise,toni morrison,1998,alfred a. knopf,paradise toni morrison alfred a. knopf,0.333949
255,0140077022,white noise (contemporary american fiction),don delillo,1991,penguin books,white noise (contemporary american fiction) do...,0.247786
254,014016930X,angle of repose (contemporary american fiction),wallace stegner,1992,penguin books,angle of repose (contemporary american fiction...,0.230597


### Getting recommendations by user id

In [176]:
def get_recommendations_for_user(user_id, top_n=10):
    selected_user_ratings = user_ratings[user_ratings['User-ID'] == user_id]

    print(f'User {user_id} has rated {selected_user_ratings.shape[0]} books')

    selected_user_ratings_part = selected_user_ratings.sample(frac=0.7)
    selected_user_ratings_other_part = selected_user_ratings.drop(selected_user_ratings_part.index)

    # Sort by rating
    selected_user_ratings_part = selected_user_ratings_part.sort_values(by=["Rating"], ascending=False)

    # Took first 10 books rated by the user
    selected_user_ratings_part = selected_user_ratings_part.head(3)

    recommendations = pd.DataFrame()

    for isbn in selected_user_ratings_part['ISBN']:
        recommendations = pd.concat([recommendations, get_recommendations(isbn, top_n=3)], ignore_index=True)

    print(f'Found {recommendations.shape} recommendations')
    
    recommendations = recommendations[~recommendations['ISBN'].isin(selected_user_ratings_part['ISBN'])]
    recommendations = recommendations.drop_duplicates(subset=['ISBN'])

    # Check how many recommended books the user has already rated in the other part of the dataset
    rated_recommendations = recommendations[~recommendations['ISBN'].isin(selected_user_ratings_other_part['ISBN'])]
    print(f'User has already rated {rated_recommendations.shape[0]}/{recommendations.shape[0]} recommended books')

    return recommendations.head(top_n)


In [131]:
random_user = user_ratings.sample()
random_user

Unnamed: 0,User-ID,Age,ISBN,Rating
64293,132083,47,385504209,5


In [193]:
random_user = user_ratings.sample()
get_recommendations_for_user(random_user['User-ID'].iloc[0])

User 123883 has rated 81 books
Looking recommendations for 1127    truly, madly manhattan nora roberts silhouette
Name: text, dtype: object book
Looking recommendations for 1257    summer pleasures nora roberts silhouette
Name: text, dtype: object book
Looking recommendations for 783    heaven and earth (three sisters island trilogy...
Name: text, dtype: object book
Found (9, 7) recommendations
User has already rated 3/4 recommended books


Unnamed: 0,ISBN,Title,Author,Year,Publisher,text,similarity
1,0373484410,time and again,nora roberts,2001,silhouette,time and again nora roberts silhouette,0.468103
2,0373218400,table for two,nora roberts,2002,silhouette,table for two nora roberts silhouette,0.415525
7,051513287X,face the fire (three sisters island trilogy),nora roberts,2002,jove books,face the fire (three sisters island trilogy) n...,0.721635
8,0515131229,dance upon the air (three sisters island trilogy),nora roberts,2003,jove books,dance upon the air (three sisters island trilo...,0.657861
