In [1]:
import pandas as pd

df=pd.read_csv("books_and_genres.csv")

In [None]:
pip install langdetect

In [22]:
from langdetect import detect

# Ensure non-string values like NaN are handled before applying the detection function
df['lang'] = df['text'].apply(lambda x: detect(x) if isinstance(x, str) else 'unknown')

# Filter the DataFrame to keep only English ('en') books
df = df[df['lang'] == 'en']

# Drop the 'lang' column 
df.drop(columns=['lang'], inplace=True)

# Check how many books were removed
print(f"{len(df)} books remain after filtering for English language.")


9102 books remain after filtering for English language.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['lang'], inplace=True)


In [23]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,genres
0,0,apocolocyntosis,"Produced by Ted Garvin, Ben Courtney and PG Di...","{'21st-century', 'history', 'roman', 'classics..."
1,1,the house on the borderland,"Produced by Suzanne Shell, Sjaani and PG Distr...","{'horror', 'mystery', 'classics', 'science-fic..."
2,2,the warriors,"Produced by Charles Aldarondo, Charlie Kirschn...","{'literary-fiction', 'history', 'biography', '..."
3,3,a voyage to the moon,"Produced by Christine De Ryck, Stig M. Valstad...","{'20th-century', 'science-fiction', 'speculati..."
4,4,la fiammetta,"Produced by Ted Garvin, Dave Morgan and PG Dis...","{'literary-fiction', 'history', 'feminism', 'c..."


In [27]:
import re
def clean_text(text):
    # Remove curly braces and single quotes using regex
    text = re.sub(r"[{}']", '', text)
    # Remove commas and extra spaces
  
    return text.strip()  # Return the cleaned text without leading/trailing spaces

# Apply the function to the entire column
df['cleaned_text'] = df['genres'].apply(clean_text)



In [28]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: [genre.strip() for genre in x.split(',')])

# Step 2: Get a set of unique genres across the entire DataFrame
unique_genres = set(genre for sublist in df['cleaned_text'] for genre in sublist)

# Step 3: Create one-hot encoded columns for each unique genre
for genre in unique_genres:
    df[genre] = df['cleaned_text'].apply(lambda x: 1 if genre in x else 0)


In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,genres,cleaned_text,novella,education,romance,suspense,college,...,romantic-suspense,mythology,realistic-fiction,science-fiction,relationships,historical-fiction,bdsm,feminism,love,business
0,0,apocolocyntosis,"Produced by Ted Garvin, Ben Courtney and PG Di...","{'21st-century', 'history', 'roman', 'classics...","[21st-century, history, roman, classics, relig...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,the house on the borderland,"Produced by Suzanne Shell, Sjaani and PG Distr...","{'horror', 'mystery', 'classics', 'science-fic...","[horror, mystery, classics, science-fiction, f...",0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2,the warriors,"Produced by Charles Aldarondo, Charlie Kirschn...","{'literary-fiction', 'history', 'biography', '...","[literary-fiction, history, biography, family,...",0,0,0,0,1,...,0,1,0,0,0,1,0,1,0,0
3,3,a voyage to the moon,"Produced by Christine De Ryck, Stig M. Valstad...","{'20th-century', 'science-fiction', 'speculati...","[20th-century, science-fiction, speculative-fi...",0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4,la fiammetta,"Produced by Ted Garvin, Dave Morgan and PG Dis...","{'literary-fiction', 'history', 'feminism', 'c...","[literary-fiction, history, feminism, classics...",0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [30]:
from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_similarity


In [31]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'genres', 'cleaned_text', 'novella',
       'education', 'romance', 'suspense', 'college',
       ...
       'romantic-suspense', 'mythology', 'realistic-fiction',
       'science-fiction', 'relationships', 'historical-fiction', 'bdsm',
       'feminism', 'love', 'business'],
      dtype='object', length=105)

In [32]:
genre_columns = df.drop(columns=['Unnamed: 0', 'title', 'text', 'genres', 'cleaned_text'], axis=1)

In [33]:
genre_columns.head()

Unnamed: 0,novella,education,romance,suspense,college,horror,literature,speculative-fiction,animals,sports,...,romantic-suspense,mythology,realistic-fiction,science-fiction,relationships,historical-fiction,bdsm,feminism,love,business
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,1,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
cosine_sim=cosine_similarity(genre_columns)

In [35]:
cosine_sim_df_genre=pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])
cosine_sim_df_genre.head()

title,apocolocyntosis,the house on the borderland,the warriors,a voyage to the moon,la fiammetta,carmilla,the mystery,the mountains of california,beneath the banner,gaslight sonatas,...,way of the lawless,an explanation of luthers small catechism,the extra day,young canadas nursery rhymes,the young explorer,young folks history,yollop,sketches of young couples,the yosemite,the eye of zeitoon
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
apocolocyntosis,1.0,0.267261,0.433013,0.0,0.5547,0.319801,0.334077,0.375,0.25,0.0,...,0.288675,0.441942,0.223607,0.375,0.0,0.288675,0.288675,0.5,0.334077,0.088388
the house on the borderland,0.267261,1.0,0.360041,0.46291,0.2965,0.683763,0.5,0.267261,0.267261,0.188982,...,0.308607,0.188982,0.597614,0.267261,0.0,0.308607,0.154303,0.267261,0.285714,0.283473
the warriors,0.433013,0.360041,1.0,0.111111,0.587137,0.492366,0.46291,0.529238,0.19245,0.136083,...,0.222222,0.272166,0.344265,0.19245,0.0,0.333333,0.222222,0.44905,0.514344,0.272166
a voyage to the moon,0.0,0.46291,0.111111,1.0,0.160128,0.123091,0.154303,0.144338,0.0,0.408248,...,0.0,0.0,0.258199,0.0,0.0,0.333333,0.0,0.19245,0.154303,0.204124
la fiammetta,0.5547,0.2965,0.587137,0.160128,1.0,0.295656,0.370625,0.485363,0.27735,0.0,...,0.320256,0.392232,0.372104,0.416025,0.0,0.480384,0.160128,0.46225,0.44475,0.196116


In [36]:
search_title = 'GATSBY'  # Replace with the title you're searching for
book_found = df[df['title'].str.contains(search_title, case=False, na=False)]

# Check if any book matches
if not book_found.empty:
    print(f"Book '{search_title}' found in the database:")
    print(book_found)
else:
    print(f"Book '{search_title}' not found in the database.")

Book 'GATSBY' not found in the database.


In [37]:
# BASED ON GENRE
target_book = 'pride and prejudice'

# Rank books by similarity to the target book
similar_books = cosine_sim_df_genre[target_book].sort_values(ascending=False)

top_10_books = similar_books[1:11]  # Skip the first one since it's the target book itself
S
# Print the top 10 books
print(top_10_books)

title
sense and sensibility        0.914659
persuasion                   0.912871
the mayor of casterbridge    0.912871
jan                          0.895979
rome                         0.865181
great expectations           0.857493
tales                        0.857493
a tale of two cities         0.857493
silas marner                 0.852013
the professor                0.849837
Name: pride and prejudice, dtype: float64


In [38]:
df_og=pd.read_csv("books_and_genres.csv")
df_og.head()

Unnamed: 0.1,Unnamed: 0,title,text,genres
0,0,apocolocyntosis,"Produced by Ted Garvin, Ben Courtney and PG Di...","{'21st-century', 'history', 'roman', 'classics..."
1,1,the house on the borderland,"Produced by Suzanne Shell, Sjaani and PG Distr...","{'horror', 'mystery', 'classics', 'science-fic..."
2,2,the warriors,"Produced by Charles Aldarondo, Charlie Kirschn...","{'literary-fiction', 'history', 'biography', '..."
3,3,a voyage to the moon,"Produced by Christine De Ryck, Stig M. Valstad...","{'20th-century', 'science-fiction', 'speculati..."
4,4,la fiammetta,"Produced by Ted Garvin, Dave Morgan and PG Dis...","{'literary-fiction', 'history', 'feminism', 'c..."


In [42]:
# Create a new column 'word_count' that counts the number of words in the 'text' column
df_og['word_count'] = df_og['text'].apply(lambda x: len(str(x).split()) if isinstance(x, str) else 0)

# Display the first few rows to check the result
print(df_og[['text', 'word_count']].head())


                                                text  word_count
0  Produced by Ted Garvin, Ben Courtney and PG Di...        5572
1  Produced by Suzanne Shell, Sjaani and PG Distr...       50953
2  Produced by Charles Aldarondo, Charlie Kirschn...       48061
3  Produced by Christine De Ryck, Stig M. Valstad...       70406
4  Produced by Ted Garvin, Dave Morgan and PG Dis...       13279


In [43]:
from sklearn.metrics import euclidean_distances

In [45]:
# Step 1: Reshape the word_count column to be 2D, as required by euclidean_distances
word_count_2d = df_og['word_count'].values.reshape(-1, 1)

# Step 2: Calculate Euclidean distance matrix based on word count
euclidean_dist_matrix = euclidean_distances(word_count_2d)

# Step 3: Convert Euclidean distance to Euclidean similarity
euclidean_sim_matrix = 1 / (1 + euclidean_dist_matrix)

# Step 4: Convert the Euclidean similarity matrix into a DataFrame for better readability
euclidean_sim_df = pd.DataFrame(euclidean_sim_matrix, index=df_og['title'], columns=df_og['title'])

# Display the Euclidean similarity DataFrame
print(euclidean_sim_df.head())

title                        apocolocyntosis  the house on the borderland  \
title                                                                       
apocolocyntosis                     1.000000                     0.000022   
the house on the borderland         0.000022                     1.000000   
the warriors                        0.000024                     0.000346   
a voyage to the moon                0.000015                     0.000051   
la fiammetta                        0.000130                     0.000027   

title                        the warriors  a voyage to the moon  la fiammetta  \
title                                                                           
apocolocyntosis                  0.000024              0.000015      0.000130   
the house on the borderland      0.000346              0.000051      0.000027   
the warriors                     1.000000              0.000045      0.000029   
a voyage to the moon             0.000045              

In [46]:
# BASED ON wordcount
target_book = 'the adventures of sherlock holmes'

# Rank books by similarity to the target book
similar_books_by_word_count = euclidean_sim_df[target_book].sort_values(ascending=False) 

top_10_books = similar_books_by_word_count[1:11]  # Skip the first one since it's the target book itself

# Print the top 10 books
print(top_10_books)

title
the winning of the west    0.052632
the evolution of man v     0.040000
cara                       0.038462
the reason why             0.020408
mensonges                  0.015625
poetry of wordsworth       0.015152
kim                        0.012500
wild animals               0.011628
a fool and his money       0.011364
love and life              0.011111
Name: the adventures of sherlock holmes, dtype: float64


In [47]:
df['genre_count'] = genre_columns.sum(axis=1)

print(df[['title', 'genre_count']].head())

                         title  genre_count
0              apocolocyntosis           16
1  the house on the borderland           14
2                 the warriors           27
3         a voyage to the moon            3
4                 la fiammetta           13


  df['genre_count'] = genre_columns.sum(axis=1)


In [60]:


# Step 1: Reshape the genre_count column to 2D, as required by euclidean_distances
genre_count_2d = df['genre_count'].values.reshape(-1, 1)

# Step 2: Calculate Euclidean distance matrix based on genre count
euclidean_dist_genre_count = euclidean_distances(genre_count_2d)

# Step 3: Convert Euclidean distance to Euclidean similarity
euclidean_sim_genre_count = 1 / (1 + euclidean_dist_genre_count)

# Step 4: Convert the Euclidean similarity matrix into a DataFrame for readability
euclidean_sim_genre_count_df = pd.DataFrame(euclidean_sim_genre_count, index=df['title'], columns=df['title'])

# Display the Euclidean similarity DataFrame for genre count
print(euclidean_sim_genre_count_df.head())


title                        apocolocyntosis  the house on the borderland  \
title                                                                       
apocolocyntosis                     1.000000                     0.333333   
the house on the borderland         0.333333                     1.000000   
the warriors                        0.083333                     0.071429   
a voyage to the moon                0.071429                     0.083333   
la fiammetta                        0.250000                     0.500000   

title                        the warriors  a voyage to the moon  la fiammetta  \
title                                                                           
apocolocyntosis                  0.083333              0.071429      0.250000   
the house on the borderland      0.071429              0.083333      0.500000   
the warriors                     1.000000              0.040000      0.066667   
a voyage to the moon             0.040000              

In [63]:
# BASED ON Genre Count
target_book = 'dracula'

# Rank books by similarity to the target book
similar_books_by_genre_count = euclidean_sim_genre_count_df[target_book].sort_values(ascending=False) 

top_10_books = similar_books_by_genre_count[2:12]  # Skip the first one since it's the target book itself

# Print the top 10 books
print(top_10_books)

title
harrison                        1.0
the bride                       1.0
the mysterious stranger         1.0
timeline                        1.0
nobody                          1.0
the napoleon of notting hill    1.0
human comedy                    1.0
the white devil                 1.0
twelve men                      0.5
vergil                          0.5
Name: dracula, dtype: float64
