In [1]:
import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [12]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Load the tab-separated book summaries. The file has no header row,
# so pandas labels the columns with integers.
df = pd.read_csv(
    'booksummaries.txt',
    sep='\t',
    header=None,
)

In [14]:
# Keep only the title (column 2) and description (column 6) columns,
# and give them readable names in the same step.
df = df[[2, 6]].rename(columns={2: "Title", 6: "Description"})

In [15]:
# Count missing values in the title and description columns.
(df['Title'].isna().sum(), df['Description'].isna().sum())

(0, 0)

In [16]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16559 entries, 0 to 16558
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        16559 non-null  object
 1   Description  16559 non-null  object
dtypes: object(2)
memory usage: 258.9+ KB


In [17]:
# Shared NLP resources used by preprocess_text: NLTK's English stop-word list
# (held in a set for fast membership tests) and a WordNet lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    """Normalize a raw description into a space-joined string of lemmas.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize, drop tokens found in
    ``stop_words``, and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    # The original character class used the range `A-z`, which also spans the
    # ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so those characters
    # were never stripped. The text is already lowercased, so `a-z` suffices.
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    # Stop-word filtering is done on the raw token, before lemmatization.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [10]:
# Sanity-check the cleaner on the description stored under row label 0.
preprocess_text(df.loc[0, 'Description'])

'old major old boar manor farm call animal farm meeting compare human parasite teach animal revolutionary song beast england major dy two young pig snowball napoleon assume command turn dream philosophy animal revolt drive drunken irresponsible mr jones farm renaming animal farm adopt seven commandment animalism important animal equal snowball attempt teach animal reading writing food plentiful farm run smoothly pig elevate position leadership set aside special food item ostensibly personal health napoleon take pup farm dog train privately napoleon snowball struggle leadership snowball announces plan build windmill napoleon dog chase snowball away declares leader napoleon enacts change governance structure farm replacing meeting committee pig run farm using young pig named squealer mouthpiece napoleon claim credit windmill idea animal work harder promise easier life windmill violent storm animal find windmill annihilated napoleon squealer convince animal snowball destroyed although sco

In [11]:
# Clean every description (progress_apply shows a tqdm progress bar) and
# write the result back into the same column.
cleaned_descriptions = df['Description'].progress_apply(preprocess_text)
df['Description'] = cleaned_descriptions

100%|███████████████████████████████████████████████████████████████████████████████| 16559/16559 [00:39<00:00, 422.27it/s]


0        old major old boar manor farm call animal farm...
1        alex teenager living nearfuture england lead g...
2        text plague divided five part town oran thousa...
3        argument enquiry proceeds series incremental s...
4        novel posit space around milky way divided con...
                               ...                        
16554    prue mckeel rescued brother dowager governess ...
16555    reader first meet rapp covert operation iran d...
16556    book follows rough chronological order switchi...
16557    colbert address topic including wall street ca...
16558    makar devushkin varvara dobroselova second cou...
Name: Description, Length: 16559, dtype: object

In [56]:
# TF-IDF vectorizer; scikit-learn's built-in English stop-word list is applied
# during vectorization (in addition to the NLTK list used by preprocess_text).
tfidf = TfidfVectorizer(stop_words='english')

In [57]:
# Fit the vectorizer on the cleaned descriptions and build the sparse
# document-term matrix (one row per book).
tfidf_matrix = tfidf.fit_transform(df['Description'])

In [58]:
# Inspect the matrix type, shape and number of stored elements.
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2348080 stored elements and shape (16559, 138566)>

In [59]:
# Pairwise cosine similarity between all descriptions. With the second
# argument omitted, scikit-learn compares the matrix against itself.
# The result is a dense n x n array, so memory grows quadratically with the
# number of books.
cosine_sim = cosine_similarity(tfidf_matrix)

In [76]:
# Map each book title to its row label in `df`.
# Series.drop_duplicates() compares *values* (here the already-unique row
# labels), so it never removed repeated titles and indices[title] could return
# a Series instead of a single integer. Deduplicate on the index (the titles)
# instead, keeping the first occurrence of each title.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [65]:
# Similarity of the first book (row 0) to every book, itself included.
cosine_sim[0]

array([1.        , 0.01350778, 0.01261944, ..., 0.00900731, 0.00431475,
       0.01189427])

In [69]:
# Pair each row position with its similarity to the first book (row 0).
sim_score = [(position, score) for position, score in enumerate(cosine_sim[0])]

In [71]:
# Rank the (position, similarity) pairs from most to least similar.
sim_score = sorted(
    sim_score,
    key=lambda pair: pair[1],
    reverse=True,
)

In [91]:
# First title in the lookup table (Series.keys() is an alias for .index).
indices.index[0]

'Animal Farm'

In [81]:
# Look up the row label stored for a specific title.
indices["Snowball's Chance"]

9989

In [19]:
from atlas import Atlas

In [20]:
# Train the project-local Atlas model on the preprocessed frame.
# NOTE(review): the value returned by train() is kept and later saved, so
# train() presumably returns the trained Atlas instance — confirm in atlas.py.
atlas = Atlas().train(df)

In [21]:
# Persist the trained model; the call prints where the model was saved.
# NOTE(review): save() is called without arguments, so the destination is
# presumably a default chosen inside Atlas — confirm in atlas.py.
atlas.save()

Model saved to book_similarity_model
