This notebook is used to import the data and pre-process it.

# Imports

In [1]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_metadata
from tqdm import tqdm
import pandas as pd
import numpy as np
import json 
import pickle
import string
from langdetect import detect_langs

import matplotlib.pyplot as plt
from Book import Book
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# TODO: point this at the data folder that contains 'books_and_genres.csv'.
data_path = './data'

# Datasets

## Gutenberg module (too slow to load, not used)

### Test

In [3]:
# Smoke test: try to load one e-text to check that the Gutenberg module is
# correctly installed. Any failure only produces a warning.
try:
    load_etext(1)
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not silently swallowed.
    print("WARNING Review your installation of the Gutenberg module, you might need to change the mirror used. See README.")

### Loading books and their metadata

In [4]:
# Download the e-texts with ids 0..9, keyed by id. Ids that cannot be loaded
# are skipped, so `books` may end up with fewer than 10 entries.
books = {}
for i in tqdm(range(10)):
    try:
        book = strip_headers(load_etext(i)).strip()
    except Exception:
        # `Exception` rather than a bare `except:` so that interrupting the
        # download (KeyboardInterrupt) actually stops the loop.
        continue
    books[i] = {'text': book}
    # TODO: also fetch 'author', 'title' and 'subject' with get_metadata(..., i)
    # and store them as books[i]['author'], books[i]['title'] and
    # books[i]['genres'] (the genres list still needs cleaning).

100%|██████████| 10/10 [00:00<00:00, 105.28it/s]


In [5]:
# Report how many of the requested ids ended up in `books`.
print(f'{len(books)} books were downloaded')

9 books were downloaded


## 10 000 e-books

In [6]:
# Read the raw books/genres table from the configured data folder.
raw_data = pd.read_csv(filepath_or_buffer=data_path + '/books_and_genres.csv')

In [14]:
# Discard every row that has at least one missing value; `raw_data` itself is
# left untouched.
books_df = raw_data.dropna(axis=0, how='any')

In [15]:
#removing non english books
def detect_language(text):
    """Guess the language of ``text`` from its first 500 characters.

    Args:
        text: the document to inspect (a string).

    Returns:
        The ``lang`` attribute of the first candidate produced by
        ``detect_langs``, or ``"err"`` when detection raises or produces no
        candidate at all.
    """
    # Slicing already clamps to the string length, no need for min(500, len(text)).
    test_text = text[:500]
    try:
        # The first one returned is usually the one that has the highest probability
        first = next(iter(detect_langs(test_text)), None)
        if first is not None:
            return first.lang
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # still aborts a long `.apply(detect_language)` run.
        return "err"
    # No candidate: return the same sentinel as a failure instead of an implicit None.
    return "err"
# Detect the language of every book, then keep only those detected as English
# and renumber the rows from 0.
language = books_df['text'].apply(detect_language)
is_english = language == 'en'
books_df = books_df.loc[is_english].reset_index(drop=True)

In [16]:
# Keep only the columns used downstream. Selecting them by name instead of
# dropping the first column by position (`.iloc[:, 1:]`) makes this cell safe
# to re-run: a positional drop removes one more column on every execution.
books_df = books_df[['title', 'text', 'genres']].copy()

In [17]:
# Preview the first five cleaned rows.
books_df.head(n=5)

Unnamed: 0,title,text,genres
0,apocolocyntosis,"Produced by Ted Garvin, Ben Courtney and PG Di...","{'21st-century', 'history', 'roman', 'classics..."
1,the house on the borderland,"Produced by Suzanne Shell, Sjaani and PG Distr...","{'horror', 'mystery', 'classics', 'science-fic..."
2,the warriors,"Produced by Charles Aldarondo, Charlie Kirschn...","{'literary-fiction', 'history', 'biography', '..."
3,a voyage to the moon,"Produced by Christine De Ryck, Stig M. Valstad...","{'20th-century', 'science-fiction', 'speculati..."
4,la fiammetta,"Produced by Ted Garvin, Dave Morgan and PG Dis...","{'literary-fiction', 'history', 'feminism', 'c..."


In [19]:
# Build, in a single pass over the books:
#   genres_dict  : genre label -> integer id (assigned in order of first appearance)
#   genres_count : genre label -> number of occurrences
#   new_genres   : per book, the list of genre ids
# NOTE: the genre strings are split textually (outer braces removed, split on
# ', '), so each label keeps the quote characters that surround it in the string.
genres_dict = {}
genres_count = {}
new_genres = []
i = 0  # next free genre id
for genres_str in books_df.genres:
    genres_list = genres_str[1:-1].split(', ')
    book_ids = []
    for genre in genres_list:
        if genre in genres_dict:
            genres_count[genre] += 1
        else:
            genres_dict[genre] = i
            genres_count[genre] = 1
            i += 1
        book_ids.append(genres_dict[genre])
    new_genres.append(book_ids)

In [20]:
# Attach the per-book genre ids as a new column; the Series is aligned with
# books_df on the index.
books_df = books_df.assign(idx_genres=pd.Series(new_genres))

## Books to chapters

In [21]:
# Split every book into chapters and collect one small frame per book, with
# the chapter texts in 'chap_text' and the book title repeated in 'title'.
# NOTE(review): `Book.rebuild_chapters()` is presumed to return the chapter
# texts of the book - confirm against the Book module.
all_chapters = []
# Iterate over the two needed columns directly (no per-row Series as with
# iterrows) and give tqdm the total so the bar shows percentage and ETA.
book_rows = zip(books_df['title'], books_df['text'])
for title, text in tqdm(book_rows, total=len(books_df)):
    chap = Book(text).rebuild_chapters()
    chap_df = pd.DataFrame({'chap_text': chap})
    chap_df['title'] = title
    all_chapters.append(chap_df)

9500it [02:42, 58.34it/s] 


In [22]:
# Stack the per-book chapter frames row-wise. Each frame keeps its own index,
# so index labels repeat from one book to the next.
all_chapters_df = pd.concat(all_chapters, axis=0)

In [23]:
# Preview the first ten chapters.
all_chapters_df.head(n=10)

Unnamed: 0,chap_text,title
0,"your own if you will be fair.) Come tell me, b...",apocolocyntosis
0,THE CHURCH OF GOD IV. THE WORLD-MARCH: ...,the warriors
1,[CUTLER] _The Son of God goes forth t...,the warriors
2,[VOX DILECTI] _I heard the voice of J...,the warriors
3,[AURELIA] _The Church's one foundatio...,the warriors
4,[DIE WACHT AM RHEIN] _Jesus shall rei...,the warriors
5,"[LYONS] _O Majesty throned, O Lord of...",the warriors
6,[ADESTE FIDELES] _Our Father in Heave...,the warriors
7,"[AMSTERDAM] _Lo, my soul, look forth ...",the warriors
8,"[ARMAGEDON] Jesus, Thou hast bought u...",the warriors


# Pre-processing

In [24]:
### Preprocessing functions
def stemming(x):
    """Return a new list with the Porter stem of every token in ``x``."""
    stem = PorterStemmer().stem  # bound method of a fresh stemmer
    return list(map(stem, x))

def lemmatization(x):
    """Return a new list with the WordNet lemma of every token in ``x``."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in x:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def stop_words(x):
    """Return the tokens of ``x`` that are not English stop words.

    The comparison is done on the lower-cased token; kept tokens are returned
    with their original casing and order.
    """
    english = frozenset(stopwords.words('english'))
    kept = []
    for token in x:
        if token.lower() in english:
            continue
        kept.append(token)
    return kept

In [25]:
def preprocessing(X, rm_stop, stem, lem):
    """Tokenise and normalise a collection of raw documents.

    Each document is lower-cased, stripped of the characters in
    ``string.punctuation`` and split with ``word_tokenize``. The optional
    steps are then applied in this order: stop-word removal, stemming,
    lemmatization.

    Args:
        X: iterable of document strings.
        rm_stop: if truthy, filter the tokens through ``stop_words``.
        stem: if truthy, apply ``stemming`` to the tokens.
        lem: if truthy, apply ``lemmatization`` to the tokens.

    Returns:
        A list with one token list per document, in input order.
    """
    # Loop-invariant: build the punctuation-deleting table once instead of
    # once per document.
    strip_punct = str.maketrans("", "", string.punctuation)
    X_new = []
    for doc in tqdm(X):
        new_doc = word_tokenize(doc.lower().translate(strip_punct))
        if rm_stop:
            new_doc = stop_words(new_doc)
        if stem:
            new_doc = stemming(new_doc)
        if lem:
            new_doc = lemmatization(new_doc)
        X_new.append(new_doc)
    return X_new

## Genres database

In [26]:
mlb = MultiLabelBinarizer()
# `.loc` slices by label and includes the end label, so this keeps index
# labels 0 through 1000.
df = books_df.loc[:1000]
df.to_csv(data_path+'/books_clean.csv', index=False)
# Features: the raw book texts.
X = df['text'].tolist()
# Targets: one indicator column per genre id, sharing df's index.
genre_matrix = mlb.fit_transform(df['idx_genres'])
y = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)

In [27]:
# Book texts: stop-word removal and stemming, no lemmatization.
X_new = preprocessing(X, rm_stop=True, stem=True, lem=False)

  0%|          | 0/1001 [00:00<?, ?it/s]

100%|██████████| 1001/1001 [19:59<00:00,  1.20s/it]


## Chapters database

In [28]:
# Draw a random subset of chapters. The seed makes the subset (and therefore
# the exported files) reproducible across runs; the size is capped so the cell
# does not fail when fewer than 10000 chapters are available.
n_chapters = min(10000, len(all_chapters_df))
chapters_df = all_chapters_df.sample(n=n_chapters, random_state=42)
chapters_df.to_csv(data_path+'/chapters_clean.csv', index=False)

In [29]:
# Features: chapter texts. Labels: title of the book each chapter comes from.
X2 = chapters_df['chap_text'].tolist()
y2 = chapters_df['title'].tolist()

In [30]:
# Chapter texts: stop-word removal and stemming, no lemmatization.
X2_new = preprocessing(X2, rm_stop=True, stem=True, lem=False)

  0%|          | 2/10000 [00:00<11:35, 14.38it/s]

100%|██████████| 10000/10000 [09:10<00:00, 18.15it/s] 


# Export

In [31]:
# Write under `data_path` (as the CSV cells do) instead of a hard-coded
# "./data" so that changing the data folder in one place is enough.
with open(data_path + "/dict_genres.json", "w") as outfile:
    json.dump(genres_dict, outfile) #will be useful to remap genres names to the ids later

with open(data_path + "/genres_counts.json", "w") as outfile:
    json.dump(genres_count, outfile) #will be useful to analyze the data

In [34]:
# Exporting X and y (genre prediction).
# Written under `data_path` (as the CSV cells do) instead of a hard-coded "./data".
with open(data_path + '/X.pkl', 'wb') as f:
    pickle.dump(X_new, f)

with open(data_path + '/y.pkl', 'wb') as f:
    pickle.dump(y, f)

In [69]:
# Exporting X and y (chapter clustering).
# Written under `data_path` (as the CSV cells do) instead of a hard-coded "./data".
with open(data_path + '/X2.pkl', 'wb') as f:
    pickle.dump(X2_new, f)

with open(data_path + '/y2.pkl', 'wb') as f:
    pickle.dump(y2, f)