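This notebook covers the chapter clustering task: chapters are embedded with Doc2Vec, clustered with K-Means, and the clusters are then compared against the book titles.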

# Imports

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

In [2]:
# TODO: point this at the folder that holds your data files
data_path = './data'

In [3]:
# Load the pickled inputs (X) and their labels (y) from the data folder.
# NOTE(review): unpickling can run arbitrary code -- only load files you trust.
with open(data_path + '/X2.pkl', 'rb') as x_file:
    X = pickle.load(x_file)

with open(data_path + '/y2.pkl', 'rb') as y_file:
    y = pickle.load(y_file)

In [4]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Doc2Vec

In [5]:
# Wrap every training document in a TaggedDocument whose single tag is the
# document's position in X_train, written as a string.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(X_train)
]

# Configure Doc2Vec (vector_size=20, min_count=2, epochs=50) ...
model = Doc2Vec(vector_size=20, min_count=2, epochs=50)

# ... then build the vocabulary and train, reusing the corpus size and epoch
# count stored on the model itself.
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [6]:
# Infer one vector per document, for both splits, with the trained model.
# NOTE(review): gensim documents infer_vector as not strictly repeatable
# between calls -- confirm whether that matters for the numbers reported below.
X_train_vec = list(map(model.infer_vector, X_train))
X_test_vec = list(map(model.infer_vector, X_test))

# Model

In [8]:
# Fit K-Means on the training vectors. 4000 clusters is close to the number of
# distinct training titles (3860 in the summary statistics further down).
kmeans = KMeans(
    n_clusters=4000,
    random_state=0,
    n_init="auto",
).fit(X_train_vec)

In [9]:
# Cluster index assigned to each training vector by the fitted K-Means model.
clusters = kmeans.predict(X=X_train_vec)

In [16]:
# Cluster index assigned to each held-out test vector by the same fitted model.
clusters_t = kmeans.predict(X=X_test_vec)

# Results

### Training set

In [11]:
# For each title, add up the squared components of all of its training vectors.
# NOTE(review): the square root taken below turns this into the root of the
# summed squared vector norms per title; it grows with the number of chapters
# and is not a spread around a centroid -- confirm this is the intended "dist".
res = {}
for title, vec in zip(y_train, X_train_vec):
    res[title] = res.get(title, 0) + np.sum(vec**2)

# One row per title: the title itself plus the square root of its accumulated sum.
results_doc = pd.DataFrame.from_dict(res, orient='index').reset_index()
results_doc.columns = ['title', 'dist_doc2vec']
results_doc['dist_doc2vec'] = np.sqrt(results_doc['dist_doc2vec'])

In [13]:
# Per title: how many training chapters it has and how many distinct clusters
# those chapters were assigned to.
results = (
    pd.DataFrame({'cluster': clusters, 'title': y_train})
    .groupby('title')['cluster']
    .agg(nb_chapters='count', nb_clusters='nunique')
)

# Attach the per-title Doc2Vec norm measure, then show the ten titles with the
# most chapters.
results = results.merge(results_doc, on='title')
results.sort_values('nb_chapters', ascending=False).head(10)

Unnamed: 0,title,nb_chapters,nb_clusters,dist_doc2vec
3185,the poetical works of thomas hood,53,10,35.944016
3840,works of robert burns,51,9,46.237261
1840,os lusadas,40,2,19.397831
1082,historical novels,34,29,133.120058
2355,the aeneid of virgil,24,7,29.269658
3769,websters unabridged dictionary,21,13,130.40642
1939,plutarchs lives volume ii,18,12,53.183596
511,childrens literature,18,17,54.576898
1940,plutarchs lives volume iii,18,8,52.526703
530,ciceros tusculan disputations,16,11,52.030605


In [14]:
results['nb_clusters'].describe()

count    3860.000000
mean        1.803109
std         1.348784
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        29.000000
Name: nb_clusters, dtype: float64

In [15]:
results['dist_doc2vec'].describe()

count    3860.000000
mean       26.026915
std        11.466998
min         1.272924
25%        18.588702
50%        24.000248
75%        31.859452
max       133.120058
Name: dist_doc2vec, dtype: float64

### Test set

In [17]:
# For each title, add up the squared components of all of its test vectors.
# NOTE(review): the square root taken below turns this into the root of the
# summed squared vector norms per title; it grows with the number of chapters
# and is not a spread around a centroid -- confirm this is the intended "dist".
res_t = {}
for title, vec in zip(y_test, X_test_vec):
    res_t[title] = res_t.get(title, 0) + np.sum(vec**2)

# One row per title: the title itself plus the square root of its accumulated sum.
results_doc_t = pd.DataFrame.from_dict(res_t, orient='index').reset_index()
results_doc_t.columns = ['title', 'dist_doc2vec']
results_doc_t['dist_doc2vec'] = np.sqrt(results_doc_t['dist_doc2vec'])

In [18]:
# Per title: how many test chapters it has and how many distinct clusters
# those chapters were assigned to.
results2 = (
    pd.DataFrame({'cluster': clusters_t, 'title': y_test})
    .groupby('title')['cluster']
    .agg(nb_chapters='count', nb_clusters='nunique')
)

# Attach the per-title Doc2Vec norm measure, then show the ten titles with the
# most chapters.
results2 = results2.merge(results_doc_t, on='title')
results2.sort_values('nb_chapters', ascending=False).head(10)

Unnamed: 0,title,nb_chapters,nb_clusters,dist_doc2vec
1244,the poetical works of thomas hood,14,4,14.051041
1503,works of robert burns,14,8,22.735657
1479,websters unabridged dictionary,10,5,61.323238
729,os lusadas,10,1,8.581459
923,the aeneid of virgil,9,5,20.930494
418,historical novels,8,7,44.272198
774,plutarchs lives volume iii,7,4,26.33553
935,the antiquities of the jews,7,7,53.150846
201,childe harolds pilgrimage,7,4,14.120943
370,good sense,6,4,22.149982


In [19]:
results2['nb_clusters'].describe()

count    1509.000000
mean        1.242545
std         0.614388
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: nb_clusters, dtype: float64

In [20]:
results2['dist_doc2vec'].describe()

count    1509.000000
mean       15.795216
std         6.466926
min         1.173207
25%        11.875908
50%        14.767870
75%        18.512836
max        61.323238
Name: dist_doc2vec, dtype: float64