Dans ce notebook, nous avons repris le programme du TF-IDF de notre encardrant pour ensuite créer notre liste de mots informatifs et une nouvelle matrice TF-IDF utilisant ces mots. Nous avons ensuite normalisé les vecteurs pour afficher dans un graphique les différents clusters.

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
data = pd.read_csv("./arxiv_articles.csv", sep="|")

In [3]:
data.loc[0:5, :]

Unnamed: 0,id,title,authors,arxiv_primary_category,summary,published,updated
0,http://arxiv.org/abs/2001.05867v1,$σ$-Lacunary actions of Polish groups,Jan Grebik,math.LO,We show that every essentially countable orbit...,2020-01-16T15:09:02Z,2020-01-16T15:09:02Z
1,http://arxiv.org/abs/1303.6933v1,Hans Grauert (1930-2011),Alan Huckleberry,math.HO,Hans Grauert died in September of 2011. This a...,2013-03-27T19:23:57Z,2013-03-27T19:23:57Z
2,http://arxiv.org/abs/1407.3775v1,A New Proof of Stirling's Formula,Thorsten Neuschel,math.HO,A new simple proof of Stirling's formula via t...,2014-07-10T11:26:39Z,2014-07-10T11:26:39Z
3,http://arxiv.org/abs/math/0307381v3,On Dequantization of Fedosov's Deformation Qua...,Alexander V. Karabegov,math.QA,To each natural deformation quantization on a ...,2003-07-30T06:20:33Z,2003-09-20T01:29:18Z
4,http://arxiv.org/abs/1604.06794v1,Cyclic extensions are radical,Mariano Suárez-Álvarez,math.HO,We show that finite Galois extensions with cyc...,2016-04-21T22:24:54Z,2016-04-21T22:24:54Z
5,http://arxiv.org/abs/1712.09576v2,The Second Main Theorem in the hyperbolic case,Min Ru;Nessim Sibony,math.CV,We develop Nevanlinna's theory for a class of ...,2017-12-27T13:17:08Z,2019-01-03T07:51:11Z


In [4]:
def naif_regex_tokenize(text):
    """
    This is a very naif way of tokenize a text. Just using the
    regular expression "[a-z]" that will match any single word
    in lowercase.
    Returns a list with all the tokens.
    """
    p = re.compile("[a-z]+")
    return p.findall(text.lower())

def compute_tf(d):
    """
    Compute the tf for a given document d.
    The formula used is 
    
        tf(t, d) = 0.5 + 0.5 * (count(t, d)/max(count(t',d) for t' in d))
    
    This prevents bias in longer documents.
    """
    terms = pd.Series(naif_regex_tokenize(d)) #tableau de tous les termes du doc
    term_counts = terms.value_counts()  #série contenant le 
    max_tc = max(term_counts)
    return 0.5 + 0.5 * (term_counts / max_tc)

def compute_idf(D):
    """
    The input D is a list of pandas.Series
    having as each element, the term frequency 
    computed by the function compute_tf.
    """
    N = len(D)
    all_terms = pd.concat(D)
    nt = all_terms.index.value_counts() # The number of documents containing the term "t"
    return np.log(N / nt)

def compute_tf_idf_document(tf_document, idf):
    """Compute the tf-idf for each term in a document of the corpus

    Keyword arguments:
    tf_document -- list with the frequency of each term inside the document
    idf -- the idf value for each term in the corpus
    """
    return tf_document * np.array([idf[i] for i in tf_document.index])
    
def compute_tf_idf_corpus(D):
    """Compute the tf-idf for each term in a corpus

    Keyword arguments:
    D -- pandas Series containing a collection of documents in text format
    
    returns
        list of pandas Series containing the tf-idf(t, d, D) for each term
        inside each document of the corpus D
    """
    term_freq = [compute_tf(d) for d in D]
    idf = compute_idf(term_freq)
    return [compute_tf_idf_document(d, idf) for d in term_freq]


In [5]:
tf_idf = compute_tf_idf_corpus(data.loc[:1000, "summary"])

In [6]:
tf_idf

[a                  0.271497
 of                 0.067201
 essentially        4.477371
 that               0.684150
 polish             5.438657
 is                 0.543284
 action             2.934767
 every              2.582264
 by                 0.958157
 we                 0.178491
 induced            3.317886
 relation           2.829945
 equivalence        2.737994
 math               2.863284
 countable          3.974488
 group              1.629417
 hyperfinite        4.317972
 archimedean        3.631339
 with               0.635694
 lacunary           4.317972
 obtain             1.996989
 from               1.198951
 non                1.556196
 show               1.072374
 an                 0.736659
 result             1.491854
 in                 0.290390
 on                 0.581168
 continuous         2.192223
 abelian            2.386070
 invent             4.317972
 orbit              3.312073
 proof              1.571441
 sigma              3.018321
 combination  

At this stage we have the tf-idf values at each document. In order to select the *most important* terms (i.e. the terms with higher tf-idf values), we compute the **mean** of the tf-idf for each term.

In [7]:
range(len(tf_idf[0]))
range(len(tf_idf[0].index))
#tf_idf[0].index[0]

range(0, 39)

In [8]:
all_terms = pd.concat(tf_idf)
print(all_terms)

a              0.271497
of             0.067201
essentially    4.477371
that           0.684150
polish         5.438657
                 ...   
preserves      2.923997
constant       2.208133
small          1.825819
parameter      2.208133
effects        3.028181
Length: 42629, dtype: float64


In [9]:
mean_tf_idf = all_terms.groupby(all_terms.index).mean()
print(mean_tf_idf)

a               0.208555
aaai            3.886175
aamas           3.886175
abbondandolo    4.030107
abelian         2.329956
                  ...   
zone            3.577748
zones           3.577748
zrapvrdl        3.947860
zurich          4.357607
zwiebach        3.838197
Length: 6921, dtype: float64


In [10]:
sorted_tf_idf = mean_tf_idf.sort_values(ascending=False)

In [11]:
sorted_tf_idf[:1000]

psl             6.908755
threefolds      6.908755
thick           6.908755
koras           6.908755
puzzles         6.908755
                  ...   
orderability    4.605837
atze            4.605837
beilinson       4.605837
monomial        4.605837
periodicity     4.605837
Length: 1000, dtype: float64

In [12]:
sorted_tf_idf[:100].index[0]

'psl'

In [15]:
## on crée une nouvelle liste de mots informatifs selon 2 critères
def appartient(texte, mot): # renvoie True si le mot en paramètre appartient à la liste
    liste = naif_regex_tokenize(texte)
    app = False
    for m in liste:
        if mot == m:
            app = True
            break
    return app
    
def new_termes(liste_mots, corpus):    ## liste des mots informatifs
    new_list = []
    for mot in liste_mots:
        n = 0
        for texte in corpus:
            if appartient(texte, mot):
                n += 1
        if n >= 10:
            new_list += [mot]
    return new_list
            
new_liste_mot = new_termes(sorted_tf_idf[:10000].index, data.loc[:10000, "summary"])    


In [16]:
## on calcule le tf_idf pour chaque document
def new_tfidf(new_liste_mots, tf_idf):
    tab = np.zeros([len(tf_idf), len(new_liste_mots)])
    for indice_texte, texte in enumerate(tf_idf):
        for mot in texte.index:
            if mot in new_liste_mots:
                tab[indice_texte, new_liste_mots.index(mot)] = texte[mot]
    return tab

tabl = new_tfidf(new_liste_mot, tf_idf)
tabl

array([[0.        , 0.        , 0.        , ..., 0.17849134, 0.08052054,
        0.06720106],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.05760091],
       [0.        , 0.        , 0.        , ..., 0.        , 0.12883287,
        0.05760091],
       ...,
       [0.        , 0.        , 0.        , ..., 0.14279308, 0.12883287,
        0.07680121],
       [0.        , 0.        , 0.        , ..., 0.16999176, 0.12883287,
        0.04937221],
       [0.        , 0.        , 0.        , ..., 0.15299258, 0.12883287,
        0.06034381]])

In [17]:
## on normalise les vecteurs
def normaliser_vecteur(vect):
    for indice, valeur in enumerate(vect):
        if valeur != 0:
            vect[indice] = valeur / np.linalg.norm(valeur)
            
def normaliser_tout(matrice):
    for liste in matrice:
        normaliser_vecteur(liste)

normaliser_tout(tabl)
        
tabl

array([[0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [18]:
## clusters k-means

import plotly.express as px

def distance(v1, v2):
    """
    Compute the distance between v1 and v2
    v1 and v2 are numpy arrays
    """
    return np.sqrt(np.sum((v1-v2)**2))

def assign(vectors, centers):
    """
    assign each vector to the closest center.
    vectors is a numpy matrix. We want to assign each
    row to the closest center.
    centers is a numpy matrix. Each row has a center
    
    returns a list of integers. 
    One value for each vector indicaing the closest center
    """
    groups = np.zeros(vectors.shape[0])
    for i in range(len(groups)):
        groups[i] = np.argmin(np.apply_along_axis(distance, 1, centers, vectors[i]))
    return groups

def compute_centers(vectors, groups):
    """
    Compute the centers for each group of 
    vectors
    vectors is a numpy matrix
    groups is a list containing the assignments
    of the vectors
    """
    new_centers = np.zeros([int(max(groups)) + 1, vectors.shape[1]])
    for i in range(int(max(groups)) + 1):
        ix = np.where(groups==i)[0]
        grp_members = vectors[ix, :]
        new_centers[i] = grp_members.mean(0)
    return new_centers

def choose_first_centers(vectors, k):
    """
    Select the first k centers for the begining of the
    k-means algorithm
    """
    ix = np.arange(0, vectors.shape[0])
    np.random.shuffle(ix)
    return vectors[ix[:k], :]

def kmeans(vectors, k, max_iterations = 500):
    """
    Naive implementation of k-means algorithm
    """
    centers_list = []
    centers = choose_first_centers(vectors, k)
    centers_list.append(centers)
    groups = assign(vectors, centers)
    new_centers = compute_centers(vectors, groups)
    centers_list.append(new_centers)
    nb_iter = 0
    while (np.sum(np.abs(centers - new_centers)) > 0) or (nb_iter > max_iterations):
        centers = np.copy(new_centers)
        groups = assign(vectors, centers)
        new_centers = compute_centers(vectors, groups)
        centers_list.append(new_centers)
        nb_iter += 1
    return new_centers, centers_list

In [19]:
vectors = tabl   #on utilise les vecteurs créés pour chaque texte
centers = np.random.random([7, 5])

In [22]:
centers, centers_list = kmeans(vectors, 5)

In [23]:
centers

array([[0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.96788991, 0.90366972,
        0.95412844],
       [0.0027027 , 0.0027027 , 0.0027027 , ..., 0.71351351, 0.78378378,
        0.85945946],
       [0.        , 0.        , 0.        , ..., 0.8254717 , 0.97169811,
        0.96698113],
       [0.        , 0.        , 0.        , ..., 0.69      , 0.93      ,
        0.975     ]])

In [24]:
df = pd.DataFrame({})
for i in range(len(centers_list)):
    v = {"x":vectors[:, 0], "y":vectors[:, 1], "p_type":["data_point"]*vectors.shape[0], 
    "iteration":[i]*vectors.shape[0]}
    df = pd.concat([df, pd.DataFrame(v)])
    c = {"x":centers_list[i][:, 0], "y":centers_list[i][:, 1], "p_type":["center"]*centers_list[0].shape[0],
    "iteration":[i]*centers_list[0].shape[0]}
    df = pd.concat([df, pd.DataFrame(c)])

In [28]:
px.scatter(df, x="x", y="y", animation_frame="iteration", color="p_type")