# LOG6308 : Tp2 - Approche contenu

- Clément Bernard (2096223)
- Ghaith Dekhili (1858454)

## Importations 

In [73]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import os
from sklearn.metrics.pairwise import cosine_similarity

In [95]:
DATA = os.path.join(os.getcwd(), 'data')
TABLE_PATH = os.path.join(DATA, 'citeseer.rtable')
ABSTRACT_PATH = os.path.join(DATA,'abstracts.csv')

## Data 


In [75]:
# Matrice d'adjacence
m = pd.read_table(TABLE_PATH, sep=' ')
# Number of articles
N = m.shape[0]

# Question 1

In [76]:
# TO DO

# Question 2

In [77]:
m

Unnamed: 0,100299,100967,10151,101705,101863,102458,102886,102966,10302,103700,...,96767,97060,97150,9721,97410,97863,98185,99113,9947,9993
100299,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10151,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- Tout d'abord, pour éviter les divisions par 0, on met la diagonale de la matrice à 1

In [78]:
np.fill_diagonal(m.values, 1)

- Ensuite, nous appliquons l'algorithne PageRank

In [79]:
def page_rank(m , d = 0.85, epsilon = 1e-5, max_iter = 10000) : 
    ''' Compute the PageRank score '''
    # Number of articles 
    N = m.shape[0]
    # PageRank values
    r = np.ones((1,N))
    # Initialise the condition 
    new_r = r + 10
    # Iteration 
    iteration = 0
    # Condition : if the mean of difference is greater than epsilon
    condition = abs(r - new_r).mean() > epsilon 
    
    while condition and iteration < max_iter     : 
        # Get the number of output links 
        s = m.sum(axis=0).values.reshape(-1,1)
        # Get the new PageRank vector
        new_r = (1-d)/N +  d * m @ (r.T / s)                
        # Convert into numpy 
        new_r = new_r.values.T
        # Update the condition 
        condition = abs(r - new_r).mean() > epsilon

        r = new_r
        
        iteration+=1
        
    return pd.DataFrame(r.reshape(-1,), index = m.index, columns=['PageRank'])

In [80]:
r = page_rank(m, d= 0.85, epsilon = 1e-5, max_iter = 1000)

In [81]:
r

Unnamed: 0,PageRank
100299,0.001470
100967,0.000447
10151,0.000674
101705,0.001167
101863,0.000289
...,...
97863,0.006034
98185,0.000279
99113,0.002592
9947,0.000675


- Recommendation : prendre les articles maximum de PageRank

In [82]:
def get_recommendation(index, r = r, m=m,N= 10 ): 
    ''' Return recommendations for the index article from PageRank score '''
    # Get the element wise product between the PageRank and adjacent matrix
    idx = m[m.index == index].values * r.values.reshape(1,-1)
    idx = (-idx[0]).argsort()[:N]
    recommend = m.index[idx]
    return list(recommend)

In [83]:
get_recommendation(422908)

[422908, 155792, 3170, 131548, 241538, 17094, 124, 466838, 147460, 149673]

- Variante : etendre le sous-ensemble S aux références de références

In [84]:
m_ = m.values @ m.values + m.values.T

In [85]:
m_[m_ >1] = 1

In [86]:
new_m = m.copy()

In [87]:
m_

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [88]:
new_m[:] = m_

In [89]:
new_m

Unnamed: 0,100299,100967,10151,101705,101863,102458,102886,102966,10302,103700,...,96767,97060,97150,9721,97410,97863,98185,99113,9947,9993
100299,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100967,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10151,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101705,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101863,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97863,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
98185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [90]:
new_r = page_rank(new_m, d= 0.85, epsilon = 1e-5, max_iter = 1000)

In [91]:
def get_recommendation_new(index, r = new_r, m=new_m,N= 10 ): 
    ''' Return recommendations for the index article from PageRank score '''
    # Get the element wise product between the PageRank and adjacent matrix
    idx = m[m.index == index].values * r.values.reshape(1,-1)
    idx = (-idx[0]).argsort()[:N]
    recommend = m.index[idx]
    return list(recommend)

In [92]:
get_recommendation_new(422908)

[311874, 19422, 422908, 83263, 464834, 297641, 366858, 177173, 17507, 226359]

# Question 3

- On considère pour cette question qu'un utilisateur est représenté par son vecteur dans la matrice d'adjacence
- On calcule ensuite la distance cosinus pour chaque utilisateur 
- La première recommendation sera forcément l'utilisateur lui-même 

In [300]:
def get_recommendation_sim(m = m , N=10) :
    ''' Return the top N recommendation using the cosine similarity '''
    # Get the similarity for each user 
    df_sim = pd.DataFrame(cosine_similarity(m,m), index = m.index, columns = m.columns)
    # Get the indexes that maximises the cosine similarity
    idx = (-df_sim.values).argsort()
    # Keep the N max
    idx = idx[:,:N]
    # Convert the indexes into article indexes
    article_idx = np.zeros(idx.shape)
    for i,index in enumerate(idx) : 
        for j,index_ in enumerate(index) : 
            article_idx[i,j] = m.index[index_]
    return pd.DataFrame(article_idx, index = m.index)

In [301]:
df_sim = get_recommendation_sim(N=10)

In [303]:
df_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
100299,100299.0,527057.0,170577.0,66991.0,25937.0,164141.0,93456.0,27921.0,120053.0,5101.0
100967,100967.0,281158.0,282066.0,517603.0,20858.0,549091.0,185918.0,275211.0,298007.0,272596.0
10151,10151.0,45228.0,93250.0,130506.0,189479.0,394586.0,3950.0,395364.0,40057.0,396568.0
101705,101705.0,74362.0,42779.0,114285.0,88318.0,402497.0,40057.0,40014.0,40009.0,399339.0
101863,101863.0,77030.0,134503.0,223844.0,224357.0,66776.0,145363.0,258875.0,135666.0,125089.0
...,...,...,...,...,...,...,...,...,...,...
97863,97863.0,289801.0,171813.0,295968.0,170200.0,503616.0,342910.0,520768.0,284612.0,8069.0
98185,98185.0,136975.0,124.0,83730.0,147460.0,395364.0,17507.0,1347.0,422908.0,373307.0
99113,99113.0,150689.0,226964.0,76418.0,236799.0,248936.0,3950.0,395364.0,396568.0,403583.0
9947,9947.0,31170.0,77621.0,139321.0,16117.0,97150.0,50568.0,79248.0,42422.0,340170.0


- 10 meilleures recommendations de 422908

In [304]:
df_sim[df_sim.index == 422908]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
422908,422908.0,497542.0,96767.0,147460.0,466838.0,149673.0,155792.0,70445.0,124.0,3170.0


# Question 4

In [97]:
abstract = pd.read_csv(ABSTRACT_PATH)

In [98]:
abstract

Unnamed: 0.1,Unnamed: 0,Id,Titre,Auteurs,Description
0,1,124,Hybrid Automata: An Algorithmic Approach to th...,Costas Courcoubetis; Pei-hsin Ho; Rajeev Alur;...,We introduce the framework of hybrid automata ...
1,2,496,Representing Action: Indeterminacy and Ramific...,Enrico Giunchiglia; G. Neelakantan Kartha; Vla...,We define and study a high-level language for ...
2,3,712,A Decision Procedure for a Temporal Belief Logic,Michael Fisher; Michael Wooldridge;,. This paper presents a temporal belief logic ...
3,4,1186,A Comparative Evaluation of Sequential Feature...,David W. Aha; Richard L. Bankert;,Several recent machine learning publications d...
4,5,1347,Extending Promela and Spin for Real Time,Costas Courcoubetis; Stavros Tripakis;,The efficient representation and manipulation ...
...,...,...,...,...,...
1085,1086,548670,Null,,
1086,1087,549091,Null,,
1087,1088,561317,Null,,
1088,1089,570764,Null,,


- On ne considère que les descriptions (peut être à modifier)

In [117]:
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/clementbernard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/clementbernard/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [138]:
def get_vocab_and_lemmatize(sentences = abstract['Description']) : 
    ''' Return the vocabulary of the sentences, and the lemmatisated sentences  '''
    vocab = set()
    new_sentences = []
    lemmatizer = WordNetLemmatizer() 

    for sentence in sentences : 
        n_sentence = []
        try : 
            for word in  word_tokenize(sentence) : 
                lem_word = lemmatizer.lemmatize(word)
                n_sentence.append(lem_word)
                vocab.add(lem_word)
        except : 
            pass
        
        new_sentences.append(n_sentence)
    return vocab, new_sentences

In [139]:
vocab, new_sentences = get_vocab_and_lemmatize()

In [216]:
def get_df(vocab = vocab, new_sentences = new_sentences) : 
    ''' Return the df : number of documents that contain the given word '''
    df_vocab = { x : 0 for x in list(vocab)}
    # Loop over the vocabulary
    for word in df_vocab : 
        # Loop over the sentences
        for sentence in new_sentences :
            # If the word is in the document 
            if word in sentence : 
                # Increment the counter
                df_vocab[word]+=1
                # Pass to the next document
                break
    return df_vocab

In [217]:
df_vocab = get_df()

In [237]:
def get_term_document_matrix(vocab = vocab, new_sentences = new_sentences, abstract = abstract, df_vocab = df_vocab) : 
    ''' Return the term document matrix with TF-IDF '''
    # Size of the vocabulary
    n_vocab = len(vocab)
    # Number of documents 
    n_document = abstract.shape[0]
    # Frequency matrix 
    tf_matrix = pd.DataFrame(np.zeros((n_vocab, n_document)), index = vocab, columns = abstract.index)
    # Df matrix 
    df_matrix = tf_matrix.copy()
    # Loop over the sentences 
    for i,sentence in enumerate(new_sentences) : 
        # Loop over the words 
        for word in sentence :
            # Get the index of the word in the matrix 
            i_word = (tf_matrix.index == word)
            # Compute the frequency 
            tf_matrix.iloc[i_word, i] += 1
            # Compute the inverse frequency 
            df_matrix.iloc[i_word,i] = np.log(n_document / df_vocab[word])
        # Normalise the frequency 
        tf_matrix.iloc[:,i]/=len(sentence)
    
    matrix = df_matrix * tf_matrix
    return matrix.fillna(0)
# return  df_matrix, tf_matrix

In [239]:
term_document = get_term_document_matrix()

In [260]:
def convert_columns(matrix, abstract = abstract) : 
    ''' Convert the column to the following id '''
    new_matrix = matrix.copy()
    new_matrix.columns = abstract['Id'].values
    return new_matrix

In [265]:
# Convert the columns name 
term_document = convert_columns(term_document)

In [264]:
term_document

Unnamed: 0,124,496,712,1186,1347,1948,2104,2111,2490,3156,...,543817,545708,547939,548310,548620,548670,549091,561317,570764,573595
courier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
shortage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
atre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
evolved,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
decline,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Veloso,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
twenty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hearing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
def get_recommendation_term_document(m = term_document , N=10) :
    ''' Return the top N recommendation using the cosine similarity '''
    # Get the similarity for each user 
    df_sim = pd.DataFrame(cosine_similarity(m.T,m.T), index = m.columns, columns = m.columns)
    # Get the indexes that maximises the cosine similarity
    idx = (-df_sim.values).argsort()
    # Keep the N max
    idx = idx[:,:N]
    # Convert the indexes into article indexes
    article_idx = np.zeros(idx.shape)
    for i,index in enumerate(idx) : 
        for j,index_ in enumerate(index) : 
            article_idx[i,j] = m.columns[index_]
    return pd.DataFrame(article_idx, index = m.columns)

In [308]:
df_sim_term_document = get_recommendation_term_document(m =term_document)

- 10 recommendations de l'article 422908

In [309]:
df_sim_term_document[df_sim_term_document.index == 422908]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
422908,422908.0,335678.0,96767.0,6497.0,53595.0,295968.0,322240.0,10563.0,296098.0,120172.0


# Question 5

In [28]:
# TO DO