<a href="https://colab.research.google.com/github/faezesarlakifar/SBU-NLPLab-Internship/blob/main/Find_Similar_Articles_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans 

# Read the articles 

In [None]:
# Load the ACL article dump and clean it in a single pass:
#   1. drop rows that have fewer than 3 non-null fields,
#   2. drop rows with no article body,
#   3. renumber the surviving rows 0..n-1.
# NOTE(review): the directory name "atricles" looks like a typo of "articles";
# confirm against the repository layout before renaming anything.
X = (
    pd.read_csv(r"../atricles/all-acl-articles-with-content.csv")
    .dropna(thresh=3)
    .dropna(subset=['Content'])
    .reset_index(drop=True)
)

# Preview data

In [None]:
# Display up to the first 2000 rows; the rendered table below is abbreviated
# with a "..." row, so only the beginning and end of that range are visible.
X.head(2000) 

Unnamed: 0,Title,URL,PDF_URL,Authors,Citation,Year of Publish,Content
0,17th Annual Meeting of the Association for Com...,https://aclanthology.org/P79-1000/,https://aclanthology.org/P79-1000.pdf,,0.0,1979.0,b' Front Mat...
1,Syntactic Processing,https://aclanthology.org/P79-1001/,https://aclanthology.org/P79-1001.pdf,,73.0,1979.0,b' Synt...
2,Towards a Self-Extending Parser,https://aclanthology.org/P79-1002/,https://aclanthology.org/P79-1002.pdf,,72.0,1979.0,b' TOWA...
3,Word Expert Parsing,https://aclanthology.org/P79-1003/,https://aclanthology.org/P79-1003.pdf,,13.0,1979.0,b' WORD...
4,Toward a Computational Theory of Speech Percep...,https://aclanthology.org/P79-1005/,https://aclanthology.org/P79-1005.pdf,,0.0,1979.0,b' TOWA...
...,...,...,...,...,...,...,...
1995,Unsupervised Language Model Adaptation Incorpo...,https://aclanthology.org/P07-1085/,https://aclanthology.org/P07-1085.pdf,Yang Liu,0.0,2007.0,b' ...
1996,Coordinate Noun Phrase Disambiguation in a Gen...,https://aclanthology.org/P07-1086/,https://aclanthology.org/P07-1086.pdf,,0.0,2007.0,b' ...
1997,A Unified Tagging Approach to Text Normalization,https://aclanthology.org/P07-1087/,https://aclanthology.org/P07-1087.pdf,"Jie Tang, Hang Li, Hwee Tou Ng, Tiejun Zhao",0.0,2007.0,b' ...
1998,Forest-to-String Statistical Translation Rules,https://aclanthology.org/P07-1089/,https://aclanthology.org/P07-1089.pdf,"Yun Huang, Qun Liu, Shouxun Lin",0.0,2007.0,b' ...


# Data information

In [None]:
# Summarise the cleaned frame: row count, per-column non-null counts and dtypes.
X.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8747 entries, 0 to 8746
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            8747 non-null   object 
 1   URL              8747 non-null   object 
 2   PDF_URL          8747 non-null   object 
 3   Authors          7597 non-null   object 
 4   Citation         8747 non-null   float64
 5   Year of Publish  8747 non-null   float64
 6   Content          8747 non-null   object 
dtypes: float64(2), object(5)
memory usage: 478.5+ KB


# TF-IDF calculation

In [None]:
# Configure the vectorizer, then fit it on the article bodies to obtain the
# document-term TF-IDF matrix used by the search helpers.
vector = TfidfVectorizer(
    stop_words='english',  # remove scikit-learn's built-in English stop words
    lowercase=True,        # case-fold the text before tokenising
    max_df=0.2,            # ignore terms found in more than 20% of the documents
    # min_df=8 is left disabled, so min_df stays at its library default
    use_idf=True,          # weight term frequencies by inverse document frequency
    smooth_idf=True,       # smooth document frequencies when computing idf
    norm=u'l2',            # scale every row to unit L2 length
)
text_content = X['Content']
tfidf = vector.fit_transform(text_content)

In [None]:
def search(tfidf_matrix, model, request, top_n=5):
    """Rank the indexed documents against a free-text query.

    Parameters
    ----------
    tfidf_matrix : sparse or dense 2-D matrix
        One row per indexed document, in the feature space of ``model``.
    model : object with a ``transform(list_of_str)`` method
        The fitted vectorizer that produced ``tfidf_matrix``.
    request : str
        The query text.
    top_n : int, default 5
        Maximum number of results to return.

    Returns
    -------
    numpy.ndarray
        Row indices into ``tfidf_matrix``, ordered from most to least similar.
        Holds ``min(top_n, n_documents)`` entries, or none when ``top_n <= 0``.
    """
    # Guard explicitly: a naive ``[-top_n:]`` slice with top_n == 0 would
    # return every document instead of none.
    if top_n <= 0:
        return np.array([], dtype=np.intp)

    request_transform = model.transform([request])

    # Dot product of the query row with every document row -> shape (1, n_docs).
    similarity = request_transform @ tfidf_matrix.T
    if hasattr(similarity, "toarray"):
        # Sparse result: densify before sorting.
        similarity = similarity.toarray()
    scores = np.asarray(similarity).ravel()

    # argsort is ascending; reverse it and keep the requested number of hits.
    return np.argsort(scores)[::-1][:top_n]

def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the rows of ``tfidf_matrix`` most similar to row ``index``.

    Similarity is the linear kernel (dot product) between the chosen row and
    every row of the matrix. The chosen row itself is left out of the result.

    Parameters
    ----------
    tfidf_matrix : 2-D matrix with one row per document.
    index : int
        Row number of the reference document.
    top_n : int, default 5
        Number of neighbours to keep.

    Returns
    -------
    list
        Row indices ordered from most to least similar, truncated to ``top_n``.
    """
    # Slice rather than index so the reference document stays two-dimensional.
    reference_row = tfidf_matrix[index:index+1]
    scores = linear_kernel(reference_row, tfidf_matrix).flatten()

    # Descending similarity order, skipping the reference document itself.
    ranked = scores.argsort()[::-1]
    neighbours = [doc for doc in ranked if doc != index]
    return neighbours[0:top_n]
 
def print_result(request_content,indices,X):
    """Print a query header followed by one line per matching article.

    Parameters
    ----------
    request_content : str
        Text echoed after ``search :`` in the header.
    indices : iterable of int
        Labels looked up in ``X['Title']`` via ``.loc``, printed in order.
    X : pandas.DataFrame
        Frame that provides the ``Title`` column.
    """
    print('\nsearch : ' + request_content)
    print('\nBest Results :')

    row_template = 'id = {0:5d} - title = {1}'
    for doc_id in indices:
        title = X['Title'].loc[doc_id]
        print(row_template.format(doc_id, title))

In [None]:
# Free-text query: vectorise the phrase with the fitted TF-IDF model and list
# the titles of the best-matching articles.
request = 'Ordering Phrases with Function Words'

result = search(tfidf,vector, request, top_n = 5)
print_result(request,result,X) 


search : Ordering Phrases with Function Words

Best Results :
id =  3619 - title = Dependency-based Pre-ordering for Chinese-English Machine Translation
id =  2926 - title = Sentence Ordering Driven by Local and Global Coherence for Summary Generation
id =  1680 - title = A Bottom-Up Approach to Sentence Ordering for Multi-Document Summarization
id =  2851 - title = Semi-Supervised Modeling for Prenominal Modifier Ordering
id =  3316 - title = Learning to Order Natural Language Texts


In [None]:
# Inspect the stored body of article 1308. The printed value starts with b'
# and shows byte escapes such as \xcc\x81 as literal text.
print(X['Content'][1308]) 

b'                               A Noisy-Channel Model for Document Compression  Hal Daume\xcc\x81 III and Daniel Marcu Information Sciences Institute  University of Southern California 4676 Admiralty Way  Suite 1001  Marina del Rey  CA 90292\xef\xbf\xbd hdaume marcu \xef\xbf\xbd @isi.edu  Abstract  We present a document compression sys- tem that uses a hierarchical noisy-channel model of text production. Our compres- sion system first automatically derives the syntactic structure of each sentence and the overall discourse structure of the text given as input. The system then uses a sta- tistical hierarchical model of text produc- tion in order to drop non-important syn- tactic and discourse constituents so as to generate coherent  grammatical document compressions of arbitrary length. The sys- tem outperforms both a baseline and a sentence-based compression system that operates by simplifying sequentially all sentences in a text. Our results support the claim that discourse knowledge 

In [None]:
# Show the title and publication year of the same article (row 1308).
print(X['Title'][1308]) 
print(X['Year of Publish'][1308]) 

A Noisy-Channel Model for Document Compression
2002.0


In [None]:
# Document-to-document lookup: rank the rest of the corpus against one
# reference article, identified by its row number.
index = 1308
result = find_similar(tfidf, index, top_n = 5)

# Build the header from `index` instead of a hand-typed copy of the id and
# title, so the label always matches the article that was actually queried.
print_result('{0} - title = {1}'.format(index, X['Title'][index]), result, X)


search : 1308 - title = A Noisy-Channel Model for Document Compression

Best Results :
id =  2968 - title = Text-level Discourse Parsing with Rich Linguistic Features
id =  3090 - title = A Two-step Approach to Sentence Compression of Spoken Utterances
id =  2205 - title = Mining Wikipedia Revision Histories for Improving Sentence Compression
id =  4004 - title = Learning Representations for Text-level Discourse Parsing
id =  4404 - title = Joint Modeling of Content and Discourse Relations in Dialogues
