### Crawl data to create a corpus

In [1]:
import requests
from bs4 import BeautifulSoup

# Crawl the "most popular" articles from bola.kompas.com and store each
# article's text as one string in `documents`.

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would hang the notebook indefinitely.
REQUEST_TIMEOUT = 10

# Make a request to the website and fail loudly on an HTTP error rather than
# parsing an error page.
r = requests.get('https://bola.kompas.com/', timeout=REQUEST_TIMEOUT)
r.raise_for_status()

# Create an object to parse the HTML format
soup = BeautifulSoup(r.content, 'html.parser')

# Retrieve all popular news links (Fig. 1)
most_popular = soup.find('div', {'class': 'most__wrap'})
if most_popular is None:
    # find() returns None when nothing matches; report that explicitly
    # instead of failing later with an AttributeError on None.
    raise RuntimeError("Could not find the 'most__wrap' section on the front page")

# '?page=all' is appended to every link; anchors without an href are ignored.
# The URL is built as a new string so the parsed tag is left untouched.
link = [anchor['href'] + '?page=all'
        for anchor in most_popular.find_all('a', href=True)]

# For each link, we retrieve paragraphs from it, combine each paragraph as one
# string, and save it to documents (Fig. 2)
documents = []
for url in link:
    # Make a request to the link; a single unreachable article should not
    # abort the whole crawl, so it is reported and skipped.
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as err:
        print('Skipping (request failed):', url, '-', err)
        continue

    # Initialize BeautifulSoup object to parse the content
    soup = BeautifulSoup(r.content, 'html.parser')

    article_body = soup.find('div', {'class': 'read__content'})
    if article_body is None:
        print('Skipping (no article body found):', url)
        continue

    # Retrieve all paragraphs and combine them into one string
    paragraphs = [paragraph.text for paragraph in article_body.find_all('p')]
    documents.append(' '.join(paragraphs))

In [2]:
# Sanity check: the crawl cell builds `documents` as a plain Python list.
type(documents)

list

In [3]:
# Number of article strings collected by the crawl cell.
len(documents)

10

In [4]:
# Display the raw crawled articles (one joined string per article).
# NOTE(review): `vulnerability['description']` is not defined anywhere in the
# visible notebook - presumably a text column from another project that plays
# the same role as this list; confirm or remove the remark.
documents  # is equivalent to vulnerability['description']

['KOMPAS.com -\xa0Jutaan pendukung Barcelona patah hati melihat Lionel Messi meninggalkan Camp Nou, tak terkecuali Andres Iniesta. Andres Iniesta yang juga merupakan legenda Barcelona itu mengaku tidak bisa membayangkan melihat Messi bermain untuk klub selain Barcelona. “Akan menyakitkan melihatnya mengenakan seragam tim lain. Leo melambangkan Barcelona," ucap Iniesta kepada AFP, sebagaimana dikutip dari Goal, Rabu (11/8/2021). "Dia adalah segalanya, dia pemain yang \'melampaui\' klub. Saya belum pernah melihat pemain seperti dia dan saya pikir tidak akan pernah melihatnya (lagi)," imbuhnya. Andres Iniesta mengaku tidak tahu ada apa di balik kepergian Messi ke Paris Saint-Germain (PSG), tetapi dia berharap Barcelona tetap "hidup" setelah ditinggal sang kapten. Baca juga: RESMI - Lionel Messi Bergabung dengan PSG "Saya tidak tahu apa yang terjadi secara internal, atau bagaimana keadaannya, tetapi klub perlu pulih dari transfer ini," kata Iniesta. Dapatkan informasi, inspirasi dan insigh

In [5]:
import re
import string
def _normalize_text(raw_text):
    """Return a lowercase, ASCII-only copy of ``raw_text`` without mentions,
    punctuation or digits, with whitespace runs collapsed to one space."""
    # Replace each run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', raw_text)
    # Drop @mentions entirely
    text = re.sub(r'@\w+', '', text)
    # Lowercase everything
    text = text.lower()
    # Replace every punctuation character with a space
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    # Delete digits (nothing is put in their place)
    text = re.sub(r'[0-9]', '', text)
    # Collapse runs of two or more whitespace characters into one space
    return re.sub(r'\s{2,}', ' ', text)


# Cleaned counterpart of `documents`, in the same order
documents_clean = [_normalize_text(raw_document) for raw_document in documents]

In [6]:
# Sanity check: the cleaning cell builds `documents_clean` as a plain Python list.
type(documents_clean)

list

In [7]:
# One cleaned string is appended per entry of `documents`, so the two lengths should match.
len(documents_clean)

10

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Instantiate a TfidfVectorizer object that uses single words (unigrams) as terms
vectorizer = TfidfVectorizer(ngram_range = (1,1))

# Learn the vocabulary from the cleaned documents and encode them as a
# sparse documents-by-terms matrix
X = vectorizer.fit_transform(documents_clean)

# Transpose to terms-by-documents and convert to a dense array
X = X.T.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and only fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# Create a DataFrame with the vocabulary as the index: one row per term,
# one column per document (columns are labelled 0..n-1 by default)
df = pd.DataFrame(X, index = vocabulary)

In [12]:
# Display the TF-IDF matrix: rows are vocabulary terms, columns are documents.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
abel,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039382
ac,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.072913,0.000000,0.000000,0.000000
achraf,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.145825,0.000000,0.000000,0.000000
ada,0.029109,0.000000,0.000000,0.026858,0.022780,0.018889,0.000000,0.029745,0.000000,0.000000
adalah,0.026315,0.020892,0.000000,0.000000,0.000000,0.068306,0.039141,0.000000,0.029316,0.042282
...,...,...,...,...,...,...,...,...,...,...
yakni,0.000000,0.033083,0.000000,0.000000,0.000000,0.027041,0.000000,0.000000,0.000000,0.000000
yang,0.108740,0.043165,0.039809,0.066889,0.042550,0.070565,0.121308,0.092599,0.090857,0.058241
year,0.000000,0.000000,0.091533,0.000000,0.032612,0.000000,0.000000,0.000000,0.000000,0.000000
yordenis,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.157526


In [13]:
import numpy as np

def get_similar_articles(q, df):
    """Print the documents that match a query, ordered by cosine similarity.

    The query is encoded with the module-level ``vectorizer`` and compared
    against every column of ``df``. Matching documents are looked up in the
    module-level ``documents_clean`` using the column label as the index.

    Parameters
    ----------
    q : str
        Free-text query.
    df : pandas.DataFrame
        Terms-by-documents matrix: one row per vocabulary term of
        ``vectorizer`` and one column per document.

    Returns
    -------
    None
        Results are printed; documents with a similarity of exactly 0 are
        omitted.
    """
    print("query:", q)
    print("The top documents with respectto highest cosine matching : ")

    # Convert the query into a 1-D vector with one entry per vocabulary term
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    # Cosine similarity = dot(d, q) / (|d| * |q|). The product of the norms is
    # computed first so that the dot product is divided by both of them.
    sim = {}
    for col in df.columns:
        doc_vec = df.loc[:, col].values
        denominator = np.linalg.norm(doc_vec) * q_norm
        # A zero norm (e.g. a query with no known term) would yield NaN, and
        # NaN would slip through the `!= 0.0` filter below; score it as 0.
        sim[col] = np.dot(doc_vec, q_vec) / denominator if denominator > 0 else 0.0

    # Sort the values, best match first
    sim_sorted = sorted(sim.items(), key = lambda x: x[1], reverse=True)

    # Print the articles and their similarity values
    for k, v in sim_sorted:
        if v != 0.0:
            print("Cosine Similarity:", v)
            print(documents_clean[k])
            print()

In [14]:
# Query terms, written in lowercase without punctuation like the cleaned documents
q1 = 'barcelona buruk rezim josep maria bartomeu'

# Print the documents ranked by similarity to the query
get_similar_articles(q1, df)

query: barcelona buruk rezim josep maria bartomeu
The top documents with respectto highest cosine matching : 
Cosine Similarity: 0.16528221709362265
 kompas com tidak butuh waktu lama untuk lionel messi mendapatkan klub anyar seusai berpisah dari barcelona hanya berselang dua hari setelah menangis dan berpamitan dengan barcelona lionel messi resmi diperkenalkan sebagai pemain anyar paris saint germain psg striker megabintang asal argentina itu resmi menandatangani kontrak berdurasi dua tahun di psg pada selasa dua hari sebelumnya tepatnya pada minggu messi baru saja menggelar konferensi pers untuk mengucapkan salam perpisahannya dengan barca serta fans setianya baca juga lionel messi resmi gabung psg jersey sang bintang lansung ludes dalam menit meski begitu drama perpisahan lionel messi dengan barcelona sebenarnya sudah berlangsung sejak lama dapatkan informasi inspirasi dan insight di email kamu daftarkan email pemain yang menghabiskan hampir seluruh kariernya bersama barcelona itu s