In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

**1a- Load prepared data from git**

In [14]:
import requests
response = requests.get("https://raw.githubusercontent.com/nguyenguyen/dataset/master/NLP%20Tutorials/TF-IDF_Doc.txt")
raw_data = response.text.split("\n")
raw_data

['Sống không giận, không hờn, không oán trách.',
 'Sống mỉm cười với thử thách chông gai.',
 'Sống vươn lên theo nhịp ánh ban mai.',
 'Sống an hòa với những người chung sống.',
 'Sống là động nhưng lòng luôn bất động.',
 'Sống là thương nhưng lòng chẳng vấn vương.',
 'Sống yên vui, danh lợi mãi coi thường.',
 'Tâm bất biến giữa dòng đời vạn biến.']

**1b - Mount data from drive**

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
FILE_PATH = "/content/TF-IDF_Doc.txt"
with open(FILE_PATH, "r", encoding="utf-8") as file:
  raw_data = file.readlines()
raw_data

['TP HCM cách ly người đến từ Điện Biên, Bắc Giang\n',
 'HCDC tối 15/5 thêm vào danh sách giám sát y tế người đến/ở từ 9 địa điểm tại Điện Biên và một điểm tại Bắc Giang, do liên quan bệnh nhân Covid-19.\n',
 '\n',
 'Theo Trung tâm Kiểm soát Bệnh tật TP HCM (HCDC), người đến thành phố từ Điện Biên phải cách ly tại nhà 21 ngày, lấy mẫu xét nghiệm nCoV.\n',
 '\n',
 'Các địa điểm bổ sung vào giám sát của Điện Biên, gồm:\n',
 '\n',
 '- Quán phở Châu Hùng, đối diện Tòa án nhân dân tỉnh, từ 7h đến 7h30 ngày 1/5.\n',
 '\n',
 '- Khu du lịch sinh thái Him Lam, TP Điện Biên Phủ, từ 9h đến13h ngày 1/5.\n',
 '\n',
 '- Phòng khám thai bác sĩ Nhung, phường Him Lam, TP Điện Biên Phủ, từ 16h đến 17h30 ngày 1/5.\n',
 '\n',
 '- Quán chè Thủy C, Chợ Trung tâm 1, TP Điện Biên Phủ, từ 17h đến 17h30 ngày 1/5.\n',
 '\n',
 '- Quán cà phê King Coffee, phường Him Lam, TP Điện Biên Phủ, từ 20h-21h ngày 1/5.\n',
 '\n',
 '- Siêu thị Hoa Ba, TP Điện Biên Phủ, từ 8h20-9h ngày 2/5 và 16h-17h ngày 2/5.\n',
 '\n',
 '- 

**2 - Split raw data to multiple documents**

In [17]:
list_docs = []
for i, line in enumerate(raw_data):
  doc_path = f"/content/TF-IDF_doc{i+1}.txt"
  with open(doc_path, "w+", encoding="utf-8") as file:
    file.write(line.replace("\n", ""))
    file.close()
  list_docs.append(doc_path)
list_docs

['/content/TF-IDF_doc1.txt',
 '/content/TF-IDF_doc2.txt',
 '/content/TF-IDF_doc3.txt',
 '/content/TF-IDF_doc4.txt',
 '/content/TF-IDF_doc5.txt',
 '/content/TF-IDF_doc6.txt',
 '/content/TF-IDF_doc7.txt',
 '/content/TF-IDF_doc8.txt',
 '/content/TF-IDF_doc9.txt',
 '/content/TF-IDF_doc10.txt',
 '/content/TF-IDF_doc11.txt',
 '/content/TF-IDF_doc12.txt',
 '/content/TF-IDF_doc13.txt',
 '/content/TF-IDF_doc14.txt',
 '/content/TF-IDF_doc15.txt',
 '/content/TF-IDF_doc16.txt',
 '/content/TF-IDF_doc17.txt',
 '/content/TF-IDF_doc18.txt',
 '/content/TF-IDF_doc19.txt',
 '/content/TF-IDF_doc20.txt',
 '/content/TF-IDF_doc21.txt',
 '/content/TF-IDF_doc22.txt',
 '/content/TF-IDF_doc23.txt',
 '/content/TF-IDF_doc24.txt',
 '/content/TF-IDF_doc25.txt',
 '/content/TF-IDF_doc26.txt',
 '/content/TF-IDF_doc27.txt',
 '/content/TF-IDF_doc28.txt',
 '/content/TF-IDF_doc29.txt',
 '/content/TF-IDF_doc30.txt',
 '/content/TF-IDF_doc31.txt',
 '/content/TF-IDF_doc32.txt']

**3 - Perform TF-IDF on all documents**

In [18]:
vectorizer = TfidfVectorizer(input="filename")
X = vectorizer.fit_transform(list_docs).toarray()
vocab = vectorizer.get_feature_names()

**4 - Format TF-IDF result as pandas dataframe**

In [19]:
columns = [x[-8:-4] for x in list_docs]
Tfidf_result = pd.DataFrame(X.transpose(), columns=columns, index=vocab)
Tfidf_result.head(48)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,oc10,oc11,oc12,oc13,oc14,oc15,oc16,oc17,oc18,oc19,oc20,oc21,oc22,oc23,oc24,oc25,oc26,oc27,oc28,oc29,oc30,oc31,oc32
080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127454,0.0,0.0,0.0,0.0
127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127454,0.0,0.0,0.0,0.0
15,0.0,0.201553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15h30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286872,0.0,0.0,0.0,0.271126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15h40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286872,0.0,0.250302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127454,0.0,0.0,0.0,0.0
16h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240727,0.0,0.0,0.0,0.0,0.0,0.258093,0.0,0.262584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306927,0.0,0.0,0.0,0.281966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17h30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262993,0.0,0.306927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**5 - Get** *n* **most important words of specified document**

In [20]:
def get_most_important_words(num_of_word, doc_name, tfidf_df):
  result = tfidf_df.nlargest(num_of_word, doc_name, keep='first')[doc_name]
  return result

In [21]:
get_most_important_words(3, ['doc4'], Tfidf_result)

Unnamed: 0,doc4
kiểm,0.229945
phải,0.229945
soát,0.229945
