# Dataset Extraction
Extract the document files into a dataset.

In [None]:
import os
import json
# Directory that holds the raw document files.
DOCS_DIR = '/content/drive/MyDrive/Classroom/Sistem Temu Balik Informasi IF-910 (AB) Ganjil 2023 2024/Tubes/combined'
# Names of every entry in DOCS_DIR. Positions in this list are used later to
# map a result index back to a file name.
# NOTE(review): os.listdir returns entries in arbitrary order - confirm the
# order is the same whenever saved JSON artifacts are reused.
DOCS_FILE_NAME = os.listdir(DOCS_DIR)


In [None]:
def dump_json(filename, data):
    """Serialize ``data`` as JSON into the file at ``filename``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, "w") as handle:
        json.dump(data, fp=handle)

def load_json(filename):
    """Read the JSON file at ``filename`` and return the decoded object."""
    with open(filename, "r") as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
# save the docs list into json
# Persist the document file names (in their current list order) to docs_list.json.
dump_json("docs_list.json", DOCS_FILE_NAME)

In [None]:
import nltk
# The 'punkt' resource is downloaded before nltk.word_tokenize is used below.
nltk.download('punkt')

# storing documents into dataset list
dataset = []

for doc in DOCS_FILE_NAME:
  file_path = os.path.join(DOCS_DIR, doc)
  # Open the file exactly once, inside a context manager, so the handle is
  # always closed. Bytes that are not valid UTF-8 are dropped (errors="ignore").
  with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
    # Collapse the document to a single line before tokenizing.
    text = file.read().replace('\n', ' ').strip()

  # Each dataset entry is the list of tokens of one document, stored in the
  # same order as DOCS_FILE_NAME.
  dataset.append(nltk.word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Persist the tokenized (not yet preprocessed) documents to raw_dataset.json.
dump_json("raw_dataset.json", dataset)

# Preprocessing

## Convert to lowercase

In [None]:
import numpy as np

In [None]:
def convert_lower_case(doc):
    """Lower-case every token of ``doc``.

    The tokens are first packed into a NumPy string array, so the result is
    a NumPy array of lower-cased strings rather than a plain list.
    """
    tokens = np.array(doc, dtype=np.str_)
    return np.char.lower(tokens)

## Stop words

In [None]:
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus read on the line below.
nltk.download('stopwords')

# English stop words held in a set; stop_word_filtering tests membership against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

def stop_word_filtering(doc):
    """Return the tokens of ``doc`` that are not in ``stop_words``.

    Matching is exact, and token order is preserved.
    """
    kept = []
    for token in doc:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

## Punctuation Remover

In [None]:
import string

# Translation table that deletes every character in string.punctuation when
# passed to str.translate.
translation_table = str.maketrans("", "", string.punctuation)

In [None]:
def remove_punctuation(doc):
    """Strip punctuation characters from every token of ``doc``.

    Tokens that become empty after stripping (i.e. consisted solely of
    punctuation) are dropped from the result.
    """
    stripped_words = []
    for word in doc:
        stripped = word.translate(translation_table)
        if stripped != '':
            stripped_words.append(stripped)
    return stripped_words

## Binary Word Remover

In [None]:
def remove_non_printable_from_words(doc):
    """Drop every character not found in ``string.printable`` from each token.

    The result has one entry per input token; a token made up only of
    non-printable characters becomes an empty string and is kept.
    """
    cleaned = []
    for word in doc:
        kept_chars = [ch for ch in word if ch in string.printable]
        cleaned.append(''.join(kept_chars))
    return cleaned

## Apostrophe Remover

In [None]:
def remove_apostrophe(doc):
    """Return the tokens of ``doc`` with every ``'`` character removed.

    Tokens that end up empty are kept as empty strings.
    """
    cleaned = []
    for token in doc:
        cleaned.append(token.replace("'", ""))
    return cleaned

## Single Char Remover

In [None]:
def remove_single_char(doc):
    """Keep only the tokens of ``doc`` that are at least two characters long.

    Empty strings are removed together with single-character tokens.
    """
    return list(filter(lambda token: len(token) > 1, doc))

## Stemming

In [None]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance, reused by word_stemming for every token.
porter_stemmer = PorterStemmer()

In [None]:
def word_stemming(doc):
    """Return the stem of every token in ``doc``, using ``porter_stemmer``."""
    return list(map(porter_stemmer.stem, doc))

## Lemmatisation

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# NLTK resources fetched for the lemmatisation step: the tagger model is
# presumably what nltk.pos_tag uses, and the WordNet corpus backs the lemmatizer.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Single shared lemmatizer instance, reused by word_lemmatization.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def get_wordnet_pos(word):
    """Map the part-of-speech tag of ``word`` to a WordNet POS constant.

    The word is tagged on its own (without sentence context) and only the
    first letter of the tag is considered: ``V`` -> verb, ``R`` -> adverb,
    ``J`` -> adjective. ``N`` and every other letter map to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    if initial == "J":
        return wordnet.ADJ
    # "N" and any unrecognised tag fall back to noun.
    return wordnet.NOUN

def word_lemmatization(doc):
    """Lemmatize every token of ``doc``.

    Each token is lemmatized with the WordNet part of speech returned by
    ``get_wordnet_pos`` for that token.
    """
    lemmas = []
    for token in doc:
        wordnet_pos = get_wordnet_pos(token)
        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas

## Converting Numbers to Words

In [None]:
!pip install num2words
from num2words import num2words

Collecting num2words
  Downloading num2words-0.5.13-py3-none-any.whl (143 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m122.9/143.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.3/143.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=020177af2a43973efb43ce60f5d16f2fb7ac8b661fec6230bae01e19a87cd55a
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installi

In [None]:
def is_string_number(s):
    """Return True when ``s`` can be parsed by ``int()``, otherwise False.

    Only integer literals qualify, so decimal strings such as ``"3.14"``
    yield False. Values ``int()`` rejects by type (for example ``None``)
    also yield False instead of raising.
    """
    try:
        # Only whether parsing succeeds matters; the parsed value is not needed.
        int(s)
    except (TypeError, ValueError):
        return False
    return True

def number_to_word(num):
    """Spell out ``num`` in words and return the lower-cased tokens.

    ``num2words`` produces the wording, ``nltk.word_tokenize`` splits it
    into tokens, and ``convert_lower_case`` lower-cases them.
    """
    return convert_lower_case(nltk.word_tokenize(num2words(num)))


def convert_number(doc):
    """Replace integer tokens of ``doc`` with their spelled-out word tokens.

    A token accepted by ``is_string_number`` is expanded in place into the
    tokens returned by ``number_to_word``; every other token is copied
    through unchanged.
    """
    expanded = []
    for token in doc:
        replacement = number_to_word(token) if is_string_number(token) else [token]
        expanded.extend(replacement)
    return expanded

## Complete Preprocessing

In [None]:
def preprocess(doc):
    """Run the full preprocessing pipeline over one tokenized document.

    The steps run in the listed order, each one receiving the output of
    the previous step, and the final token sequence is returned.
    """
    pipeline = (
        convert_number,
        convert_lower_case,
        stop_word_filtering,
        remove_non_printable_from_words,
        remove_punctuation,
        remove_apostrophe,
        remove_single_char,
        word_lemmatization,
        word_stemming,
    )
    for step in pipeline:
        doc = step(doc)

    return doc


# Preprocessing the dataset

## Load dataset

In [None]:
# Reload the tokenized documents from raw_dataset.json.
dataset = load_json("raw_dataset.json")

## Preprocess the dataset

In [None]:
# Preprocessed token list of every document, in the same order as `dataset`.
preprocessed_docs = []

for doc in dataset:
  # Results are appended one document at a time, so partial output is
  # available if the loop is interrupted.
  preprocessed_docs.append(preprocess(doc))

In [None]:
# Persist the preprocessed documents to docs_preprocessed.json.
dump_json("docs_preprocessed.json", preprocessed_docs)

# TF-IDF

## Calculate Term Frequencies (TF)

In [None]:
def calculate_tf(docs):
    """Count raw term occurrences for every document.

    Returns a list with one dict per document, mapping each term to the
    number of times it appears in that document (counts are not normalized).
    """
    def count_terms(tokens):
        counts = {}
        for token in tokens:
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
        return counts

    return [count_terms(tokens) for tokens in docs]

## Calculate Inverse Document Frequency (IDF)

In [None]:
import math

In [None]:
def calculate_idf(docs):
    """Compute the inverse document frequency of every term in ``docs``.

    For each term, ``idf = log10(N / df)`` where ``N`` is the number of
    documents and ``df`` is the number of documents containing the term.
    A term present in every document therefore gets 0.0.
    """
    document_count = len(docs)

    # Document frequency: each document contributes at most once per term.
    document_frequency = {}
    for tokens in docs:
        for term in set(tokens):
            if term in document_frequency:
                document_frequency[term] += 1
            else:
                document_frequency[term] = 1

    return {
        term: math.log10(document_count / frequency)
        for term, frequency in document_frequency.items()
    }


## Calculate TF-IDF

In [None]:
def calculate_tfidf(docs, tf, idf):
    """Combine term frequencies and IDF weights into per-document TF-IDF.

    ``tf[i]`` must hold the term counts of ``docs[i]``. A term missing from
    ``idf`` is given weight 0. Returns one ``{term: tf * idf}`` dict per
    document, in the same order as ``docs``.
    """
    def weigh(position, tokens):
        counts = tf[position]
        return {token: counts[token] * idf.get(token, 0) for token in tokens}

    return [weigh(position, tokens) for position, tokens in enumerate(docs)]

## Cosine Similarity

In [None]:
def cosine_similarity(query_tfidf, doc_tfidf):
    """Return the cosine similarity of two sparse ``{term: weight}`` vectors.

    Only terms present in both vectors contribute to the dot product.
    If either vector has zero length the similarity is defined as 0.
    """
    shared_terms = set(query_tfidf) & set(doc_tfidf)
    dot_product = sum(
        query_tfidf.get(term, 0) * doc_tfidf.get(term, 0) for term in shared_terms
    )

    query_norm = math.sqrt(sum(weight**2 for weight in query_tfidf.values()))
    document_norm = math.sqrt(sum(weight**2 for weight in doc_tfidf.values()))

    if query_norm != 0 and document_norm != 0:
        return dot_product / (query_norm * document_norm)

    # A zero-length vector has no direction, so report no similarity.
    return 0

## Find Matching Documents

In [None]:
def find_matching_documents(query_tfidf, documents_tfidf):
    """Rank every document by cosine similarity to the query.

    Returns a list of ``(document_index, similarity)`` tuples sorted from
    most to least similar; ties keep their original document order.
    """
    ranked = [
        (index, cosine_similarity(query_tfidf, doc_tfidf))
        for index, doc_tfidf in enumerate(documents_tfidf)
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Find Documents using Query

## Load preprocessed documents

In [None]:
# Reload the preprocessed documents from docs_preprocessed.json.
docs = load_json("docs_preprocessed.json")

## Documents TF-IDF

In [None]:
# Per-document term counts, corpus-wide IDF weights, and the resulting
# per-document TF-IDF vectors.
tf = calculate_tf(docs)
idf = calculate_idf(docs)
docs_tfidf = calculate_tfidf(docs, tf, idf)

In [None]:
# Persist the three corpus statistics to their own JSON files.
dump_json("docs_tf.json", tf)
dump_json("docs_idf.json", idf)
dump_json("docs_tfidf.json", docs_tfidf)

## Query Preprocessing

In [None]:
# NOTE: this first assignment is immediately overwritten by the next line, so
# only the short query below is actually searched.
query = "In 2006, the UK decides on approving the EU constitution, seen by some as crucial for EU efficiency and by others as a step towards federalism."
query = "indonesia independent day"
# Tokenize the query and run it through the same preprocess() function that
# is applied to the documents.
preprocessed_query = preprocess(nltk.word_tokenize(query))

## Query TF-IDF

In [None]:
# Treat the query as a one-document corpus for the term counts, but weight
# it with the IDF computed from the document collection; [0] unwraps the
# single resulting vector.
query_tfidf = calculate_tfidf([preprocessed_query], calculate_tf([preprocessed_query]), idf)[0]

In [None]:
# (document_index, similarity) pairs for every document, best match first.
result = find_matching_documents(query_tfidf, docs_tfidf)

In [None]:
# Show only the (index, similarity) pairs whose similarity exceeds 0.09.
[tupel for tupel in result if tupel[1] > 0.09]

[(3859, 0.29804171501263027),
 (11, 0.26472209044781947),
 (615, 0.166472471186825),
 (717, 0.10436083338482859)]

In [None]:
# File name of the top-ranked document: result[0][0] is the index of the
# document with the highest similarity.
# NOTE(review): this assumes DOCS_FILE_NAME is in the same order as when the
# dataset was built - confirm, or look the name up in docs_list.json instead.
DOCS_FILE_NAME[result[0][0]]

'indo58'