# CISI Extraction

## Parse Function

In [1]:
def parse_cisi_file(filepath):
    documents = []
    current_doc = {}
    current_field = None
    buffer = []

    with open(filepath, 'r') as f:
        for line in f:
            line = line.rstrip()

            if line.startswith('.I'):
                if current_doc:
                    if buffer and current_field:
                        current_doc[current_field] = '\n'.join(buffer).strip()
                    documents.append(current_doc)

                current_doc = {"id": int(line.split()[1]), "references": []}
                current_field = None
                buffer = []

            elif line.startswith('.T'):
                if buffer and current_field:
                    current_doc[current_field] = '\n'.join(buffer).strip()
                current_field = 'title'
                buffer = []

            elif line.startswith('.A'):
                if buffer and current_field:
                    current_doc[current_field] = '\n'.join(buffer).strip()
                current_field = 'author'
                buffer = []

            elif line.startswith('.W'):
                if buffer and current_field:
                    current_doc[current_field] = '\n'.join(buffer).strip()
                current_field = 'abstract'
                buffer = []

            elif line.startswith('.X'):
                if buffer and current_field:
                    current_doc[current_field] = '\n'.join(buffer).strip()
                current_field = 'references'
                buffer = []

            else:
                if current_field == 'references':
                    if line.strip():  
                        parts = line.strip().split()
                        if len(parts) == 3:
                            ref_id, ref_type, count = map(int, parts)
                            current_doc['references'].append(ref_id)
                else:
                    buffer.append(line)

        if current_doc:
            if buffer and current_field and current_field != 'references':
                current_doc[current_field] = '\n'.join(buffer).strip()
            documents.append(current_doc)

    return documents

In [91]:
import numpy as np
import os
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

with open('./vocabulary/raw/abstract.txt', 'r') as file:
    vocab = [line.strip() for line in file if line not in stopwords]

with open('./vocabulary/raw/title.txt', 'r') as file:
    vocab += [line.strip() for line in file if line not in stopwords]

with open('./vocabulary/raw/author.txt', 'r') as file:
    vocab += [line.strip() for line in file if line not in stopwords]

vocab = list(set(vocab))

word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

def one_hot(idx, size):
    vec = np.zeros(size)
    vec[idx] = 1.0
    return vec

X = np.array([one_hot(i, len(vocab)) for i in range(len(vocab))])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natthankrish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [92]:
from sklearn.neural_network import MLPRegressor
import joblib

y = X.copy()

model = MLPRegressor(hidden_layer_sizes=(5), max_iter=5000, activation='relu')
model.fit(X, y)
joblib.dump(model, 'model.joblib')

['model.joblib']

In [93]:
def get_embedding(word):
    idx = word_to_index[word]
    one_hot_vec = one_hot(idx, len(vocab)).reshape(1, -1)
    hidden = model.predict(one_hot_vec)
    return hidden.flatten()

In [94]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vocab_embeddings = np.array([get_embedding(word) for word in vocab])
vocab_words = np.array(vocab)

np.save('vocab_embeddings.npy', vocab_embeddings)

with open('vocab_words.txt', 'w', encoding='utf-8') as f:
    for word in vocab_words:
        f.write(word + '\n')


In [79]:
vocab_words

array(['faced', 'spending', 'reflexive', ..., 'instigates', 'attacked',
       'prefer'], dtype='<U29')

In [78]:
def find_closest(input_word, top_k=1):
    input_emb = np.array(get_embedding(input_word)).reshape(1, -1)
    sims = cosine_similarity(input_emb, vocab_embeddings)[0]  
    top_k_indices = np.argsort(sims)[-top_k:][::-1]      
    closest_words = [vocab_words[i] for i in top_k_indices]
    similarities = [sims[i] for i in top_k_indices]
    return list(zip(closest_words, similarities))

print(find_closest("retain", top_k=3))

[('retain', 1.0000000000000002), ('equilibrium', 0.9999995232919022), ('hierarchic', 0.9999954623190725)]


## Information Retrieval System

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import re

class IRSystem:
    def __init__(self, isStem, isEliminateStopWords, tfMode, isIDF, isNormalized):
        self.isStem = isStem
        self.isEliminateStopWords = isEliminateStopWords
        self.tfMode = tfMode
        self.isIDF = isIDF
        self.isNormalized = isNormalized
        self.stemmer = PorterStemmer()
        nltk.download('stopwords')
        self.stopwords = set(stopwords.words('english'))
        with open('words.txt', 'r') as file: # UBAH FILE PATH
            self.vocabulary = [line.strip() for line in file]
        with open('words.txt', 'r') as file:  # UBAH FILE PATH
            self.idfweight = np.array([line.strip() for line in file])

    def stem(self, text):
        if self.isStem:
            words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
            return [self.stemmer.stem(word) for word in words]
            cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text)
            cleaned_text = cleaned_text.lower()  
            words = [word for word in cleaned_text.split() if word]
            return words
    
    def eliminateStopWords(self, list):
        if self.isEliminateStopWords:
            return [word for word in list if word and word not in self.stopwords]
        return list
    
    def calculateTF(self, tokens):
        weight = np.array([0 for i in range (len(self.vocabulary) + 1)])
        unique_token = set(tokens)
        undefined_token = 0

        for token in unique_token:
            try:
                idx = self.vocabulary.index(token)
                weight[idx] = tokens.count(token)
            except ValueError:
                undefined_token = undefined_token + 1
                continue

        weight[self.vocabulary] = undefined_token
                        
        max_list = np.max(weight)
        match self.tfMode:
            case 'natural':
                weight = weight
            case 'augmented':
                weight = 0.5 + (0.5 * weight / max_list)
            case 'logarithmic':
                weight = 1 + np.log2(weight, where=weight > 0, out=np.zeros_like(weight, dtype=float))
            case 'binary':
                weight = (weight > 0).astype(int)

        return weight
            
    def calculateIDF(self, weight):
        if self.isIDF:
            return weight * self.idfweight
        return weight
            
    def calculateWeight(self, token):
        weight = self.calculateTF(token)
        weight = self.calculateIDF(weight)
        return weight 
    
    def expand():
        pass

    def similarity(self, weight_token):
        # token_magnitude = weight_token.magnitude()
        # for i in range (len(self.weight_document)):
        #     res = self.weight_document[i] * weight_token
        #     if (self.isNormalized):
        #         res /= token_magnitude
        #         res /= self.weight_document.magnitude
        return [{"id": 1, "value": 1232}, {"id": 1, "value": 1.2},]
    
    def retrieve(self, query):
        token = self.stem(query)
        token = self.eliminateStopWords(token)
        weight = self.calculateWeight(token)

        # Query Expansion
        weight = self.expand(weight)

        # Calculate
        document_rank = self.similarity(weight)
        return document_rank
    
class GenerativeAdversarialNetwork:
    def __init__(self):
        pass

    def discriminator():
        pass

    def generator():
        pass

    def forward():
        pass
    

## Precalculate Document

#### List Vocabulary

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import re

data = parse_cisi_file("../dataset/cisi.all")
stemmer = PorterStemmer()
author_set = set()
title_set = set()
abstract_set = set()
stop_words = set(stopwords.words('english'))

# def stem(text):
#     cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text)
#     cleaned_text = cleaned_text.lower()  
#     words = [word for word in cleaned_text.split() if word] # [stemmer.stem(word) for word in words if word not in stop_words]
#     return words

def stem(text):
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    return [stemmer.stem(word) for word in words] # [stemmer.stem(word) for word in words if word not in stop_words]

for item in data:
    author = item.get('author', '')
    abstract = item.get('abstract', '')
    title = item.get('title', '')

    if author:
        author_set.update(stem(author))
    if abstract:
        abstract_set.update(stem(abstract))
    if title:
        title_set.update(stem(title))

with open("../out/stemmed/full/author.txt", "w") as f:
    f.write("\n".join(sorted(author_set)))

with open("../out/stemmed/full/title.txt", "w") as f:
    f.write("\n".join(sorted(title_set)))

with open("../out/stemmed/full/abstract.txt", "w") as f:
    f.write("\n".join(sorted(abstract_set)))

FileNotFoundError: [Errno 2] No such file or directory: '../out/stemmed/full/author.txt'

#### IDF

In [4]:
import numpy as np
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        # Open File Vocabulary
        with open(f'../vocabulary/{stemming}/{source}.txt', 'r') as file: 
            vocabulary = [line.strip() for line in file]

        library = []
        for document in data:
            if stemming == 'raw':
                words = re.sub(r'[^a-zA-Z\s]', ' ', document[source])
                words = words.lower()  
                library.append([word for word in words.split() if word])
            else: 
                words = re.findall(r'\b[a-zA-Z]+\b', document[source].lower())
                library.append([stemmer.stem(word) for word in words])

        # Iterate
        idf = np.array([0.0 for i in range (len(vocabulary) + 1)])
        for i in range (len(vocabulary)):
            count = 0
            for documentidx in range (len(library)):
                if vocabulary[i] in library[documentidx]:
                    count = count + 1
            idf[i] = np.log2(len(data)/count)

        # IDF token <UNKNOWN>
        idf[len(vocabulary)] = 1

        np.savetxt(f"../weight/idf/{stemming}/{source}.txt", idf, fmt='%.5f')

### TF

In [5]:
import numpy as np
from nltk.stem import PorterStemmer
import re
import os
stemmer = PorterStemmer()
data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        # Open File Vocabulary
        with open(f'../vocabulary/{stemming}/{source}.txt', 'r') as file: 
            vocabulary = [line.strip() for line in file]

        for document in data:
            if stemming == 'raw':
                words = re.sub(r'[^a-zA-Z\s]', ' ', document[source])
                words = words.lower()  
                tokens = [word for word in words.split() if word]
            else: 
                words = re.findall(r'\b[a-zA-Z]+\b', document[source].lower())
                tokens = [stemmer.stem(word) for word in words]

            # Calculate
            # Inisialisasi weight = 0 untuk semua dengan dimensi sebanyak kata di vocabulary + 1 kata tambahan yaitu <UNKNOWN> token 
            tf= np.array([0.0 for i in range (len(vocabulary) + 1)])
            
            #TODO: recalculate weightnya
            count_word = {}
            for token in tokens :
                count_word[token] = count_word.get(token, 0) + 1
                
            for tfType in ['natural', 'augmented', 'logarithmic', 'binary'] :
                for idx, vocab_word in enumerate(vocabulary):
                    freq = count_word.get(vocab_word, 0)
                    if tfType == 'natural':
                        tf[idx] = freq
                    elif tfType == 'augmented':
                        max_tf = max(count_word.values()) if count_word else 1
                        tf[idx] = 0.5 + 0.5 * (freq / max_tf) if freq > 0 else 0
                    elif tfType == 'logarithmic':
                        tf[idx] = (1 + np.log10(freq)) if freq > 0 else 0
                    elif tfType == 'binary':
                        tf[idx] = 1 if freq > 0 else 0
                    


                # Weight untuk <UNKNOWN> = 0
                tf[len(vocabulary)] = 0

                #TODO: Save file (Pathnya hierarkinya jangan diubah yah)
 
                os.makedirs(f"../weight/tf/{tfType}/{stemming}/{document['id']}", exist_ok=True)
                # Simpan file
                # np.savetxt(f"{save_path}/{source}.txt", tf, fmt='%.5f')
                np.savetxt(f"../weight/tf/{tfType}/{stemming}/{document['id']}/{source}.txt", tf, fmt='%.5f')

### TF.IDF

In [6]:
import numpy as np
import os

data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        
        idf_path = f"../weight/idf/{stemming}/{source}.txt"
        idf = np.loadtxt(idf_path)

        for document in data:
            doc_id = document['id']

            for tfType in ['natural', 'augmented', 'logarithmic', 'binary']:
                # Load TF
                tf_path = f"../weight/tf/{tfType}/{stemming}/{doc_id}/{source}.txt"
                tf = np.loadtxt(tf_path)

                # Hitung TF-IDF
                tf_idf = tf * idf

                # Simpan hasil
                os.makedirs(f"../weight/tf-idf/{tfType}/{stemming}/{doc_id}", exist_ok=True)

                np.savetxt( f"../weight/tf-idf/{tfType}/{stemming}/{doc_id}/{source}.txt", tf_idf, fmt="%.5f")


### TF IDF Length

In [8]:
import numpy as np

data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        length = []
        for document in data:
            idx = document['id']
            for tfType in ['natural', 'augmented', 'logarithmic', 'binary']:
                #TODO: import tf.idf dengan id = idx
                tf_idf_path =  f"../weight/tf-idf/{tfType}/{stemming}/{doc_id}/{source}.txt"
                tf_idf = np.loadtxt(tf_idf_path)
                #TODO: calculate math.sqrt(sum(component ** 2 for component in vector))

                magnitude = np.sqrt(np.sum(tf_idf ** 2))
                length.append(magnitude)

                
                # Jadi nanti isinya length dari document 1-terakhir
                os.makedirs(f"../weight/tf-idf-length/{tfType}/{doc_id}/{stemming}", exist_ok=True)
                np.savetxt(f"../weight/tf-idf-length/{tfType}/{doc_id}/{stemming}/{source}.txt", idf, fmt='%.5f')