# CISI Extraction

## Parse Function

In [16]:
def _store_cisi_field(doc, field, lines):
    """Write the buffered ``lines`` into ``doc[field]``.

    Nothing is stored when there is no active field or no buffered line.
    When the field already holds text (the same marker appeared earlier in
    the record, e.g. one ``.A`` section per author), the new text is appended
    on its own line instead of replacing the earlier value.
    """
    if not (lines and field) or field == 'references':
        # References are collected as a list of ids, never as buffered text.
        return
    text = '\n'.join(lines).strip()
    earlier = doc.get(field)
    if earlier and text:
        text = f"{earlier}\n{text}"
    elif earlier:
        text = earlier
    doc[field] = text


def parse_cisi_file(filepath):
    """Parse a CISI-style collection file into a list of document dicts.

    The file is read line by line. A line starting with ``.I`` opens a new
    record whose id is the second whitespace-separated token of that line.
    Lines starting with ``.T``, ``.A``, ``.W`` and ``.X`` switch the active
    field to ``title``, ``author``, ``abstract`` and ``references``
    respectively. Every other line belongs to the active field.

    Args:
        filepath: Path of the collection file to read.

    Returns:
        A list of dicts in file order. Each dict has ``id`` (int) and
        ``references`` (list of int: the first number of every
        three-number line in the ``.X`` section). ``title``, ``author`` and
        ``abstract`` are present only when the record contains text for
        them; a field that occurs several times in one record has its
        sections joined with newlines.

    Raises:
        IndexError: If a ``.I`` line carries no id token.
        ValueError: If the id, or a three-token reference line, is not
            made of integers.
    """
    field_by_marker = {
        '.T': 'title',
        '.A': 'author',
        '.W': 'abstract',
        '.X': 'references',
    }

    documents = []
    current_doc = {}
    current_field = None
    buffer = []

    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            marker = line[:2]

            if marker == '.I':
                # Close the previous record (if any) before opening a new one.
                if current_doc:
                    _store_cisi_field(current_doc, current_field, buffer)
                    documents.append(current_doc)

                current_doc = {"id": int(line.split()[1]), "references": []}
                current_field = None
                buffer = []

            elif marker in field_by_marker:
                _store_cisi_field(current_doc, current_field, buffer)
                current_field = field_by_marker[marker]
                buffer = []

            elif current_field == 'references':
                # Only lines made of exactly three integers are references;
                # anything else in the section is ignored.
                parts = line.split()
                if len(parts) == 3:
                    ref_id, _ref_type, _count = map(int, parts)
                    current_doc['references'].append(ref_id)

            else:
                buffer.append(line)

    # The last record has no following ``.I`` line to close it.
    if current_doc:
        _store_cisi_field(current_doc, current_field, buffer)
        documents.append(current_doc)

    return documents

## Information Retrieval System

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import re

class IRSystem:
    def __init__(self, isStem, isEliminateStopWords, tfMode, isIDF, isNormalized):
        self.isStem = isStem
        self.isEliminateStopWords = isEliminateStopWords
        self.tfMode = tfMode
        self.isIDF = isIDF
        self.isNormalized = isNormalized
        self.stemmer = PorterStemmer()
        nltk.download('stopwords')
        self.stopwords = set(stopwords.words('english'))
        with open('words.txt', 'r') as file: # UBAH FILE PATH
            self.vocabulary = [line.strip() for line in file]
        with open('words.txt', 'r') as file:  # UBAH FILE PATH
            self.idfweight = np.array([line.strip() for line in file])

    def stem(self, text):
        if self.isStem:
            words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
            return [self.stemmer.stem(word) for word in words]
        else:
            cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text)
            cleaned_text = cleaned_text.lower()  
            words = [word for word in cleaned_text.split() if word]
            return words
    
    def eliminateStopWords(self, list):
        if self.isEliminateStopWords:
            return [word for word in list if word and word not in self.stopwords]
        return list
    
    def calculateTF(self, tokens):
        weight = np.array([0 for i in range (len(self.vocabulary) + 1)])
        unique_token = set(tokens)
        undefined_token = 0

        # calculate TF defined token
        for token in unique_token:
            try:
                idx = self.vocabulary.index(token)
                weight[idx] = tokens.count(token)
            except ValueError:
                undefined_token = undefined_token + 1
                continue

        weight[self.vocabulary] = undefined_token
                        
        max_list = np.max(weight)
        match self.tfMode:
            case 'natural':
                weight = weight
            case 'augmented':
                weight = 0.5 + (0.5 * weight / max_list)
            case 'logarithmic':
                weight = 1 + np.log2(weight, where=weight > 0, out=np.zeros_like(weight, dtype=float))
            case 'binary':
                weight = (weight > 0).astype(int)

        return weight
            
    def calculateIDF(self, weight):
        if self.isIDF:
            return weight * self.idfweight
        return weight
            
    def calculateWeight(self, token):
        weight = self.calculateTF(token)
        weight = self.calculateIDF(weight)
        return weight 
    
    def expand():
        pass

    def similarity():
        pass
    
    def retrieve(self, query):
        token = self.stem(query)
        token = self.eliminateStopWords(token)
        weight = self.calculateWeight(token)

        # Query Expansion
        weight = self.expand(weight)

        # Calculate
        document_rank = self.similarity(weight)
        return document_rank
    
class GenerativeAdversarialNetwork:
    """Placeholder class; none of its methods is implemented yet."""

    def __init__(self):
        pass

    def discriminator(self):
        """Placeholder; does nothing and returns None."""
        pass

    def generator(self):
        """Placeholder; does nothing and returns None."""
        pass

    def forward(self):
        """Placeholder; does nothing and returns None."""
        pass
    

## Precalculate Document

#### List Vocabulary

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import re

# Shared state for the vocabulary pass: the stemmer, the English stop-word
# set, the parsed collection and one (initially empty) term set per field.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
data = parse_cisi_file("../dataset/cisi.all")
author_set, title_set, abstract_set = set(), set(), set()

# def stem(text):
#     cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text)
#     cleaned_text = cleaned_text.lower()  
#     words = [word for word in cleaned_text.split() if word] # [stemmer.stem(word) for word in words if word not in stop_words]
#     return words

def stem(text):
    """Return the Porter stem of every alphabetic word in ``text``.

    The text is lower-cased first; words are maximal runs of ASCII letters
    delimited by word boundaries. Stop words are not removed.
    """
    stems = []
    for word in re.findall(r'\b[a-zA-Z]+\b', text.lower()):
        stems.append(stemmer.stem(word))
    return stems

# Collect the distinct stems of every field across the whole collection.
# Records that lack a field (or have it empty) contribute nothing to it.
stems_by_field = (
    ('author', author_set),
    ('abstract', abstract_set),
    ('title', title_set),
)
for item in data:
    for field_name, field_stems in stems_by_field:
        field_text = item.get(field_name, '')
        if field_text:
            field_stems.update(stem(field_text))

# Write each vocabulary as one sorted term per line (no trailing newline).
vocabulary_files = (
    ("../out/stemmed/full/author.txt", author_set),
    ("../out/stemmed/full/title.txt", title_set),
    ("../out/stemmed/full/abstract.txt", abstract_set),
)
for out_path, field_stems in vocabulary_files:
    with open(out_path, "w") as f:
        f.write("\n".join(sorted(field_stems)))

#### IDF

In [23]:
import numpy as np
from nltk.stem import PorterStemmer
import re

# Compute one IDF vector per (stemming, source) pair and save it as text.
# Each vector has len(vocabulary) + 1 entries; the extra last entry belongs
# to the <UNKNOWN> token.
stemmer = PorterStemmer()
data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        # Open File Vocabulary
        with open(f'../vocabulary/{stemming}/{source}.txt', 'r') as file:
            vocabulary = [line.strip() for line in file]

        # Tokenise every document with the same rules used for the vocabulary.
        library = []
        for document in data:
            # A record may lack this field; treat it as empty text.
            text = document.get(source, '')
            if stemming == 'raw':
                words = re.sub(r'[^a-zA-Z\s]', ' ', text)
                words = words.lower()
                library.append(words.split())
            else:
                words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
                library.append([stemmer.stem(word) for word in words])

        # Document frequency: number of documents containing each term.
        # Counting once per distinct token avoids rescanning every token
        # list for every vocabulary term.
        document_frequency = {}
        for tokens in library:
            for term in set(tokens):
                document_frequency[term] = document_frequency.get(term, 0) + 1

        idf = np.zeros(len(vocabulary) + 1)
        for i, term in enumerate(vocabulary):
            count = document_frequency.get(term, 0)
            # A term found in no document keeps IDF 0 instead of dividing by 0.
            if count:
                idf[i] = np.log2(len(data) / count)

        # IDF token <UNKNOWN>
        idf[len(vocabulary)] = 1

        np.savetxt(f"../weight/idf/{stemming}/{source}.txt", idf, fmt='%.5f')

### TF

In [None]:
import numpy as np
from nltk.stem import PorterStemmer
import re

# Compute one raw term-frequency vector per document and field and save it
# under ../weight/tf/<stemming>/<document id>/<source>.txt.
import os

stemmer = PorterStemmer()
data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        # Open File Vocabulary
        with open(f'../vocabulary/{stemming}/{source}.txt', 'r') as file:
            vocabulary = [line.strip() for line in file]

        # Term -> row index; the first occurrence wins on duplicated terms.
        term_index = {}
        for position, term in enumerate(vocabulary):
            term_index.setdefault(term, position)

        for document in data:
            # A record may lack this field; treat it as empty text.
            text = document.get(source, '')
            if stemming == 'raw':
                words = re.sub(r'[^a-zA-Z\s]', ' ', text)
                tokens = words.lower().split()
            else:
                words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
                tokens = [stemmer.stem(word) for word in words]

            # Calculate
            # Start from weight 0 everywhere; the vector has one entry per
            # vocabulary term plus one extra entry for the <UNKNOWN> token.
            tf = np.zeros(len(vocabulary) + 1)
            for token in tokens:
                position = term_index.get(token)
                if position is not None:
                    tf[position] += 1

            # Weight for <UNKNOWN> = 0
            tf[len(vocabulary)] = 0

            # Save file (the path hierarchy must not be changed); the
            # per-document directory is created on first use.
            out_dir = f"../weight/tf/{stemming}/{document['id']}"
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(f"{out_dir}/{source}.txt", tf, fmt='%.5f')

### TF.IDF

In [None]:
import numpy as np

# Multiply every saved TF vector by the IDF vector of the same
# (stemming, source) pair and save the product under
# ../weight/tf-idf/<stemming>/<document id>/<source>.txt.
import os

data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        # ndmin=1 keeps a one-line file as a 1-D vector instead of a scalar.
        idf = np.loadtxt(f"../weight/idf/{stemming}/{source}.txt", ndmin=1)
        for document in data:
            idx = document['id']
            tf = np.loadtxt(f"../weight/tf/{stemming}/{idx}/{source}.txt", ndmin=1)

            # New weight = tf * idf (element-wise).
            tf_idf = tf * idf

            # Save file (the path hierarchy must not be changed); the
            # per-document directory is created on first use.
            out_dir = f"../weight/tf-idf/{stemming}/{idx}"
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(f"{out_dir}/{source}.txt", tf_idf, fmt='%.5f')

### TF IDF Length

In [None]:
import numpy as np

# Compute the Euclidean length of every saved TF-IDF vector and store, per
# (stemming, source) pair, one length per document in collection order.
import os

data = parse_cisi_file("../dataset/cisi.all")

for stemming in ['stemmed', 'raw']:
    for source in ['abstract', 'author', 'title']:
        length = []
        for document in data:
            idx = document['id']
            # ndmin=1 keeps a one-line file as a 1-D vector instead of a scalar.
            vector = np.loadtxt(f"../weight/tf-idf/{stemming}/{idx}/{source}.txt", ndmin=1)

            # sqrt(sum(component ** 2 for component in vector))
            magnitude = float(np.sqrt(np.sum(vector ** 2)))

            length.append(magnitude)

        # The file holds the lengths of the first through the last document.
        out_dir = f"../weight/tf-idf-length/{stemming}"
        os.makedirs(out_dir, exist_ok=True)
        np.savetxt(f"{out_dir}/{source}.txt", length, fmt='%.5f')

{'id': 1,
 'references': [1, 1, 1, 1, 1, 556, 92, 262, 1004, 1024],
 'title': '18 Editions of the Dewey Decimal Classifications',
 'author': 'Comaromi, J.P.',
 'abstract': "The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad."}