# CISI Extraction

## Parse Function

In [2]:
def _store_field(doc, field, lines):
    """Save the buffered text lines of one field into ``doc[field]``.

    Nothing is stored when no record is open, no text field is active, or no
    lines were buffered. When the same field occurs more than once in a record
    (for example several ``.A`` markers), the new text is appended on its own
    line instead of replacing the value stored earlier.
    """
    if doc is None or not field or field == 'references' or not lines:
        return

    text = '\n'.join(lines).strip()
    earlier = doc.get(field)
    if earlier and text:
        doc[field] = earlier + '\n' + text
    elif not earlier:
        doc[field] = text


def parse_cisi_file(filepath):
    """Parse a CISI-style collection file into a list of record dicts.

    The file is read line by line. A line starting with ``.I`` opens a new
    record and carries its integer id; ``.T``, ``.A``, ``.W`` and ``.X`` switch
    the active field to title, author, abstract and references respectively.
    Every other line belongs to the field that is currently active.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list of dicts in file order. Each dict always has ``"id"`` (int) and
        ``"references"`` (list of int, the first number of every three-number
        reference line). ``"title"``, ``"author"`` and ``"abstract"`` are
        present only when the record had text for them; multi-line or repeated
        fields are joined with newlines. Anything before the first ``.I`` line
        is ignored.

    Raises:
        IndexError: If a ``.I`` line has no id after the marker.
        ValueError: If the id, or any value on a three-token reference line,
            is not an integer.
    """
    field_for_marker = {
        '.T': 'title',
        '.A': 'author',
        '.W': 'abstract',
        '.X': 'references',
    }

    documents = []
    current_doc = None      # no record is open until the first ".I" line
    current_field = None
    buffer = []

    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            marker = line[:2]

            if marker == '.I':
                # Close the previous record before opening the next one.
                _store_field(current_doc, current_field, buffer)
                if current_doc is not None:
                    documents.append(current_doc)

                current_doc = {"id": int(line.split()[1]), "references": []}
                current_field = None
                buffer = []

            elif marker in field_for_marker:
                _store_field(current_doc, current_field, buffer)
                current_field = field_for_marker[marker]
                buffer = []

            elif current_field == 'references':
                parts = line.split()
                # Only well-formed "<id> <type> <count>" lines are kept; all
                # three values must be integers, but only the id is stored.
                if current_doc is not None and len(parts) == 3:
                    ref_id, _, _ = map(int, parts)
                    current_doc['references'].append(ref_id)

            else:
                buffer.append(line)

    # The last record has no following ".I" line to trigger its flush.
    _store_field(current_doc, current_field, buffer)
    if current_doc is not None:
        documents.append(current_doc)

    return documents

## Process Data

### Stemmed

#### Full

In [22]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK stop-word corpus is on disk before it is read below, so
# this cell also works on a fresh kernel / fresh environment.
nltk.download('stopwords', quiet=True)

# Parsed records plus one empty vocabulary per text field.
data = parse_cisi_file("../dataset/cisi.all")
stemmer = PorterStemmer()
author_set = set()
title_set = set()
abstract_set = set()
# English stop words as a set. The `stem` helper of this cell does not filter
# on it; it is kept for code that reads `stop_words` later.
stop_words = set(stopwords.words('english'))

def stem(text):
    """Return the stem of every alphabetic word in ``text``, in order.

    The text is lower-cased, runs of ASCII letters delimited by word
    boundaries are extracted, and each one is passed through the module-level
    ``stemmer``. Duplicates are kept.
    """
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    return list(map(stemmer.stem, tokens))

# Gather the distinct stems found in each text field across all records.
field_vocabularies = (
    ('author', author_set),
    ('abstract', abstract_set),
    ('title', title_set),
)
for record in data:
    for field, vocabulary in field_vocabularies:
        text = record.get(field, '')
        if text:  # records without this field contribute nothing
            vocabulary.update(stem(text))

# Write every vocabulary as a sorted list, one stem per line.
output_files = {
    "../out/stemmed/full/authors.txt": author_set,
    "../out/stemmed/full/titles.txt": title_set,
    "../out/stemmed/full/abstracts.txt": abstract_set,
}
for path, vocabulary in output_files.items():
    with open(path, "w") as out:
        out.write("\n".join(sorted(vocabulary)))


#### Stop Word Elimination

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Parsed records plus one empty vocabulary per text field.
data = parse_cisi_file("../dataset/cisi.all")
stemmer = PorterStemmer()
author_set = set()
title_set = set()
abstract_set = set()
# Defined in this cell so that `stem` below does not depend on another cell
# having been run first; a set keeps the membership test cheap.
stop_words = set(stopwords.words('english'))

def stem(text):
    """Return the stems of the non-stop-word alphabetic words in ``text``.

    The text is lower-cased and split into runs of ASCII letters delimited by
    word boundaries. Words found in the module-level ``stop_words`` are
    dropped before stemming; the rest go through the module-level ``stemmer``.
    Order and duplicates are preserved.
    """
    tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    content_tokens = filter(lambda token: token not in stop_words, tokens)
    return [stemmer.stem(token) for token in content_tokens]

# Gather the distinct stop-word-filtered stems of each text field.
field_vocabularies = (
    ('author', author_set),
    ('abstract', abstract_set),
    ('title', title_set),
)
for record in data:
    for field, vocabulary in field_vocabularies:
        text = record.get(field, '')
        if text:  # records without this field contribute nothing
            vocabulary.update(stem(text))

# Write every vocabulary as a sorted list, one stem per line.
output_files = {
    "../out/stemmed/filtered/authors.txt": author_set,
    "../out/stemmed/filtered/titles.txt": title_set,
    "../out/stemmed/filtered/abstracts.txt": abstract_set,
}
for path, vocabulary in output_files.items():
    with open(path, "w") as out:
        out.write("\n".join(sorted(vocabulary)))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natthankrish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Raw Text

#### Full

In [None]:
import re

# Parse the collection and start with an empty vocabulary for each text field.
data = parse_cisi_file("../dataset/cisi.all")
author_set, title_set, abstract_set = set(), set(), set()

def split(text):
    """Break ``text`` into lower-case words made of ASCII letters only.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space, so punctuation and digits act as word separators. The result
    keeps the original order and any duplicates.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()
    return list(filter(None, letters_only.split()))

# Gather the distinct raw words found in each text field across all records.
field_vocabularies = (
    ('author', author_set),
    ('abstract', abstract_set),
    ('title', title_set),
)
for record in data:
    for field, vocabulary in field_vocabularies:
        text = record.get(field, '')
        if text:  # records without this field contribute nothing
            vocabulary.update(split(text))

# Write every vocabulary as a sorted list, one word per line.
output_files = {
    "../out/raw/full/authors.txt": author_set,
    "../out/raw/full/titles.txt": title_set,
    "../out/raw/full/abstracts.txt": abstract_set,
}
for path, vocabulary in output_files.items():
    with open(path, "w") as out:
        out.write("\n".join(sorted(vocabulary)))


#### Stop Word Elimination

In [16]:
import re
import nltk
from nltk.corpus import stopwords

# Parse the collection and start with an empty vocabulary for each text field.
data = parse_cisi_file("../dataset/cisi.all")
author_set, title_set, abstract_set = set(), set(), set()

# English stop words from NLTK, held in a set for the membership test in `split`.
stop_words = set()
stop_words.update(stopwords.words('english'))

def split(text):
    """Break ``text`` into lower-case ASCII-letter words, minus stop words.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space, the text is lower-cased and split on whitespace, and words
    found in the module-level ``stop_words`` are left out. Order and
    duplicates of the remaining words are preserved.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()
    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Gather the distinct stop-word-filtered raw words of each text field.
field_vocabularies = (
    ('author', author_set),
    ('abstract', abstract_set),
    ('title', title_set),
)
for record in data:
    for field, vocabulary in field_vocabularies:
        text = record.get(field, '')
        if text:  # records without this field contribute nothing
            vocabulary.update(split(text))

# Write every vocabulary as a sorted list, one word per line.
output_files = {
    "../out/raw/filtered/authors.txt": author_set,
    "../out/raw/filtered/titles.txt": title_set,
    "../out/raw/filtered/abstracts.txt": abstract_set,
}
for path, vocabulary in output_files.items():
    with open(path, "w") as out:
        out.write("\n".join(sorted(vocabulary)))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natthankrish/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
