In [1]:
import numpy as np
import pandas as pd
import re
import os
import nltk
import math

In [2]:
corpus_path = "./nasa/"

filenames = os.listdir(corpus_path)

files = [os.path.join(corpus_path, filepath) for filepath in filenames]

data = {}

# get data
for i, f in enumerate(files, 0):
	with open(f) as fo:
		curr = fo.read()
		data[filenames[i]] = (curr)

In [3]:
# remove spec chars and lower case it
data = {doc:re.sub(r'[^\w\s\n]', ' ', data[doc]).lower() for doc in data}
data

{'emt01995.txt': ' \n\nintegration of mechanical design  analysis  and fabrication processes\n\nmechanical design has been integrated with thermal  structural and optical\nanalysis  and with fabrication  electronic import of the model geometry\neliminates the repetitive steps of geometry input to develop each analysis\nmodel  leading to faster and more accurate analyses  electronic transfer of\na part to fabrication eliminates the need to manually input a complex\ngeometry into a numeric control  nc  machine \n\n \n\npotential commercial uses\n\n     any design or manufacturing process  e g \n        o automotive\n        o appliance\n        o plastics\n        o airplane\n        o nuclear\n     laboratory optical testing\n     automated process analysis\n     nuclear plant analysis\n\nbenefits\n\n     rapid model development\n     accuracy and precision of models\n     ease of analysis transfer\n     automatic tolerance definition and fit checking\n     true optical performance pred

In [4]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))


# tokenise
data = {doc:[word for word in nltk.word_tokenize(data[doc]) if word not in stop_words] for doc in data}
data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gpaul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'emt01995.txt': ['integration',
  'mechanical',
  'design',
  'analysis',
  'fabrication',
  'processes',
  'mechanical',
  'design',
  'integrated',
  'thermal',
  'structural',
  'optical',
  'analysis',
  'fabrication',
  'electronic',
  'import',
  'model',
  'geometry',
  'eliminates',
  'repetitive',
  'steps',
  'geometry',
  'input',
  'develop',
  'analysis',
  'model',
  'leading',
  'faster',
  'accurate',
  'analyses',
  'electronic',
  'transfer',
  'part',
  'fabrication',
  'eliminates',
  'need',
  'manually',
  'input',
  'complex',
  'geometry',
  'numeric',
  'control',
  'nc',
  'machine',
  'potential',
  'commercial',
  'uses',
  'design',
  'manufacturing',
  'process',
  'e',
  'g',
  'automotive',
  'appliance',
  'plastics',
  'airplane',
  'nuclear',
  'laboratory',
  'optical',
  'testing',
  'automated',
  'process',
  'analysis',
  'nuclear',
  'plant',
  'analysis',
  'benefits',
  'rapid',
  'model',
  'development',
  'accuracy',
  'precision',
  'mode

In [5]:
# stem
stems_list = {}
stemmer = nltk.PorterStemmer()
for doc in data:
	stems_list[doc] = [stemmer.stem(word) for word in data[doc]] # type: ignore
stems_list


{'emt01995.txt': ['integr',
  'mechan',
  'design',
  'analysi',
  'fabric',
  'process',
  'mechan',
  'design',
  'integr',
  'thermal',
  'structur',
  'optic',
  'analysi',
  'fabric',
  'electron',
  'import',
  'model',
  'geometri',
  'elimin',
  'repetit',
  'step',
  'geometri',
  'input',
  'develop',
  'analysi',
  'model',
  'lead',
  'faster',
  'accur',
  'analys',
  'electron',
  'transfer',
  'part',
  'fabric',
  'elimin',
  'need',
  'manual',
  'input',
  'complex',
  'geometri',
  'numer',
  'control',
  'nc',
  'machin',
  'potenti',
  'commerci',
  'use',
  'design',
  'manufactur',
  'process',
  'e',
  'g',
  'automot',
  'applianc',
  'plastic',
  'airplan',
  'nuclear',
  'laboratori',
  'optic',
  'test',
  'autom',
  'process',
  'analysi',
  'nuclear',
  'plant',
  'analysi',
  'benefit',
  'rapid',
  'model',
  'develop',
  'accuraci',
  'precis',
  'model',
  'eas',
  'analysi',
  'transfer',
  'automat',
  'toler',
  'definit',
  'fit',
  'check',
  'tru

In [6]:
vocab = {}

for doc in stems_list:
	vocab[doc] = {}
	for word in stems_list[doc]:
		if word in vocab[doc]:
			vocab[doc][word] += 1
		else:
			vocab[doc][word] = 1

vocab

{'emt01995.txt': {'integr': 11,
  'mechan': 2,
  'design': 12,
  'analysi': 16,
  'fabric': 6,
  'process': 14,
  'thermal': 5,
  'structur': 6,
  'optic': 9,
  'electron': 3,
  'import': 2,
  'model': 8,
  'geometri': 4,
  'elimin': 2,
  'repetit': 1,
  'step': 2,
  'input': 2,
  'develop': 7,
  'lead': 1,
  'faster': 1,
  'accur': 2,
  'analys': 4,
  'transfer': 4,
  'part': 1,
  'need': 1,
  'manual': 2,
  'complex': 1,
  'numer': 1,
  'control': 1,
  'nc': 1,
  'machin': 1,
  'potenti': 1,
  'commerci': 7,
  'use': 6,
  'manufactur': 3,
  'e': 2,
  'g': 1,
  'automot': 2,
  'applianc': 1,
  'plastic': 1,
  'airplan': 1,
  'nuclear': 2,
  'laboratori': 1,
  'test': 2,
  'autom': 1,
  'plant': 1,
  'benefit': 1,
  'rapid': 2,
  'accuraci': 1,
  'precis': 1,
  'eas': 1,
  'automat': 1,
  'toler': 1,
  'definit': 1,
  'fit': 1,
  'check': 1,
  'true': 1,
  'perform': 2,
  'predict': 4,
  'effici': 1,
  'exact': 1,
  'technolog': 4,
  'mani': 2,
  'industri': 1,
  'recent': 1,
  'concer

In [7]:
tf_across_docs = {}

for doc in vocab:
	for word in vocab[doc]:
		if word in tf_across_docs:
			tf_across_docs[word] +=1
		else:
			tf_across_docs[word] = 1

tf_across_docs

{'integr': 40,
 'mechan': 36,
 'design': 74,
 'analysi': 43,
 'fabric': 29,
 'process': 141,
 'thermal': 28,
 'structur': 51,
 'optic': 22,
 'electron': 40,
 'import': 19,
 'model': 46,
 'geometri': 16,
 'elimin': 18,
 'repetit': 2,
 'step': 11,
 'input': 17,
 'develop': 141,
 'lead': 23,
 'faster': 9,
 'accur': 17,
 'analys': 7,
 'transfer': 141,
 'part': 127,
 'need': 47,
 'manual': 3,
 'complex': 29,
 'numer': 16,
 'control': 53,
 'nc': 1,
 'machin': 27,
 'potenti': 141,
 'commerci': 141,
 'use': 141,
 'manufactur': 33,
 'e': 20,
 'g': 14,
 'automot': 24,
 'applianc': 3,
 'plastic': 6,
 'airplan': 11,
 'nuclear': 11,
 'laboratori': 20,
 'test': 66,
 'autom': 21,
 'plant': 13,
 'benefit': 141,
 'rapid': 15,
 'accuraci': 17,
 'precis': 17,
 'eas': 8,
 'automat': 16,
 'toler': 11,
 'definit': 8,
 'fit': 7,
 'check': 3,
 'true': 6,
 'perform': 69,
 'predict': 24,
 'effici': 36,
 'exact': 3,
 'technolog': 141,
 'mani': 36,
 'industri': 102,
 'recent': 15,
 'concert': 2,
 'movement': 6,
 

In [8]:
# not doc specific
idf = {}

for doc in vocab:
	for word in vocab[doc]:
		noDocsWithWord = 0
		for doc in vocab:
			if word in vocab[doc]:
				noDocsWithWord += 1
		idf[word] = math.log(len(vocab) / noDocsWithWord)

idf

{'integr': 1.259880436264232,
 'mechan': 1.3652409519220583,
 'design': 0.6446947971739986,
 'analysi': 1.1875597746846058,
 'fabric': 1.5814640603916943,
 'process': 0.0,
 'thermal': 1.6165553802029644,
 'structur': 1.0169342576538425,
 'optic': 1.8577174370198524,
 'electron': 1.259880436264232,
 'import': 2.0043209112117277,
 'model': 1.1201184938890731,
 'geometri': 2.176171168138387,
 'elimin': 2.0583881324820035,
 'repetit': 4.255612709818223,
 'step': 2.5508646175797978,
 'input': 2.1155465463219523,
 'develop': 0.0,
 'lead': 1.8132656744490185,
 'faster': 2.751535313041949,
 'accur': 2.1155465463219523,
 'analys': 3.002849741322855,
 'transfer': 0.0,
 'part': 0.10457280391957703,
 'need': 1.0986122886681098,
 'manual': 3.8501476017100584,
 'complex': 1.5814640603916943,
 'numer': 2.176171168138387,
 'control': 0.9784679768260465,
 'nc': 4.948759890378168,
 'machin': 1.6529230243738393,
 'potenti': 0.0,
 'commerci': 0.0,
 'use': 0.0,
 'manufactur': 1.452252328911688,
 'e': 1.953

In [9]:
tfidf = {}

for doc in vocab:
	tfidf[doc] = {}
	for word in vocab[doc]:
		tfidf[doc][word] = vocab[doc][word] * idf[word]

tfidf

{'emt01995.txt': {'integr': 13.858684798906552,
  'mechan': 2.7304819038441166,
  'design': 7.7363375660879825,
  'analysi': 19.000956394953693,
  'fabric': 9.488784362350167,
  'process': 0.0,
  'thermal': 8.082776901014823,
  'structur': 6.101605545923055,
  'optic': 16.71945693317867,
  'electron': 3.779641308792696,
  'import': 4.0086418224234555,
  'model': 8.960947951112585,
  'geometri': 8.704684672553547,
  'elimin': 4.116776264964007,
  'repetit': 4.255612709818223,
  'step': 5.1017292351595955,
  'input': 4.2310930926439045,
  'develop': 0.0,
  'lead': 1.8132656744490185,
  'faster': 2.751535313041949,
  'accur': 4.2310930926439045,
  'analys': 12.01139896529142,
  'transfer': 0.0,
  'part': 0.10457280391957703,
  'need': 1.0986122886681098,
  'manual': 7.700295203420117,
  'complex': 1.5814640603916943,
  'numer': 2.176171168138387,
  'control': 0.9784679768260465,
  'nc': 4.948759890378168,
  'machin': 1.6529230243738393,
  'potenti': 0.0,
  'commerci': 0.0,
  'use': 0.0,
 

In [10]:
# input a query from the user
# emt02695
query = "Reliability & Maintainability Predictive Software"
data = {"query" : query}

query_rep = []

query_wordlist = []
for word in nltk.word_tokenize(query):
		query_wordlist.append(stemmer.stem(word))

for word in tf_across_docs:
		if word in query_wordlist:
				query_rep.append(1)
		else:
				query_rep.append(0)

query_rep

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [11]:
#open the pca csv file

pca = pd.read_csv("pca.csv", index_col=0)

#reduce the query to 100 dimensions
query_rep = np.array(query_rep)
query_rep = np.dot(query_rep, pca.T)
print(query_rep.shape)


(100,)


In [16]:
# Load the TF-IDF matrix from a CSV file and transpose it
tfidf_matrix_df = pd.read_csv("lsi_tfidfmatrix.csv", index_col=0).astype(float).T

# Convert the query_rep to a numpy array
query_rep = np.array(query_rep)

# Convert the TF-IDF matrix to a numpy array
tfidf = np.array(tfidf_matrix_df)

# Calculate the cosine similarity between the query and each document
cosine_sim = [
    np.dot(query_rep, doc) / (np.linalg.norm(query_rep) * np.linalg.norm(doc))
    for doc in tfidf
]

# Find the index of the document with the maximum cosine similarity
max_cosine_sim_index = np.argmax(cosine_sim)

# Find the header of the DataFrame as a list
headers = tfidf_matrix_df.columns.tolist()

# Find the top 10 documents by sorting based on cosine similarity
top_10_indices = sorted(range(len(cosine_sim)), key=lambda i: cosine_sim[i], reverse=True)[:10]

#the indices of the top 10 documents
top_10 = sorted(range(len(cosine_sim)), key=lambda i: cosine_sim[i])[-10:]

#the top 10 documents
top_10_docs = []
for i in top_10:
    top_10_docs.append(tfidf_matrix_df.T.columns.tolist()[i])


print(top_10_docs)

['39', '68', '58', '76', '43', '107', '82', '11', '132', '34']


In [18]:
#list of .txt in the directory nasa

files = []
for doc in os.listdir('nasa'):
    if doc.endswith(".txt"):
        files.append(doc)
print(files)
print(len(files))
for i in top_10_docs:
    print(files[int(i)])



['emt01995.txt', 'emt02495.txt', 'emt02695.txt', 'emt04395.txt', 'emt04495.txt', 'emt04595.txt', 'emt04795.txt', 'emt04895.txt', 'emt04995.txt', 'emt05095.txt', 'emt05995.txt', 'emt07295.txt', 'emt07895.txt', 'emt10195.txt', 'emt10395.txt', 'emt10495.txt', 'emt10695.txt', 'emt11895.txt', 'emt13295.txt', 'emt13495.txt', 'emt13895.txt', 'emt14295.txt', 'emt14395.txt', 'emt15895.txt', 'emt17495.txt', 'emt20895.txt', 'emt21795.txt', 'eos00395.txt', 'eos03595.txt', 'eos03695.txt', 'eos03795.txt', 'eos03995.txt', 'eos05595.txt', 'eos06695.txt', 'eos06795.txt', 'eos06895.txt', 'eos07195.txt', 'eos07795.txt', 'eos11695.txt', 'eos16095.txt', 'eos16995.txt', 'eos19595.txt', 'eos19895.txt', 'eos19995.txt', 'eos20195.txt', 'eos21295.txt', 'inf02895.txt', 'inf07395.txt', 'inf11495.txt', 'inf11595.txt', 'inf12495.txt', 'inf12795.txt', 'inf12995.txt', 'inf14495.txt', 'inf144b95.txt', 'inf17195.txt', 'inf17395.txt', 'inf18595.txt', 'inf18695.txt', 'inf19695.txt', 'inf21595.txt', 'inf21695.txt', 'ins01