In [1]:
import pandas as pd

# Load the ERIC records and discard every row that has a missing value in any
# column, so downstream cells only see complete records.
df = pd.read_csv('database/eric_records.csv').dropna()

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, CountVectorizer, TfidfVectorizer
import numpy as np

# Corpus to index: the `description` column of the cleaned frame, taken as a
# plain array so that row positions line up with the rows of the feature matrix.
documents = df['description'].values

# TF-IDF vectoriser over single words (unigrams) with English stop words
# removed and the vocabulary capped at 1000 terms. Rows are L2-normalised by
# the vectoriser itself (norm='l2').
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    decode_error='ignore',
    max_features=1000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
)

# Fit the vocabulary / IDF weights on the corpus and encode every document.
# NOTE: despite its name, `hashed_features` holds TF-IDF weights, not hashed
# features; the name is kept because later cells refer to it.
hashed_features = vectorizer.fit_transform(documents)


In [16]:
import numpy as np
import faiss

# Densify the TF-IDF matrix: the flat FAISS index below is fed a dense array.
tfidf_vectors = hashed_features.toarray()

# Per-row Euclidean length, kept as a column vector so it broadcasts over rows.
tfidf_norm = np.linalg.norm(tfidf_vectors, axis=1, keepdims=True)

# Scale every row to unit length so that inner product equals cosine
# similarity. Rows whose norm is zero are skipped by the `where` mask and keep
# the zeros they were initialised with, which avoids a division by zero.
tfidf_normed = np.zeros_like(tfidf_vectors)
np.divide(tfidf_vectors, tfidf_norm, out=tfidf_normed, where=tfidf_norm != 0)

# Exact (brute-force) inner-product index whose dimensionality is the number
# of TF-IDF features.
index = faiss.IndexFlatIP(tfidf_normed.shape[1])

# Store the unit-length vectors as float32; ids are assigned in row order, so
# id i corresponds to documents[i].
index.add(tfidf_normed.astype(np.float32))

# Persist the index to disk.
faiss.write_index(index, 'database/eric_index.index')

In [10]:
# Sanity check: number of vectors currently stored in the FAISS index
# (one vector was added per row of the TF-IDF matrix).
index.ntotal

62244

In [107]:
from sklearn.naive_bayes import MultinomialNB

# Spot-check: read the stored vector with id 1 back out of the index.
index.reconstruct(1)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [89]:
query_text = "technology and education."

# Encode the query with the vectoriser that was fitted on the corpus, so the
# query lives in the same feature space as the indexed documents.
query_vector = vectorizer.transform([query_text]).toarray()

# A query consisting only of stop words or out-of-vocabulary terms encodes to
# the zero vector. Dividing by its norm would yield NaNs and a meaningless
# search, so fail loudly instead.
query_norm = np.linalg.norm(query_vector)
if query_norm == 0:
    raise ValueError(
        f"Query {query_text!r} contains no terms from the fitted vocabulary."
    )
query_normed = query_vector / query_norm

# Perform the search
k = 1000  # Number of nearest neighbors to retrieve
_, result_indices = index.search(query_normed.astype(np.float32), k)

# Get the corresponding documents based on the retrieved indices.
# FAISS pads the result with -1 when fewer than k neighbours are available;
# skip those, because documents[-1] would silently return the last document.
similar_documents = [documents[i] for i in result_indices[0] if i >= 0]

In [90]:
# Preview only the top hits: displaying the whole result list (up to k full
# abstracts) floods the cell output and bloats the saved notebook.
similar_documents[:5]

["Technology has undergone a lot of radical changes in the last years which have caused the implemention of new paradigms in different sectors. It is almost impossible for education not to be affected from this change in technology. It has shifted from the traditional applications to the technology use in the classrooms. In this case, teachers' role in the application of technology into education has become an essential part of the research in the field. Also, in higher education, teachers become the key factors to the effective use of technology in the teaching and learning processes. Thus, teachers' technology acceptance level remains an important issue. The term, technology acceptance refers to the adoption and use of technologies in the way they were designed for. The purpose of this study is to examine the technology acceptance level of teachers at Anadolu University School of Foreign Languages to test the Unified Theory of Acceptance and Use of Technology (UTAUT) that determines 