In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math

## First approach: we use NLTK library to tokenize a list of questions to binary vectors

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ngodylan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ngodylan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def read_file(answer, question):
    answers, questions = [], []
    
    with open(answer, 'r') as file:
        answers = file.readlines()
        
    with open(question, 'r') as file:
        questions = file.readlines()
        
    return answers, questions    

In [4]:
!pwd
!ls

/Users/ngodylan/Downloads/Tulane/Fall 2021/CS Capstone/codes
Chatbot.ipynb  [34mFHFCapstone[m[m/


In [8]:
answers, questions = read_file("FHFCapstone/answers.txt", "FHFCapstone/questions.txt")
len(answers), len(questions)

(21, 21)

In [9]:
sw = stopwords.words("english")

def find_similarity(questions, user):
    ranks = []    
    
    # tokenize the user's question
    tokenized_user = word_tokenize(user)
    user_set = {w for w in tokenized_user if not w in sw}
    
    for idx, question in enumerate(questions):
        l1, l2 = [], []
        
        # tokenize the DB's question
        tokenized_question = word_tokenize(question)
        question_set = {w for w in tokenized_question if not w in sw}
        
        rvector = question_set.union(user_set)
        for w in rvector:
            if w in user_set: l1.append(1)
            else: l1.append(0)
                
            if w in question_set: l2.append(1)
            else: l2.append(0)
                
        c = 0
        # cosine formular
        for i in range(len(rvector)):
            c += l1[i]*l2[i]
        cosine = c / ((sum(l1)**0.5*(sum(l2))**0.5))
        
        # for each question, find its similarity to user's question
        ranks.append((idx, cosine))
        
    # sort the ranks
    ranks.sort(key=lambda y: y[1], reverse=True)
    return ranks

def answer(ranks, answers):
    f_idx, s_idx = ranks[0][0], ranks[1][0]
    
    print("The first answer is", answers[f_idx])
    print("The second answer is", answers[s_idx])

In [10]:
ranks = find_similarity(questions, "Can you tell me what an IEP is")
answer(ranks, answers)

The first answer is IEP is an individualized Education Program that provides a written plan designed to meet the unique needs of the child with an exceptionality.

The second answer is At any IEP Team meeting, the following participants shall be in attendance: an officially designated representative of the Local Education Agency (LEA), the student\'s regular education and special education teachers, the student\'s parents, and a person knowledgeable about the student\'s evaluation procedures and results. The student, as well as other individuals the parents and/or LEA may deem necessary, should be given the opportunity to attend. Documentation of attendance is required.



## Second approach: instead of using NLTK, we use TFIDF from scikit-learn library

In [11]:
# TFIDF stands for frequency-inverse document frequency
# try to find the most frequent and significant words

# tf-idf = term_frequency * inverse_document_frequency
# inverse_document_frequency = log(total number of documents / number of documents with term) + 1
# Ex: a word that appears a lot in 1-2 pages is significant

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import glob

In [13]:
directory_path = "./FHFCapstone"
text_files = glob.glob(f"{directory_path}/*.txt")
titles = [Path(text_file).stem for text_file in text_files]
text_files, titles

(['./FHFCapstone/questions.txt', './FHFCapstone/answers.txt'],
 ['questions', 'answers'])

In [14]:
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [16]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=titles, columns=tfidf_vectorizer.get_feature_names())
tfidf_df.head()

Unnamed: 0,1508,1706,21,30,60,ability,academic,achieve,achievement,acquired,...,using,various,visual,vital,ward,way,welfare,work,written,yes
questions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
answers,0.070641,0.023547,0.023547,0.023547,0.023547,0.023547,0.047094,0.023547,0.023547,0.023547,...,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.047094,0.070641


In [17]:
tfidf_df.loc['doc_freq'] = (tfidf_df > 0).sum()
df = tfidf_df.T.sort_values(by=['questions', 'answers'], ascending=False)
df

Unnamed: 0,questions,answers,doc_freq
child,0.364101,0.284815,2.0
school,0.364101,0.167538,2.0
iep,0.364101,0.150784,2.0
evaluation,0.218460,0.134031,2.0
team,0.218460,0.083769,2.0
...,...,...,...
vital,0.000000,0.023547,1.0
ward,0.000000,0.023547,1.0
way,0.000000,0.023547,1.0
welfare,0.000000,0.023547,1.0


### Analysis
Although TFIDF in this case is a good approach, it may not help us solve the problem of ranking questions with user's question in term of similarity. TFIDF will output the significance of a term based on all question/answer pairs. However, these pairs are independent and unrelated. Furthermore, a user's questiion and DB's question both have significant words but they are unrelated, so their similarity is incorrect.

### task
1. Form a list of 21 pairs (answer+question)
2. Fit and transform TF-IDF vectorizer for these 21 pairs, then turn each pair (string) into a vector.
3. 