In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math

## First approach: we use the NLTK library to tokenize a list of questions into binary vectors

In [3]:
# Fetch the NLTK resources used below: the stop-word corpus (for `stopwords`)
# and the 'punkt' tokenizer data (for `word_tokenize`).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ngodylan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ngodylan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def read_file(answer, question):
    """Read the answers file and the questions file into two lists of lines.

    Args:
        answer: path of the text file holding one answer per line.
        question: path of the text file holding one question per line.

    Returns:
        A tuple ``(answers, questions)``. Each element is the list produced by
        ``readlines()``, so line terminators are kept on every line.
    """
    def _load_lines(path):
        # Text mode with the platform default encoding, exactly one list entry per line.
        with open(path, 'r') as handle:
            return handle.readlines()

    # The answers file is opened first, then the questions file.
    answer_lines = _load_lines(answer)
    question_lines = _load_lines(question)
    return answer_lines, question_lines

In [5]:
# Show the working directory and its contents to confirm where the relative data paths resolve.
!pwd
!ls

/Users/ngodylan/Downloads/Tulane/Fall 2021/CS Capstone/codes
[34mFHFCapstone[m[m/      NLTK-TFIDF.ipynb


In [6]:
# Load the stored answers and questions; read_file returns the lines of each file.
answers, questions = read_file("FHFCapstone/answers.txt", "FHFCapstone/questions.txt")
# Display both lengths to check that the two lists line up one-to-one.
len(answers), len(questions)

(21, 21)

In [7]:
sw = stopwords.words("english")

def find_similarity(questions, user):
    """Rank stored questions by cosine similarity to the user's question.

    Each text is tokenized with ``word_tokenize``, lower-cased, stripped of
    English stop words and reduced to its set of distinct tokens, i.e. a
    binary bag-of-words vector. The cosine of two such vectors is
    ``|A & B| / (sqrt(|A|) * sqrt(|B|))``.

    Args:
        questions: iterable of stored question strings.
        user: the user's question string.

    Returns:
        A list of ``(index, cosine)`` tuples sorted by descending cosine,
        where ``index`` is the position of the question in ``questions``.
        A pair in which either text has no non-stop-word tokens scores 0.0.
    """
    # NLTK's English stop words are lower-case; a set gives O(1) membership tests.
    stop_words = set(sw)

    def content_tokens(text):
        # Lower-case before filtering so capitalised stop words ("What", "Can")
        # are removed and "IEP" matches "iep".
        return {token.lower() for token in word_tokenize(text)} - stop_words

    user_set = content_tokens(user)

    ranks = []
    for idx, question in enumerate(questions):
        question_set = content_tokens(question)

        # For binary vectors the dot product is the number of shared tokens
        # and each norm is the square root of the set size.
        shared = len(user_set & question_set)
        denominator = (len(user_set) ** 0.5) * (len(question_set) ** 0.5)

        # A text made only of stop words has a zero vector: score it 0.0
        # instead of raising ZeroDivisionError.
        score = shared / denominator if denominator else 0.0

        ranks.append((idx, score))

    # Best match first; the sort is stable, so ties keep their original order.
    ranks.sort(key=lambda y: y[1], reverse=True)
    return ranks

def answer(ranks, answers):
    """Print the answers that belong to the best-ranked questions.

    Args:
        ranks: sequence of ``(index, score)`` entries, best match first.
            Only the first element of each entry (the index) is used.
        answers: sequence of answer strings addressed by those indices.

    Prints at most two answers. With a single ranked entry only the first
    answer is printed (instead of raising IndexError), and with no entries
    a short notice is printed.
    """
    if len(ranks) == 0:
        print("No matching answer found.")
        return

    # zip stops at the shorter input, so a one-element ranking prints one line.
    for ordinal, entry in zip(("first", "second"), ranks):
        print(f"The {ordinal} answer is", answers[entry[0]])

In [8]:
# Rank the stored questions against a sample query and print the answers of the best matches.
ranks = find_similarity(questions, "Can you tell me what an IEP is")
answer(ranks, answers)

The first answer is IEP is an individualized Education Program that provides a written plan designed to meet the unique needs of the child with an exceptionality.

The second answer is At any IEP Team meeting, the following participants shall be in attendance: an officially designated representative of the Local Education Agency (LEA), the student\'s regular education and special education teachers, the student\'s parents, and a person knowledgeable about the student\'s evaluation procedures and results. The student, as well as other individuals the parents and/or LEA may deem necessary, should be given the opportunity to attend. Documentation of attendance is required.



## Second approach: instead of using NLTK, we use TFIDF from scikit-learn library

In [9]:
# TF-IDF stands for term frequency-inverse document frequency.
# It tries to find the most significant words: frequent within a document but rare across the other documents.

# tf-idf = term_frequency * inverse_document_frequency
# inverse_document_frequency = log(total number of documents / number of documents with term) + 1
# Ex: a word that appears a lot in 1-2 pages is significant

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
from pathlib import Path
import glob

In [11]:
# Collect every .txt file directly under the data directory and use each file's stem as its title.
directory_path = "./FHFCapstone"
text_files = glob.glob(f"{directory_path}/*.txt")
titles = [Path(text_file).stem for text_file in text_files]
# Display the discovered paths next to their titles.
text_files, titles

(['./FHFCapstone/questions.txt', './FHFCapstone/answers.txt'],
 ['questions', 'answers'])

In [12]:
# input='filename': the vectorizer is given file paths, so each file in text_files is one document.
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names()
)
# One row per file (indexed by title), one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=titles, columns=feature_names)
tfidf_df.head()

Unnamed: 0,1508,1706,21,30,60,ability,academic,achieve,achievement,acquired,...,using,various,visual,vital,ward,way,welfare,work,written,yes
questions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
answers,0.070641,0.023547,0.023547,0.023547,0.023547,0.023547,0.047094,0.023547,0.023547,0.023547,...,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.023547,0.047094,0.070641


In [14]:
# Count document frequency from the real document rows only. Dropping an
# existing 'doc_freq' row first keeps this cell idempotent: without it, a
# re-run would count the 'doc_freq' row itself as one more document.
document_rows = tfidf_df.drop(index='doc_freq', errors='ignore')
tfidf_df.loc['doc_freq'] = (document_rows > 0).sum()
# Terms as rows, sorted by their weight in the questions file, then the answers file.
df = tfidf_df.T.sort_values(by=['questions', 'answers'], ascending=False)
df

Unnamed: 0,questions,answers,doc_freq
child,0.364101,0.284815,2.0
school,0.364101,0.167538,2.0
iep,0.364101,0.150784,2.0
evaluation,0.218460,0.134031,2.0
team,0.218460,0.083769,2.0
...,...,...,...
vital,0.000000,0.023547,1.0
ward,0.000000,0.023547,1.0
way,0.000000,0.023547,1.0
welfare,0.000000,0.023547,1.0


### Analysis
Although TF-IDF is a reasonable approach in general, used this way it may not help us rank the stored questions by their similarity to a user's question. TF-IDF outputs the significance of a term based on all question/answer pairs; however, these pairs are independent of and unrelated to one another. Furthermore, a user's question and a DB question may both contain significant words and still be unrelated, in which case their similarity score is misleading.

### Tasks
1. Form a list of 21 pairs (answer+question)
2. Fit and transform TF-IDF vectorizer for these 21 pairs, then turn each pair(string) into a vector.
3. For any new sentence (user's input), transform it into a vector of the same dimension.
4. Use cosine similarity to rank user's input vs a list of defined questions

In [15]:
# Build one document per FAQ entry: the answer text, a space, then its question.
# The loop variables are named *_text so they do not read like the `answer` function.
data = [
    f"{answer_text} {question_text}"
    for question_text, answer_text in zip(questions, answers)
]
len(data), data[:1]

(21,
 ['IEP is an individualized Education Program that provides a written plan designed to meet the unique needs of the child with an exceptionality.\n What is an IEP\n'])

In [16]:
# input='content': every string in `data` is one document.
# NOTE: this rebinds tfidf_vectorizer and tfidf_vector, replacing the file-level fit from the earlier cell.
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(data)

# Shape is (number of documents, vocabulary size).
print(tfidf_vector.shape)
tfidf_vector.toarray()

(21, 351)


array([[0.        , 0.        , 0.        , ..., 0.        , 0.28268659,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.10922357, 0.        ,
        0.08694272],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [22]:
def find_similarity_tfidf(questions_vector, user, vectorizer):
    """Rank stored document vectors by cosine similarity to the user's text.

    Args:
        questions_vector: 2-D array-like of shape (n_documents, n_features)
            holding one vector per stored document. A sparse matrix (anything
            exposing ``toarray()``) is accepted as well.
        user: the user's question string.
        vectorizer: fitted vectorizer whose ``transform([user])`` returns an
            object with ``toarray()`` yielding a (1, n_features) array.

    Returns:
        A list of ``(index, similarity)`` tuples sorted by descending
        similarity. Pairs involving an all-zero vector (for example a query
        made only of stop words or out-of-vocabulary terms) score 0.0 rather
        than NaN, which would make the sort order unreliable.
    """
    user_vector = np.asarray(vectorizer.transform([user]).toarray(), dtype=float).ravel()

    # Accept the sparse output of fit_transform directly as well as a dense array.
    if hasattr(questions_vector, "toarray"):
        questions_vector = questions_vector.toarray()
    question_matrix = np.asarray(questions_vector, dtype=float)
    if question_matrix.shape[0] == 0:
        return []

    # Cosine similarity of every row against the query in one pass.
    dots = question_matrix @ user_vector
    norms = np.linalg.norm(question_matrix, axis=1) * np.linalg.norm(user_vector)

    # Divide only where the norm product is non-zero; zero vectors keep 0.0.
    similarities = np.zeros(question_matrix.shape[0])
    nonzero = norms > 0
    # Clip away floating-point overshoot so scores stay within [-1, 1].
    similarities[nonzero] = np.clip(dots[nonzero] / norms[nonzero], -1.0, 1.0)

    ranks = [(idx, float(score)) for idx, score in enumerate(similarities)]
    return sorted(ranks, key=lambda x: x[1], reverse=True)

# Rank the fitted TF-IDF document vectors against a sample query and print the top-ranked answers.
ranks = find_similarity_tfidf(tfidf_vector.toarray(), "Can you please tell me what IEP is?", tfidf_vectorizer)
answer(ranks, answers)

The first answer is The notice shall indicate the purpose, time, and location of the IEP Team meeting; who will be in attendance; when a LEA IEP Team member needs to be excused from attending the meeting; the parents\' right to take other participants to the meeting; the student\'s right to participate (when appropriate); and the name of the person in the LEA the parents can contact when they have questions or concerns.

The second answer is IEP is an individualized Education Program that provides a written plan designed to meet the unique needs of the child with an exceptionality.



## Third approach: sentence similarity with Spacy using pre-trained model

In [24]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting spacy
  Downloading spacy-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 3.9 MB/s eta 0:00:01
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp38-cp38-macosx_10_9_x86_64.whl (609 kB)
[K     |████████████████████████████████| 609 kB 31.2 MB/s eta 0:00:01
[?25hCollecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 4.1 MB/s  eta 0:00:01
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.1-py3-none-any.whl (7.0 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.5-cp38-cp38-macosx_10_9_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 12.3 MB/s eta 0:00:01
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.2.1.tar.gz (173 kB)
[K     |████████████████████████████████| 173 kB 18.9 MB/s eta 0:00:01
Collecting

Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [25]:
import spacy
# Load the en_core_web_md pipeline; it is passed as `model` to find_similarity_spacy below.
nlp = spacy.load("en_core_web_md")

In [28]:
def find_similarity_spacy(questions, user, model):
    """Rank stored questions by their similarity to the user's question.

    Args:
        questions: iterable of stored question strings.
        user: the user's question string.
        model: callable that turns a string into an object exposing
            ``similarity(other)`` (here a loaded spaCy pipeline).

    Returns:
        A list of ``(index, similarity)`` tuples, highest similarity first.
    """
    # Process the query once and reuse it for every comparison.
    query_doc = model(user)

    scored = [
        (position, query_doc.similarity(model(text)))
        for position, text in enumerate(questions)
    ]

    # Stable in-place sort: equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

# Rank the stored questions against a sample query with the spaCy pipeline and print the top-ranked answers.
ranks = find_similarity_spacy(questions, "Can you please tell me what IEP is?", nlp)
answer(ranks, answers)

The first answer is The initial evaluation must be conducted within 60 business days of receiving parental consent.

The second answer is FAPE is Free Appropriate Public Education, the provision in IDEA to ensure states provide services to eligible students with disabilities. The FREE in FAPE means school services must be provided at public expense, under public supervision and direction, without charge.



### Analysis
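**TODO:** write up the analysis for the spaCy approach. Observation from the run above: for the query "Can you please tell me what IEP is?", neither of the two top-ranked answers is the IEP definition (they concern the 60-business-day evaluation deadline and FAPE).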

## Fourth approach: sentence similarity with sentence transformers (the baseline is BERT model)

Resources:
1. https://github.com/UKPLab/sentence-transformers
2. https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [29]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 2.7 MB/s eta 0:00:011
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 15.6 MB/s eta 0:00:01
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp38-cp38-macosx_10_11_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 34.7 MB/s eta 0:00:01
Collecting torchvision
  Downloading torchvision-0.11.1-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 27.5 MB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 52.8 MB/s eta 0:00:01
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████|

In [43]:
from scipy.spatial.distance import cosine

In [30]:
from sentence_transformers import SentenceTransformer
# Load the pre-trained sentence-embedding model by name; it is passed as `model` to find_similarity_transformer below.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: What is an IEP

Embedding: [-1.17975533e+00 -4.96721745e-01 -2.79933333e-01 -5.52341282e-01
 -1.74581796e-01 -8.72177482e-01  1.00196898e+00  9.14519250e-01
 -4.86281395e-01 -8.71136636e-02  8.66368860e-02  4.68244970e-01
  1.67360716e-02 -2.74369717e-01  7.17253014e-02 -5.13054550e-01
 -4.53970909e-01  2.72854626e-01 -4.24025357e-01 -1.04111063e+00
  2.55754739e-01 -3.55217785e-01 -3.97026569e-01 -2.53913719e-02
 -3.05562049e-01  3.12842518e-01  5.49238503e-01 -6.55805886e-01
 -2.43770763e-01 -5.24343669e-01  5.76362669e-01  3.96598786e-01
  4.31394398e-01  3.53122234e-01  1.75683811e-01 -6.04147911e-01
 -6.06590450e-01 -1.05823658e-01 -5.73278368e-01 -1.90221772e-01
  2.87710339e-01 -4.91764545e-01 -5.25943562e-02 -8.39806259e-01
 -8.78193200e-01  2.69893795e-01 -9.00010109e-01  4.23725605e-01
 -1.44528389e+00 -8.09964180e-01 -1.13005042e-01  4.38063622e-01
  1.08074808e+00 -2.75241375e-01 -1.34656906e+00 -5.85071504e-01
 -9.69652832e-01  1.39278835e-02  3.67748499e-01  2.5

  1.99057326e-01 -9.09649506e-02  6.40699804e-01  7.78688043e-02]

Sentence: If my child is eligible for special education services, how long will it take to start the services?

Embedding: [ 6.57299235e-02 -1.81685492e-01  4.53952670e-01 -6.62809968e-01
  5.58809698e-01 -9.39226151e-02  5.21855354e-01 -6.37329042e-01
  1.02967262e-01 -2.65399575e-01 -1.83434576e-01  1.63163528e-01
  3.78166884e-01  5.46424806e-01  8.36832941e-01 -5.28977811e-01
 -5.11839569e-01  2.75162280e-01 -1.25535476e+00 -9.54840899e-01
 -1.47798821e-01 -1.56576633e-01 -3.35504323e-01  5.50854504e-01
  1.14205152e-01  4.03982341e-01  3.62695724e-01  3.65818799e-01
  5.95877647e-01 -1.70685098e-01  1.78593680e-01  3.36050540e-01
  1.54397100e-01  4.25150484e-01  7.59965837e-01 -3.11651349e-01
  4.45455432e-01 -9.33884084e-02  6.29888296e-01 -1.97411656e+00
  1.65576503e-01 -1.09945416e+00  3.54383215e-02  7.31268644e-01
 -5.62593102e-01  2.78048217e-01  2.08414197e-02 -4.20847505e-01
 -3.69725436e-01 -5.40484965e-

In [61]:
def find_similarity_transformer(questions, user, model):
    """Rank stored questions by embedding cosine similarity to the user's question.

    Args:
        questions: list of stored question strings, encoded in one call.
        user: the user's question string.
        model: object whose ``encode`` method maps a list of strings to one
            embedding per string and a single string to a single embedding.

    Returns:
        A list of ``(index, similarity)`` tuples, highest similarity first,
        where similarity is ``1.0 - cosine distance``.
    """
    corpus_vectors = model.encode(questions)
    query_vector = model.encode(user)

    # scipy's `cosine` is a distance, so subtract it from 1.0 to get a similarity.
    scored = [
        (position, 1.0 - cosine(query_vector, vector))
        for position, vector in enumerate(corpus_vectors)
    ]

    scored.sort(key=lambda item: item[1], reverse=True)
    return scored

In [71]:
# Rank the stored questions against a reworded query with the sentence-embedding model and print the top-ranked answers.
user_question = "how long to start the services, if my child is eligible for special education services?"
ranks = find_similarity_transformer(questions, user_question, model)
answer(ranks, answers)

The first answer is The local education agency (LEA) also referred to as the school district has a maximum of 30 calendar days to complete the IEP/placement document for an eligible student.

The second answer is If your child is between 3 and 21 and having academic, social, or behavioral problems, you may consider getting your child evaluated. In Louisiana, this is called a 1508 evaluation.  Many struggling learners first go through the School Building Level Committee (SBLC) and receive interventions (RTI) before they are referred for an evaluation.

