In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import math

## First approach: we use NLTK library to tokenize a list of questions to binary vectors

In [12]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabedarley/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabedarley/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
def read_file(answer, question):
    answers, questions = [], []
    
    with open(answer, 'r') as file:
        answers = file.readlines()
        
    with open(question, 'r') as file:
        questions = file.readlines()
        
    return answers, questions    

In [14]:
!pwd
!ls

/Users/gabedarley/Desktop/cb/notebooks
NLTK-TFIDF.ipynb SQLite.ipynb     answers.txt      questions.txt


In [15]:
answers, questions = read_file("answers.txt", "questions.txt")
len(answers), len(questions)

(50, 50)

In [16]:
sw = stopwords.words("english")

def find_similarity(questions, user):
    ranks = []    
    
    # tokenize the user's question
    tokenized_user = word_tokenize(user)
    user_set = {w for w in tokenized_user if not w in sw}
    
    for idx, question in enumerate(questions):
        l1, l2 = [], []
        
        # tokenize the DB's question
        tokenized_question = word_tokenize(question)
        question_set = {w for w in tokenized_question if not w in sw}
        
        rvector = question_set.union(user_set)
        for w in rvector:
            if w in user_set: l1.append(1)
            else: l1.append(0)
                
            if w in question_set: l2.append(1)
            else: l2.append(0)
                
        c = 0
        # cosine formular
        for i in range(len(rvector)):
            c += l1[i]*l2[i]
        cosine = c / ((sum(l1)**0.5*(sum(l2))**0.5))
        
        # for each question, find its similarity to user's question
        ranks.append((idx, cosine))
        
    # sort the ranks
    ranks.sort(key=lambda y: y[1], reverse=True)
    return ranks

def answer(ranks, answers):
    f_idx, s_idx = ranks[0][0], ranks[1][0]
    
    print("The first answer is", answers[f_idx])
    print("The second answer is", answers[s_idx])

In [17]:
ranks = find_similarity(questions, "Can you tell me what an IEP is")
answer(ranks, answers)

The first answer is In making changes to a student's IEP after the annual IEP Team meeting for a school year, the parent of a student with a disability and the public agency may agree not to convene an IEP Team meeting for the purposes of making those changes, and instead may amend or modify the student's current IEP.

The second answer is The LDE shall ensure the following. 1.	Each public agency shall take steps, including the provision of supplementary aids and services determined appropriate and necessary by the student's IEP Team, to provide nonacademic and extracurricular services and activities in the manner necessary to afford students with disabilities an equal opportunity for participation in those services and activities. 2.	Nonacademic and extracurricular services and activities may include counseling services, athletics, transportation, health services, recreational activities, special interest groups or clubs sponsored by the public agency, referrals to agencies that provi

## Second approach: instead of using NLTK, we use TFIDF from scikit-learn library

In [18]:
# TFIDF stands for frequency-inverse document frequency
# try to find the most frequent and significant words

# tf-idf = term_frequency * inverse_document_frequency
# inverse_document_frequency = log(total number of documents / number of documents with term) + 1
# Ex: a word that appears a lot in 1-2 pages is significant

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
from pathlib import Path
import glob

In [93]:
directory_path = "../data"
text_files = glob.glob(f"{directory_path}/*.txt")
titles = [Path(text_file).stem for text_file in text_files]
text_files, titles
print(questions)
print(answers)

['What is an LEA?\n', 'What is IDEA?\n', 'What is FAPE?\n', 'What is an IEP?\n', 'What is LRE?\n', 'What is PBIS?\n', 'What is a FBA?\n', 'What is a BIP?\n', 'What is a MDR?\n', 'What is Child Find?\n', 'Should I have my child evaluated for special education?\n', 'What is RTI?\n', 'What is the School Building Level Committee?\n', 'Can the school district refuse to evaluate my child until he or she goes through the RTI process?\n', 'Can the school use my private evaluation to expedite the evaluation process?\n', 'Is the public school system required to evaluate my child that attends a private school for free?\n', 'What does 1508 evaluation mean?\n', 'What are the exceptionalities?\n', 'How long does it take to get my child evaluated?\n', 'Can the school ask for evaluation extensions?\n', 'What happens if the parent disagree with the evaluation?\n', 'If my child is eligible for special education services, how long will it take to start the services?\n', 'Will I receive a notice about the

In [21]:
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=titles, columns=tfidf_vectorizer.get_feature_names())
tfidf_df.head()

In [None]:
tfidf_df.loc['doc_freq'] = (tfidf_df > 0).sum()
df = tfidf_df.T.sort_values(by=['questions', 'answers'], ascending=False)
df

### Analysis
Although TFIDF in this case is a good approach, it may not help us solve the problem of ranking questions with user's question in term of similarity. TFIDF will output the significance of a term based on all question/answer pairs. However, these pairs are independent and unrelated. Furthermore, a user's questiion and DB's question both have significant words but they are unrelated, so their similarity is incorrect.

### Tasks
1. Form a list of 21 pairs (answer+question)
2. Fit and transform TF-IDF vectorizer for these 21 pairs, then turn each pair(string) into a vector.
3. For any new sentence (user's input), transform it into a vector of the same dimension.
4. Use cosine similarity to rank user's input vs a list of defined questions

In [None]:
data = [answer + " " + question for question, answer in zip(questions, answers)]
len(data), data[:1]

In [None]:
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words="english")
tfidf_vector = tfidf_vectorizer.fit_transform(data)

print(tfidf_vector.shape)
tfidf_vector.toarray()

In [None]:
def find_similarity_tfidf(questions_vector, user, vectorizer):
    user_vector = vectorizer.transform([user]).toarray().flatten()

    ranks = []
    for idx in range(len(questions_vector)):
        question_vector = questions_vector[idx, :].flatten()

        # cosine formular
        c = 1.0 - cosine(question_vector, user_vector)
        ranks.append((idx, c))

    sorted_ranks = sorted(ranks, key=lambda x: x[1], reverse=True)
    return sorted_ranks

ranks = find_similarity_tfidf(tfidf_vector.toarray(), "Can you please tell me what IEP is?", tfidf_vectorizer)
answer(ranks, answers)

## Third approach: sentence similarity with Spacy using pre-trained model

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

In [None]:
def find_similarity_spacy(questions, user, model):
    user_doc = model(user)
    
    ranks = []
    for idx, question in enumerate(questions):
        question_doc = model(question)
        similarity = user_doc.similarity(question_doc)
        ranks.append((idx, similarity))
    
    sorted_ranks = sorted(ranks, key=lambda x: x[1], reverse=True)
    return sorted_ranks

ranks = find_similarity_spacy(questions, "Can you please tell me what IEP is?", nlp)
answer(ranks, answers)

## Fourth approach: sentence similarity with sentence transformers (the baseline is BERT model)

Resources:
1. https://github.com/UKPLab/sentence-transformers
2. https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [27]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 2.3 MB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 11.1 MB/s eta 0:00:01
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp38-cp38-macosx_10_11_x86_64.whl (3.7 MB)
[K     |████████████████████████████████| 3.7 MB 45.1 MB/s eta 0:00:01
Collecting torch>=1.6.0
  Downloading torch-1.10.2-cp38-none-macosx_10_9_x86_64.whl (147.2 MB)
[K     |████████████████████████████████| 147.2 MB 51 kB/s s eta 0:00:01    |██████████████▉                 | 68.4 MB 5.5 MB/s eta 0:00:15     |█████████████████▎              | 79.2 MB 5.5 MB/s eta 0:00:13
[?25hCollecting torchvision
  Downloading torchvision-0.11.3-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.1 MB/s eta 0:00:01
C

In [28]:
from scipy.spatial.distance import cosine

In [29]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3988.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=550.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265486777.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466081.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=450.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…




In [30]:
def find_similarity_transformer(questions, user, model):
    sentence_embeddings = model.encode(questions)
    user_embedding = model.encode(user)

    ranks = []
    for idx, embedding in enumerate(sentence_embeddings):
        c = 1.0 - cosine(user_embedding, embedding)
        ranks.append((idx, c))
        
    return sorted(ranks, key=lambda x: x[1], reverse=True)

In [92]:
user_question = "What are the rules regarding missing an IEP meeting?"
ranks = find_similarity_transformer(questions, user_question, model)

for i in range(len(ranks)):
        if ranks[i][0] == 29:
            correct_index = i

answer(ranks, answers)
print(ranks[correct_index][1])

The first answer is In making changes to a student's IEP after the annual IEP Team meeting for a school year, the parent of a student with a disability and the public agency may agree not to convene an IEP Team meeting for the purposes of making those changes, and instead may amend or modify the student's current IEP.

The second answer is Parents may disagree with all or some parts of the program, placement, or related services proposals. The LEA and the parents should make conciliatory attempts to resolve the disputes, including making modifications to the proposed program, placement, and related services. A LEA may not use a parent's refusal to consent to one service or activity to deny the parent or student any other service, benefit, or activity of the LEA.

0.7735286951065063
