In [69]:
import pandas as pd
# Load the legal Q&A dataset from a CSV file located relative to the notebook.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the file — confirm this matches the file's real encoding.
df = pd.read_csv("../data/legal_qa_summarized_full.csv", encoding='unicode_escape')

In [70]:
# Plain Python list of the 'question' column, in DataFrame row order.
questions_list = df['question'].tolist()

In [71]:
# Plain Python list of the 'answer' column, in the same row order as the
# 'question' column of the same DataFrame.
answers_list = df['answer'].tolist()

In [72]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources requested by name below (tokenizer models, WordNet,
# stopword lists); packages that are already present are reported as up-to-date.
# NOTE(review): newer NLTK releases appear to load the word tokenizer from a
# 'punkt_tab' package instead of 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/gowgow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gowgow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gowgow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re

def preprocess(text):
    """Normalize ``text`` for TF-IDF: strip punctuation, drop stopwords, lemmatize, stem.

    Args:
        text: Raw input string.

    Returns:
        One string containing the surviving tokens, lemmatized and then
        stemmed, joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # Build the stopword set once per call. Evaluating stopwords.words(...)
    # inside the comprehension would re-request the whole list for every token
    # and test membership against a list instead of a set.
    stop_words = set(stopwords.words('english'))
    # Drop every character that is neither a word character nor whitespace,
    # i.e. punctuation (letters, digits and '_' are kept).
    # NOTE(review): this runs before the stopword filter, so any stopword entry
    # that contains punctuation can never match — confirm this is intended.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]
    return ' '.join(stemmed_tokens)

In [74]:
def preprocess_with_stopwords(text):
    """Normalize ``text`` while keeping every token, stopwords included.

    Punctuation is removed, the text is lower-cased and tokenized, and each
    token is lemmatized and then stemmed.

    Args:
        text: Raw input string.

    Returns:
        The normalized tokens joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    stem = PorterStemmer().stem
    # Keep only word characters and whitespace (removes punctuation).
    cleaned = re.sub(r'[^\w\s]', '', text)
    normalized = []
    for word in nltk.word_tokenize(cleaned.lower()):
        normalized.append(stem(lemmatize(word)))
    return ' '.join(normalized)

In [75]:
# Fit the TF-IDF model on the pre-processed (stopword-free) questions.
# X ends up with one row per entry of questions_list.
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
X = vectorizer.fit_transform(map(preprocess, questions_list))



In [76]:
import joblib

# Persist the fitted vectorizer and the question matrix next to the notebook
# so they can be reloaded without re-fitting.
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(X, 'tfidf_matrix.joblib')

['tfidf_matrix.joblib']

In [None]:
# Restore the fitted vectorizer and question matrix written by the dump cell.
# joblib files are pickle-based: only load files that come from a trusted source.
vectorizer = joblib.load('tfidf_vectorizer.joblib')
X = joblib.load('tfidf_matrix.joblib')

In [77]:
def get_response(text, threshold=0.3, verbose=True):
    """Return the stored answer whose question best matches ``text``.

    Retrieval runs in two stages:

    1. ``text`` is vectorized with the notebook-level ``vectorizer`` and
       compared (cosine similarity) against the question matrix ``X``. Every
       question scoring at least ``threshold`` becomes a candidate.
    2. A separate, throw-away TF-IDF model is fitted on the candidates only
       (pre-processed with stopwords kept); the candidate closest to ``text``
       under that model wins.

    The notebook-level ``vectorizer`` is never re-fitted here, so it stays
    aligned with ``X`` and the function can be called repeatedly.

    Args:
        text: The user's question as a string.
        threshold: Minimum first-stage cosine similarity for a question to be
            considered a candidate. Defaults to 0.3.
        verbose: When true (the default), print the intermediate values.

    Returns:
        The entry of ``answers_list`` paired with the best candidate question,
        or the string "I can't answer this question." when no question
        reaches ``threshold``.
    """
    # NOTE(review): X was fitted on preprocess() output (stopwords removed),
    # while the query keeps its stopwords; tokens unknown to the fitted
    # vocabulary are ignored by transform(). Confirm this asymmetry is intended.
    processed_text = preprocess_with_stopwords(text)
    if verbose:
        print("processed_text:", processed_text)
    vectorized_text = vectorizer.transform([processed_text])
    similarities = cosine_similarity(vectorized_text, X)
    if verbose:
        print("similarities:", similarities)
    max_similarity = np.max(similarities)
    if verbose:
        print("max_similarity:", max_similarity)
    if max_similarity >= threshold:
        # Select candidates by row index so each question stays paired with its
        # own answer and no linear search through questions_list is needed.
        candidate_indices = np.flatnonzero(similarities[0] >= threshold)
        high_similarity_questions = [questions_list[i] for i in candidate_indices]
        if verbose:
            print("high_similarity_questions:", high_similarity_questions)

        target_answers = [answers_list[i] for i in candidate_indices]
        if verbose:
            print(target_answers)

        # Re-rank with a local model. Calling fit_transform on the shared
        # vectorizer would replace its vocabulary and leave it out of step
        # with the columns of X for every later call.
        reranker = TfidfVectorizer(tokenizer=nltk.word_tokenize)
        Z = reranker.fit_transform(
            [preprocess_with_stopwords(q) for q in high_similarity_questions]
        )
        # The query was already pre-processed with stopwords kept; reuse it.
        processed_text_with_stopwords = processed_text
        if verbose:
            print("processed_text_with_stopwords:", processed_text_with_stopwords)
        vectorized_text_with_stopwords = reranker.transform([processed_text_with_stopwords])
        final_similarities = cosine_similarity(vectorized_text_with_stopwords, Z)
        closest = np.argmax(final_similarities)
        return target_answers[closest]
    else:
        return "I can't answer this question."

In [78]:
# Example query; the returned answer is displayed as the cell's output.
get_response('find me a divorce lawyer')

processed_text: find me a divorc lawyer
similarities: [[0. 0. 0. ... 0. 0. 0.]]
max_similarity: 0.5574737601444921
['To contest divorce means a couple cannot come to an agreement about things like real property, child custody, or a division of assets. A If a respondent does not agree to the terms proposed in the original filing, they may contest divorce at that time. The exact amount of A divorce can only be contested in the court where the original petition was filed. Litigation required to conclude matters being contested will likely take In no-fault and uncontested divorces, a petitioner will often state irreconcilable differences as the overall reason for the A spouse may contest all of what is being stated or asked for in the original divorce petition or may simply choose to contest one or two The court will usually recommend that both parties hire their own attorney. Even if the spouse who originated the divorce filed the appropriate paperwork without assistance It is the respons

'Even in the best circumstances, divorce is difficult. In most cases, emotions run high while funds run low. Divorcing couples usually The Bar Association for each state should have a list of attorneys who provide pro bono, or free, services to individuals attempting to divorce A courthouse facilitator is an ideal choice if neither party can afford an attorney. Many divorce lawyers will offer a one-time consultation for'