# Fitness QA System: Development and User Model

## Imports

In [1]:
# Only need to run once for setting up virtual environment
#!pip install pandas
#!pip install scikit-learn
#!pip install nltk
#!pip install transformers
# !pip install tensorflow
#!pip install gensim
#!pip install transformers datasets evaluate



In [2]:
try:
  import gensim
  from gensim.models import Word2Vec
except:
  # reset environment
  !pip uninstall -y gensim numpy
  !pip install numpy==1.26.4
  !pip install gensim
  import gensim
  from gensim.models import Word2Vec

In [3]:
import json
import csv
import os
import pandas as pd
import ast
import re
import numpy as np
import unicodedata
import nltk
from nltk.corpus import stopwords
# Information Retrieval
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute measurements
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import tensorflow as tf

## Set Up

For user interface (bottom section):
* Initialize each 'Functions' Section for each Pipeline section
* Run the below cells to load the models and data


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [187]:
# For experimenting with SQuAD2.0 Training Data --- OPTIONAL

# file paths for google drive
json_filepath_train = '/content/drive/MyDrive/4830/Data/train.json' # replace with correct local path

# Read in data from json
with open(json_filepath_train, 'r') as f:
    train_data = json.load(f)

print("Train data length:", len(train_data["data"]))

Train data length: 130317


In [22]:
# Fitness data
stackexchange_filepath = '/content/drive/MyDrive/4830/Data/fitness_squad_filtered.json' # replace with correct local path

# Read in data from json
with open(stackexchange_filepath, 'r') as f:
    fitness_data = json.load(f)

print("Train data length:", len(fitness_data["data"][0]["paragraphs"]))

Train data length: 3262


In [None]:
# Load the Word2Vec model
model= Word2Vec.load("word2vec_fitness.model")

## TF-IDF Pipeline

### Functions for TF-IDF pipeline

In [188]:
# Function for part 1 of tfidf_piepline function
def parse_data_dict(data):
    """
    Parse the SQuAD data dictionary to extract contexts and questions.
    """
    # initialize lists to hold all contexts and questions extracted
    contexts = []
    questions = []

    # iterate through each item in the json data to extract questions and contexts
    for item in data["data"]:
        for paragraph in item["paragraphs"]:
            context = paragraph["context"]
            qas = paragraph["qas"]
            contexts.append(context)
            for quest in qas:
                question = quest["question"]
                questions.append(question)

    # unique list of contexts
    context_list = list(set(contexts))
    question_list = list(set(questions))

    return context_list, question_list

In [189]:
# Function for part 2 of tfidf_pipeline function
def clean_text(text_list,remove_stopwords=True):
    """
    Cleans a list of text strings by applying standard preprocessing steps.

    Parameters:
    text_list : List[str]
        A list of raw text strings to be cleaned.

    remove_stopwords : bool, optional (default=True)
        Whether to remove English stopwords using NLTK's stopword list.

    Returns:
    List[str]
        A list of cleaned text strings with:
            - accents removed
            - text converted to lowercase
            - punctuation removed
    """
    final =[]
    stop_words = set(stopwords.words('english')) if remove_stopwords else set()

    for text in text_list:
        # Normalize unicode (accents)
        text = unicodedata.normalize('NFD', text)
        text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')

        # Convert to lowercase
        # doesn't seem to improve cosine similarity but will keep it in
        text = text.lower()

        # Remove newline and carriage return
        # removing newline does not improve cosine similarity
        #text = text.replace('\n', ' ').replace('\r', '').strip()

        # Remove punctuation
        text = ''.join([char for char in text if char.isalnum() or char.isspace()])

        # Remove stopwords
        # does not improve cosine similarity and takes longer to run
        if remove_stopwords:
            text = ' '.join([word for word in text.split() if word not in stop_words])

        # Append cleaned text to final list
        final.append(text)

    return final

In [190]:
# Function for part 5 of tfidf_pipeline function
def vectorize_question(question, vectorizer):
    """
    Vectorizes a single question using the provided TF-IDF vectorizer.

    Parameters:
    question : str
        The question to be vectorized.
    vectorizer : TfidfVectorizer
        The fitted TF-IDF vectorizer.

    Returns:
        The TF-IDF vector representation of the question.
    """
    # Clean the question
    # This is a list of one string
    question_cleaned = clean_text([question],remove_stopwords=False)

    # Vectorize the cleaned question
    question_vector = vectorizer.transform(question_cleaned)

    return question_vector

In [191]:
# Function for part 6 of tfidf_pipeline function
def compute_similarity(question_vector, context_vectors):
    """
    Computes cosine similarity between a question vector and context vectors.

    Parameters:
    question_vector : sparse matrix
        The TF-IDF vector representation of the question.
    context_vectors : sparse matrix
        The TF-IDF vector representations of the contexts.

    Returns:
        A list of cosine similarity scores between the question and each context.
    """

    similarities = cosine_similarity(question_vector, context_vectors)
    return similarities.flatten()

In [192]:
# Function for part 7 of tfidf_pipeline function
def get_top_k_contexts(similarities, k=2):
    """
    Get the indices of the top k most similar contexts based on cosine similarity scores.

    Parameters:
    similarities : np.ndarray
        The array of cosine similarity scores between the question and contexts.

    k : int, optional (default=2)
        The number of top contexts to retrieve.

    Returns:
    List[int]
        A list of indices corresponding to the top k most similar contexts.
    """
    # Get the indices of the top k most similar contexts
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return top_k_indices

In [193]:
def tfidf_pipeline(data, question, top_k=5, vectorizer=None, remove_stopwords=True):
    """
    Retrieve top-k most relevant contexts to a question using TF-IDF similarity.

    Parameters:
        data (dict): The data loaded from json file in SQuAD format.
        question (str): The question to find relevant contexts for.
        top_k (int): Number of top results to return.
        vectorizer (TfidfVectorizer): Optional, pass an existing fitted vectorizer.

    Returns:
        List[Tuple[str, float]]: List of (context, similarity score) pairs.
    """
    # 1. Extract contexts and questions from the data
    contexts, questions = parse_data_dict(data)

    # 2. Preprocess data by cleaning the text; removing stopwords here does not improve performance
    contexts_cleaned = clean_text(contexts,remove_stopwords=False)
    #print("number of contexts:",len(contexts_cleaned))

    # 3. Initialize the TF-IDF vectorizer
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            lowercase = True,
            ngram_range = (1, 1),
            stop_words = 'english'
        )

    # 4. Fit the vectorizer on the contexts
    context_vectors = vectorizer.fit_transform(contexts_cleaned)
    #print("TF-IDF matrix shape:", context_vectors.shape)

    # 5. Transform the question into a vector using the same vectorizer
    question_vector = vectorize_question(question, vectorizer)

    # 6. Compute the cosine similarity between the question vector and all context vectors
    similarities = compute_similarity(question_vector, context_vectors)

    # 7. Return the top-k (k=2) most similar contexts as candidates for answer extraction via BERT.
    top_k_indices = get_top_k_contexts(similarities, top_k)
    #results = [(contexts[i], similarities[i]) for i in top_k_indices]
    contexts = [contexts[i] for i in top_k_indices]
    similarities = [similarities[i] for i in top_k_indices]

    return contexts,similarities


### Assess TFIDF Pipeline

#### SQuAD2.0 Data Examples

In [None]:
## Examine in the SQuAD 2.0 data ##
# train_data["data"][0]

## Example extraction of context ##
# print(train_data["data"][0]["paragraphs"][0]["context"])

## Example extraction of question ##
# print(train_data["data"][0]["paragraphs"][0]["qas"][0]["question"])

In [194]:
answer_1 = train_data["data"][0]["paragraphs"][0]["qas"][0]["is_impossible"]
print(answer_1)
answer_2 = train_data["data"][100]["paragraphs"][0]["qas"][0]["is_impossible"]
print(answer_2)
answer_3 = train_data["data"][440]["paragraphs"][0]["qas"][0]["is_impossible"]
print(answer_3)

False
False
False


In [195]:
question_1 = train_data["data"][0]["paragraphs"][0]["qas"][0]["question"]
print(question_1)
question_2 = train_data["data"][100]["paragraphs"][0]["qas"][0]["question"]
print(question_2)
question_3 = train_data["data"][440]["paragraphs"][0]["qas"][0]["question"]
print(question_3)

When did Beyonce start becoming popular?
How many weeks did their single "Independent Women Part I" stay on top?
In addition to co-writing credits, Beyoncé also got what credits for most of her albums?


In [197]:
top_pairs_3, similarities = tfidf_pipeline(train_data, question_3, top_k=1)
print("Question:",question_3)
print("Top pairs:")
for context in top_pairs_3:
  for score in similarities:
    print(f"Context: {context}\nScore: {score}\n")

Question: In addition to co-writing credits, Beyoncé also got what credits for most of her albums?
Top pairs:
Context: She has received co-writing credits for most of the songs recorded with Destiny's Child and her solo efforts. Her early songs were personally driven and female-empowerment themed compositions like "Independent Women" and "Survivor", but after the start of her relationship with Jay Z she transitioned to more man-tending anthems such as "Cater 2 U". Beyoncé has also received co-producing credits for most of the records in which she has been involved, especially during her solo efforts. However, she does not formulate beats herself, but typically comes up with melodies and ideas during production, sharing them with producers.
Score: 0.34356668515014505



#### **Fitness Data**

In [42]:
f_contexts, f_questions = parse_data_dict(fitness_data)

In [39]:
### Question and Answer Pair Testing from Fitness Data ###
fit_Q1 = fitness_data["data"][0]["paragraphs"][0]["qas"][0]["question"]
fit_Q2 = fitness_data["data"][0]["paragraphs"][100]["qas"][0]["question"]
print("Q1:", fit_Q1)
print("Q2:",fit_Q2)

fit_A1 = fitness_data["data"][0]["paragraphs"][0]["qas"][0]["answers"][0]["text"]
fit_A2 = fitness_data["data"][0]["paragraphs"][100]["qas"][0]["answers"][0]["text"]
print("A1:",fit_A1)
print("A2:",fit_A2)

Q1: What's the difference between Whey Isolate and Whey Concentrate in shakes? What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.
Q2: What Supplements Should I Take When Restarting a New Exercise Plan? I spent about 6 to 7 years working out hard both in the gym and running, when I was in college.  When I graduated and got a job, I stopped working out cold turkey.  After about 3 years working, I realized I needed to get back into exercising regularly.

I've since since started working out again and weight more than ever.  I want to lose weight and gain muscle.

Should I take supplements that are all protein or should I take supplements that provide more carbohydrates?
A1: The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.

If you have trouble digesting the lactose or

In [40]:
# Initialize TF-IDF vectorizer from sklearn using default tokenizer
vectorize = TfidfVectorizer(
    lowercase = True,
    ngram_range = (1, 1),
    stop_words = 'english'
)
## Ngram range (1,2) gives higher cosine similarity than (1,3) [0.146 vs. 0.115]
## Ngram range (1,1) returns .344 cosine similarity compared to 0.146 with (1,2)

In [55]:
top_pairs_1 = tfidf_pipeline(fitness_data, fit_Q1, top_k=5, vectorizer=vectorize)
print("Question:",fit_Q1)
print("Top pairs:")
for context, score in top_pairs_1:
    print(f"Context: {context[:50]}\nScore: {score}\n")

number of contexts: 3262
TF-IDF matrix shape: (3262, 23925)
Question: What's the difference between Whey Isolate and Whey Concentrate in shakes? What's the difference? I'm looking at shake options and some contain whey isolate, some contain whey concentrate and some both.
Top pairs:
Context: For protein powder there are several things you ne
Score: 0.5252892391324091

Context: The main difference is in the "purity", how much l
Score: 0.49897755376687286

Context: It kinda sounds like you might just be developing 
Score: 0.37499823273323885

Context: Once And For All Time: Protein Shakes Are Not Magi
Score: 0.2839191055587551

Context: You should definitely eat after exercise.

Dependi
Score: 0.25940531355070673



In [57]:
top_pairs_2 = tfidf_pipeline(fitness_data, fit_Q2, top_k=5, vectorizer=vectorize)
print("Question:",fit_Q2)
print("Top pairs:")
for context, score in top_pairs_2:
    print(f"Context: {context[:50]}\nScore: {score}\n")

number of contexts: 3262
TF-IDF matrix shape: (3262, 23925)
Question: What Supplements Should I Take When Restarting a New Exercise Plan? I spent about 6 to 7 years working out hard both in the gym and running, when I was in college.  When I graduated and got a job, I stopped working out cold turkey.  After about 3 years working, I realized I needed to get back into exercising regularly.

I've since since started working out again and weight more than ever.  I want to lose weight and gain muscle.

Should I take supplements that are all protein or should I take supplements that provide more carbohydrates?
Top pairs:
Context: I wouldn't worry about supplements just yet, if yo
Score: 0.23928460768045348

Context: It's not a very specific question, it's basically 
Score: 0.22509600105485933

Context: I'll share my personal experience with you, since 
Score: 0.20926673068053245

Context: You should be taking your supplements daily to mai
Score: 0.1998932512341268

Context: Let's get somethi

## Word2Vec Pipeline

### Word2Vec Functions

In [60]:
import nltk
nltk.download('stopwords')

stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [90]:
def preprocess_word2vec(data):
    """
    Preprocesses the SQuAD data for Word2Vec training.

    Parameters:
    json_data : dict
        The SQuAD data loaded from a JSON file.

    Returns:
    List[str]
        A list of preprocessed sentences (contexts) for Word2Vec training.
    """
    # Extract contexts and questions from the data, returns a list of both
    context_list,q_list = parse_data_dict(data)

    # Combine contexts and questions into a single list and convert to more natural sentences
    raw_sentences = sent_tokenize(" ".join(context_list + q_list))
    # Clean the lists: remove accents, lowercase, remove punctuation
    cleaned_sentences = clean_text(raw_sentences,remove_stopwords=True)

    # Tokenize the sentences
    sentences = []
    for sent in cleaned_sentences:
        sentences.append(word_tokenize(sent))

    return sentences

In [112]:
def tfidf_weighted_avg_vector(text, model, idf_scores):
    """
    Computes a TF-IDF-weighted average Word2Vec vector for a given text.
    """
    # Tokenize the input text into lowercase words
    tokens = [w.lower() for w in nltk.word_tokenize(text)]

    # initialize empty lists to store vectors and their weights
    # vecs holds words vectors multiplied by their TF-IDF weight
    # weights holds the corresponding TF-IDF weights
    vecs, weights = [], []

    # loop through each token
    for w in tokens:
        # Check if the word is in both Word2Vec model and the IDF dictionary
        if w in model.wv and w in idf_scores:
            # Get the IDF weight for the word
            tfidf_weight = idf_scores[w]
            # get the Word2Vec vector for the word
            word_vector = model.wv[w]
            # apply weight to vector
            vecs.append(word_vector * tfidf_weight)

            # store weight for later averaging
            weights.append(tfidf_weight)

    return np.sum(vecs, axis=0) / np.sum(weights) if weights else np.zeros(model.vector_size)


In [111]:
# Initial Function used in building the pipeline - deprecated due to improved performance using weights
# Use tfidf_weight_avg_vector function instead
def averageVector(s,model):
    '''
    Compute the average word2Vec vector for a string using the given model.

    Parameters:
     s: input sentence; A string of words
     model: word2vec model; A gensim word2vec model

    Returns: the average vector representation of the sentence
    '''
    # tokenize the sentence using nltk word_tokenize.
    s_tokens = [w for w in nltk.word_tokenize(s.lower()) if w not in stop_words]

    # check if the word is in the word2vec model and get the vector for the word
    vectors = [model.wv[w] for w in s_tokens if w in model.wv]

    if len(vectors) == 0:
        # if no words were found in the model, return a zero vector
        # prevents division by zero
        return np.zeros(model.vector_size)

    # compute the average vector by summing the vectors and dividing by the number of vectors
    return np.mean(vectors, axis=0)

In [165]:
def word2vec_top_contexts(input_question,context_list,model=model,k=5):
  # Compute the average vector for the input question using the Word2Vec model
  #question_vec = averageVector(input_question, model) # deprecated averageVector function

  # Fit vectorizer on all contexts
  vectorizer = TfidfVectorizer()
  vectorizer.fit(context_list)

  idf_scores = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

  question_vec = tfidf_weighted_avg_vector(input_question, model, idf_scores)

  # Initialize a list of cosine similarity scores between the question and each context
  """similarities = [cosine_similarity([question_vec], [averageVector(c, model)])[0][0] # [0][0] extracts the scalar similarity
                  for c in context_list
                  ]"""

  similarities = [cosine_similarity([question_vec], [tfidf_weighted_avg_vector(c, model, idf_scores)])[0][0] # [0][0] extracts the scalar similarity
                  for c in context_list
                  ]

  # Get indices of the top-k most similar contexts (e.g., top 5)
  # Sort indices based on similarity scores in descending order
  top_k_indices = sorted(
      range(len(similarities)),           # Create a list of indices (0 to len-1)
      key=lambda i: similarities[i],      # Use similarity score as sort key
      reverse=True                        # Sort in descending order to get highest similarity first
      )[:k]                               # Slice to get top k=5 indices

  # Retrieve the actual top-k most similar context strings using the indices
  top_k_contexts = [context_list[i] for i in top_k_indices]

  return top_k_contexts, similarities


### Traing Word2Vec Model

In [95]:
nltk.download('punkt_tab')
squad_sentences = preprocess_word2vec(fitness_data)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [96]:
# Train Word2Vec on domain specific data
model = Word2Vec(sentences=squad_sentences, vector_size=100, window=5, min_count=1, workers=4, epochs=10)

In [110]:
# Save the model
#model.save("word2vec_fitness.model")

### Assessing Results of the Model

In [172]:
# Load the model for future use
# model= Word2Vec.load("word2vec_fitness.model")

In [173]:
fit_Q1 = "What's the difference between Whey Isolate and Whey Concentrate in shakes?"

In [174]:
fit_Q2 = "I want to lose weight and gain muscle. Should I take supplements that are all protein or should I take supplements that provide more carbohydrates?"

In [175]:
# Extract the list of contexts
context_list, q_list = parse_data_dict(fitness_data)

In [176]:
top_k_contexts,similarities = word2vec_top_contexts(fit_Q1,context_list,model)
print("Question:",fit_Q1)
print("Top pairs:")
for context in top_k_contexts:
    print("Context:", context[:101])
    print("Similarity Score:", similarities[context_list.index(context)])
    print()

Question: What's the difference between Whey Isolate and Whey Concentrate in shakes?
Top pairs:
Context: For protein powder there are several things you need to be concerned about:


Protein quality/type: T
Similarity Score: 0.9075297

Context: The cheapest way would be to see if there are any other added amino acids other than protein that are
Similarity Score: 0.8831798

Context: The main difference is in the "purity", how much lactose and fat is left with the protein after filte
Similarity Score: 0.8802362

Context: It kinda sounds like you might just be developing a lactose intolerance. People don't just start out 
Similarity Score: 0.8794478

Context: Don't worry about mixing, just make sure flavors match. Creatine supplements are mostly have no flavo
Similarity Score: 0.86877865



In [177]:
top_contexts_q2,similarities = word2vec_top_contexts(fit_Q2,context_list,model)
print("Question:",fit_Q2)
print("Top pairs:")
for context in top_contexts_q2:
    print("Context:", context[:101])
    print("Similarity Score:", similarities[context_list.index(context)])
    print()

Question: I want to lose weight and gain muscle. Should I take supplements that are all protein or should I take supplements that provide more carbohydrates?
Top pairs:
Context: Let's get something straight. To gain muslce, or gain weight for that matter, you don't need to consu
Similarity Score: 0.9190468

Context: I wouldn't worry about supplements just yet, if you're only getting back into exercising now. They ar
Similarity Score: 0.9054449

Context: if there are any side effect


Glutamine can cause your stools to loosen a bit. The other stuff may h
Similarity Score: 0.9011702

Context: Realistically, protein is just protein. Certainly there are some minor differences in types of protei
Similarity Score: 0.9005274

Context: In order to get big, you need to eat a calorie surplus (more than you burn in a day). To get cut, you
Similarity Score: 0.89868593



## Putting it together: Assessing Extractive QA with Pipeline Inputs

In [126]:
# notebook for fine tuning the question answering model is 02_BERT_Finetuning_SQuAD
from transformers import pipeline

question_answerer = pipeline("question-answering", model="ekfrench/distilbert-finetuned-squad")

Some layers from the model checkpoint at ekfrench/distilbert-finetuned-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at ekfrench/distilbert-finetuned-squad and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use 0


**Compare answers returned for question 1**

In [83]:
fit_Q1

"What's the difference between Whey Isolate and Whey Concentrate in shakes?"

In [106]:
for context in top_k_contexts:
  print(question_answerer(question=fit_Q1, context=context))

{'score': 0.006766576319932938, 'start': 241, 'end': 302, 'answer': 'mix of isolate/concentrate and possibly other protein powders'}
{'score': 0.0015676221810281277, 'start': 409, 'end': 450, 'answer': "It's not a guarantee that they're spiking"}
{'score': 0.0619531013071537, 'start': 190, 'end': 196, 'answer': '70-85%'}
{'score': 0.0060552991926670074, 'start': 773, 'end': 780, 'answer': 'lactose'}
{'score': 0.012375938706099987, 'start': 173, 'end': 232, 'answer': 'not a good idea to mix it with whey unless they have flavor'}


In [82]:
for context in top_pairs_1:
  print(question_answerer(question=fit_Q1, context=context[0]))

{'score': 0.006766576319932938, 'start': 241, 'end': 302, 'answer': 'mix of isolate/concentrate and possibly other protein powders'}
{'score': 0.0619531013071537, 'start': 190, 'end': 196, 'answer': '70-85%'}
{'score': 0.0060552991926670074, 'start': 773, 'end': 780, 'answer': 'lactose'}
{'score': 0.014425982721149921, 'start': 1970, 'end': 2009, 'answer': 'crappy whey powders that have bad stuff'}
{'score': 0.003203597152605653, 'start': 316, 'end': 371, 'answer': 'yes it will matter but NOT as much as most people think'}


In [154]:
print("Full Answer from Dataset:")
print(fit_A1)

Expected Answer from Dataset:
The main difference is in the "purity", how much lactose and fat is left with the protein after filtering. Whey isolate usually contains around 90% protein and whey concentrate is more like 70-85%.

If you have trouble digesting the lactose or are trying to minimize carbohydrate content, then whey isolate would be a good choice. Otherwise, it probably doesn't matter; just pick the concentrate since it's cheaper in terms of protein grams/dollar.


**Results Discussion for Question 1:**

3 of the 5 answers returned for the pipelines are the same. They also both pulled from the original answer in the dataset. An important caveat to remember is for the fitness datasest the context is also the answer in the raw data.

**Compare answers returned for Question 2**

In [107]:
fit_Q2

'I want to lose weight and gain muscle. Should I take supplements that are all protein or should I take supplements that provide more carbohydrates?'

In [108]:
# From word2vec pipeline
for context in top_contexts_q2:
  print(question_answerer(question=fit_Q2, context=context))

{'score': 0.028294716030359268, 'start': 512, 'end': 547, 'answer': 'If you recently started working out'}
{'score': 0.00713271414861083, 'start': 343, 'end': 402, 'answer': "I think that you're going to be better off with the protein"}
{'score': 0.0040982067584991455, 'start': 309, 'end': 361, 'answer': 'whether I should take them daily or pre/post-workout'}
{'score': 0.00011420416558394209, 'start': 587, 'end': 650, 'answer': 'you would want to balance your ratios of protein, carbohydrates'}
{'score': 0.0036743134260177612, 'start': 1145, 'end': 1193, 'answer': "If you're providing yourself with good nutrition"}


In [109]:
# From TF-IDF pipeline
for context in top_pairs_2:
  print(question_answerer(question=fit_Q2, context=context[0]))

{'score': 0.00713271414861083, 'start': 343, 'end': 402, 'answer': "I think that you're going to be better off with the protein"}
{'score': 0.007747635245323181, 'start': 1691, 'end': 1721, 'answer': 'protein shakes, weight gainers'}
{'score': 0.0026461035013198853, 'start': 712, 'end': 778, 'answer': 'carbs are what keep skinny people from gaining weight, not protein'}
{'score': 0.0062279063276946545, 'start': 38, 'end': 65, 'answer': 'daily to maintain your load'}
{'score': 0.028294716030359268, 'start': 512, 'end': 547, 'answer': 'If you recently started working out'}


In [155]:
print("Full Answer from Dataset:")
print(fit_A2)

Full Answer from Dataset:
I wouldn't worry about supplements just yet, if you're only getting back into exercising now. They are intended to finesse results. The core of your plan should be healthy diet and exercise; you certainly don't need supplements to lose weight or to be healthy in general.

If you are going to pick between a focus on protein and carbohydrates I think that you're going to be better off with the protein. Protein is used to build up muscle and also helps suppress your appetite. Carbohydrates are extra energy, which you really don't need; certainly you'll pick some up in your diet (I'm not necessarily advocating for or against a very low-carb diet like Atkins - although it will work).


**Results Discussion for Question 2:**

As you can see there is overlap between the answers returned for both pipelines. You can see the answer from the dataset is reflected in both results from from the pipelines ("I think that you're going to be better off with the protein").

## **Fitness QA System: Ask Your Question Here!**

### Function for User Input Pipeline

Initialize this section

In [178]:
from transformers.utils import logging
logging.set_verbosity_error()

In [179]:
def load_QA_model():
    try:
      question_answerer = pipeline("question-answering", model="ekfrench/distilbert-finetuned-squad")
    except:
      from transformers import pipeline
      question_answerer = pipeline("question-answering", model="ekfrench/distilbert-finetuned-squad")

    return question_answerer

In [184]:
def fitness_QA_pipeline():
  # Get the user question
  question = input("What is your fitness related question? ")

  # Load fitness_data and extract contexts
  #fitness_data = load_json("fitness_data.json")
  context_list, q_list = parse_data_dict(fitness_data)

  # Get top TFIDF Contexts
  top_k_tfidf,similarities_tfidf = tfidf_pipeline(fitness_data, question, top_k=2)

  # Get top Word2Vec Contexts
  top_k_w2v,similarities = word2vec_top_contexts(question,context_list,model,k=2)

  # concatenate the lists to input to the question answering pipeline
  all_contexts = list(set(top_k_tfidf + top_k_w2v))

  # Load the question answering pipeline using the DistilBERT fine-tuned model
  question_answerer = load_QA_model()

  # Get answers to the question by passing all contexts to the question answering pipeline
  all_answers = []
  for context in all_contexts:
    answer = question_answerer(question=question, context=context)
    all_answers.append(answer)

  # Sort all_answers and print answers in order of highest to lowest score
  all_answers.sort(key=lambda x: x['score'], reverse=True)
  print("Your Top Answers:")
  for a in all_answers:
    print(a['answer'])

### User Input and Answer

In [186]:
# Just run this cell and it will prompt you for input
fitness_QA_pipeline()

What is your fitness related question? i want to become fat
Your Top Answers:
you have to eat a lot
more solid
fat is satiating--it makes you feel full
