In [1]:
import os
import random
import json
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer




In [4]:
# Working folders live next to the notebook: ./data for inputs, ./output for results.
# NOTE(review): `dir` shadows the built-in dir(); the name is kept because
# other cells may refer to it.
dir = os.getcwd()
data_dir, output_dir = (os.path.join(dir, sub) for sub in ('data', 'output'))
for folder in (data_dir, output_dir):
    # exist_ok=True makes the cell safe to re-run when the folders already exist.
    os.makedirs(folder, exist_ok=True)

In [5]:
# Load the GPT-4 answers: every line of the JSONL file is parsed as one JSON record.
gpt4_path = os.path.join(output_dir, "kqa_answers_gpt4_five.jsonl")
with open(gpt4_path, 'r', encoding='utf-8') as jsonl_file:
    gpt4 = list(map(json.loads, jsonl_file))

In [6]:
# Load the Llama answers: every line of the JSONL file is parsed as one JSON record.
llama_path = os.path.join(output_dir, "kqa_answers_llama_five.jsonl")
with open(llama_path, 'r', encoding='utf-8') as jsonl_file:
    llama = list(map(json.loads, jsonl_file))

# Number of words in GPT-4 and Llama answers

In [5]:
# Word counts for the GPT-4 answers: each row of the document-term matrix
# sums to the number of tokens CountVectorizer extracted from that answer.
# NOTE(review): with its default settings CountVectorizer lowercases the text
# and ignores single-character tokens, so these counts slightly undercount
# raw words - confirm that is acceptable for the reported statistics.
corpus = [record['answer'] for record in gpt4]  # the 'answer' field holds the answer text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
token_counts = X.sum(axis=1)

# Longest, mean and shortest answer length, in tokens.
tuple(stat(token_counts) for stat in (np.max, np.mean, np.min))

(221, 116.06, 49)

In [6]:
# Word counts for the Llama answers: each row of the document-term matrix
# sums to the number of tokens CountVectorizer extracted from that answer.
# NOTE(review): with its default settings CountVectorizer lowercases the text
# and ignores single-character tokens, so these counts slightly undercount
# raw words - confirm that is acceptable for the reported statistics.
# This cell rebinds `corpus`, `vectorizer`, `X` and `token_counts`.
corpus = [record['answer'] for record in llama]  # the 'answer' field holds the answer text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
token_counts = X.sum(axis=1)

# Longest, mean and shortest answer length, in tokens.
tuple(stat(token_counts) for stat in (np.max, np.mean, np.min))

(232, 136.62, 4)

In [9]:
# Print five randomly chosen Llama answers for a qualitative look.
# The pool is built from `llama` directly instead of the leftover `corpus`
# variable, whose contents depend on which word-count cell was run last.
llama_answers = [item['answer'] for item in llama]
# A dedicated, seeded generator keeps the printed sample identical across
# Restart & Run All without touching the global `random` state.
sample_rng = random.Random(42)
# random.sample raises ValueError if asked for more items than exist, so clamp.
n_examples = min(5, len(llama_answers))
for answer in sample_rng.sample(llama_answers, n_examples):
    print(answer)
    print("__________________________")

The duration of muscle strain pain can vary depending on the severity of the injury and individual factors. Generally, mild muscle strains can resolve on their own within 7-10 days. Moderate strains may take 2-3 weeks to heal, while severe strains can take up to 6 weeks or more to fully recover. It's essential to rest the affected area, apply ice, and stretch gently to promote healing. Over-the-counter pain relievers such as acetaminophen (Tylenol) or ibuprofen (Advil) can help manage pain and inflammation. If your symptoms worsen or persist, it's best to consult with a healthcare professional for further evaluation and guidance.
__________________________
Muscle aches, also known as myalgias, can be caused by a variety of factors beyond just exercise. If you are experiencing persistent muscle aches without exercising, it is reasonable to investigate the cause. Possible explanations include underlying medical conditions such as hypothyroidism, anemia, vitamin deficiencies (e.g., vitami

# Number of sentences in GPT-4, Llama, and physician answers

In [14]:
# Loaded spaCy pipelines, keyed by model name, so repeated calls to
# count_sentences do not reload the model from disk each time.
_NLP_CACHE = {}


def count_sentences(list_texts, physician=False, nlp=None):
    """Count the sentences in each record's answer text using spaCy.

    Parameters
    ----------
    list_texts : iterable of dict
        Records holding the text to segment. The 'answer' key is read by
        default; the 'Free_form_answer' key is read when ``physician`` is True.
    physician : bool, default False
        Selects which key of each record is segmented (see above).
    nlp : optional
        An already loaded spaCy pipeline (anything exposing ``pipe``). When
        omitted, the "en_core_web_sm" model is loaded once and reused by
        later calls.

    Returns
    -------
    list of int
        Number of sentences found in each record, in input order.

    Raises
    ------
    KeyError
        If a record lacks the selected key.
    """
    if nlp is None:
        model_name = "en_core_web_sm"
        if model_name not in _NLP_CACHE:
            _NLP_CACHE[model_name] = spacy.load(model_name)
        nlp = _NLP_CACHE[model_name]

    # The key depends only on the flag, so choose it once rather than per record.
    text_key = 'Free_form_answer' if physician else 'answer'
    texts = (record[text_key] for record in list_texts)

    # nlp.pipe processes the texts as a stream; the sentences are counted
    # without building a throwaway list of their strings.
    return [sum(1 for _ in doc.sents) for doc in nlp.pipe(texts)]

In [10]:
# Per-answer sentence counts for each model (GPT-4 first, then Llama).
gpt4_sentences, llama_sentences = map(count_sentences, (gpt4, llama))

In [11]:
# Max, mean and min number of sentences per GPT-4 answer.
tuple(stat(gpt4_sentences) for stat in (np.max, np.mean, np.min))

(17, 6.06, 2)

In [12]:
# Max, mean and min number of sentences per Llama answer.
tuple(stat(llama_sentences) for stat in (np.max, np.mean, np.min))

(20, 7.13, 1)

In [None]:
# physician=True makes count_sentences read each record's 'Free_form_answer'
# field instead of 'answer'.
# NOTE(review): presumably the GPT-4 records also carry the physician-written
# reference answer under that key - confirm against the JSONL schema.
physician_sentences = count_sentences(gpt4, physician=True)

In [16]:
# Max, mean and min number of sentences per 'Free_form_answer' text.
tuple(stat(physician_sentences) for stat in (np.max, np.mean, np.min))

(15, 4.62, 1)