In [1]:
!pip install sentence_transformers



In [2]:
import getpass
import os

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

In [3]:
# Mount Google Drive at /content/drive so the paths used below resolve in this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Directory whose files are read into the corpus; the trailing slash matters
# because file names are appended to this string directly.
base_path = '/content/drive/MyDrive/Colab Notebooks/dataset/medical/'


In [5]:
# Read every article in `base_path` into two parallel lists:
# names[i] is the file name and content[i] is the text of that file.
content = []
names = []
with os.scandir(base_path) as entries:
    for entry in entries:
        # Skip sub-directories, and skip context.txt: a later cell writes that
        # concatenated corpus file into this same directory, so on a re-run it
        # would otherwise be ingested as if it were one more article.
        if not entry.is_file() or entry.name == 'context.txt':
            continue
        # Read-only mode is enough ('r+' needlessly requested write access), and
        # the context manager closes the handle even if reading raises.
        with open(entry.path, 'r', encoding='utf8') as file:
            # Lines keep their trailing newline and are joined with a space,
            # exactly as before.
            text = ' '.join(file.readlines())
        # Append to both lists only after a successful read so they stay aligned.
        names.append(entry.name)
        content.append(text)

dataset = {'document': names, 'summary': content}

In [6]:
# Reverse lookup from an article's full text to the name of the file it came
# from. If two files hold identical text, the later one wins.
dic_content = dict(zip(content, names))


In [7]:
# Tabulate the corpus: one row per file, with its name ('document') and full text ('summary').
df = pd.DataFrame(dataset)
df

Unnamed: 0,document,summary
0,1383.txt,A sensory deprivation tank cuts a person off f...
1,712.txt,A goiter refers to an enlarged thyroid gland. ...
2,624.txt,"New research, appearing in the International J..."
3,878.txt,"During the early months of pregnancy, it is no..."
4,1553.txt,Asthma causes airway inflammation and difficul...
...,...,...
1984,936.txt,Physical therapy can help people regain moveme...
1985,77.txt,"People with an elevated resting heart rate, me..."
1986,559.txt,Nut allergies are among the most common food a...
1987,681.txt,"Antiretroviral therapy may soon be obsolete, a..."


In [8]:
# Load the pretrained sentence-embedding model used for the semantic search below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
# Embed every article once up front. The array returned by encode() is kept
# as-is: wrapping it in list() forced util.cos_sim to rebuild a tensor from a
# list of separate arrays on every query, which is slow and emits a torch warning.
passage_embeddings = model.encode(df['summary'].to_list(), show_progress_bar=True)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [10]:

# Define a function to find relevant articles based on a given query
def find_relevant_info(query, top_k=3):
    """Return the full text of the passages most similar to `query`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of passages to return (default 3). Values larger
            than the number of embedded passages are clamped, because
            torch.topk raises when asked for more elements than exist.

    Returns:
        A list of summary strings taken from `df`, ordered from most to least
        similar. The strings are returned unmodified (no truncation), because
        callers use them as lookup keys to recover the source file names.
    """
    # Encode the query using the sentence transformer model
    query_embedding = model.encode(query)

    # Cosine similarity between the query and every passage, flattened to a
    # 1-D score vector with one entry per passage.
    similarities = util.cos_sim(query_embedding, passage_embeddings).flatten()

    # Never request more results than there are passages (or fewer than zero).
    k = max(0, min(top_k, similarities.numel()))
    top_indices = torch.topk(similarities, k).indices

    # Look up the complete summary text for each hit, best match first.
    return [df.iloc[idx.item()]['summary'] for idx in top_indices]

In [11]:
def get_document_names(top_relevant_documents, dic_documents) -> list:
    """Map retrieved passages to the names of the files they came from.

    Args:
        top_relevant_documents: Iterable of passage texts.
        dic_documents: Mapping from passage text to file name.

    Returns:
        A list of file names in the same order as the input; a passage that is
        missing from `dic_documents` yields None.
    """
    # Use the mapping that was passed in. Previously the argument was ignored
    # and the global `dic_content` was read instead.
    return [dic_documents.get(passage) for passage in top_relevant_documents]

In [12]:
# Query 1: retrieve the passages most similar to this prompt.
result_query_one = find_relevant_info('Find articles about sex in teeneagers')

  b = torch.tensor(b)


In [13]:
# Resolve the passages retrieved for query 1 to their source file names.
get_document_names(result_query_one, dic_content)


['2.txt', '487.txt', '1330.txt']

In [14]:
# Query 2: retrieve the passages most similar to this prompt.
result_query_two = find_relevant_info('Find articles about Mental health')

In [15]:
# Resolve the passages retrieved for query 2 to their source file names.
get_document_names(result_query_two, dic_content)


['1544.txt', '1535.txt', '1551.txt']

In [16]:
# Query 3: 'Post natal' written as two words (the next query uses the one-word spelling).
result_query_three = find_relevant_info('Find articles about Post natal Depression')

In [17]:
# Resolve the passages retrieved for query 3 to their source file names.
get_document_names(result_query_three, dic_content)


['142.txt', '164.txt', '1495.txt']

In [18]:
# Query 4: same wording as query 3 but with 'Postnatal' spelled as one word.
result_query_four = find_relevant_info('Find articles about Postnatal Depression')

In [19]:
# Resolve the passages retrieved for query 4 to their source file names.
get_document_names(result_query_four, dic_content)


['164.txt', '142.txt', '1429.txt']

Text Generation

In [20]:
!pip install datasets



In [21]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import Dataset, DatasetDict

In [22]:
# Checkpoint identifier used for both the tokenizer and the causal language model below.
MODEL_NAME = "microsoft/DialoGPT-medium"

In [23]:
# Tokenizer matching MODEL_NAME; used for tokenising the corpus and for generation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [24]:
def tokenize_dataset(dataset):
    """Tokenise every entry of dataset['summary'] with the global tokenizer.

    Each text is encoded on its own, truncated to at most 128 tokens.

    Returns:
        A dict with two parallel lists, 'input_ids' and 'attention_mask',
        holding one entry per summary in the original order.
    """
    encodings = [
        tokenizer(text, truncation=True, max_length=128)
        for text in dataset['summary']
    ]
    return {
        'input_ids': [encoding['input_ids'] for encoding in encodings],
        'attention_mask': [encoding['attention_mask'] for encoding in encodings],
    }

In [25]:
from sklearn.model_selection import train_test_split

# Hold out a third of the articles for evaluation. The split is seeded so that
# a fresh "Restart & Run All" reproduces the same train/test partition.
train, test = train_test_split(df, test_size=0.33, random_state=42)

In [26]:
# Additional names for the two splits; these are plain assignments, no copy is made.
train_dataset = train
test_dataset = test

In [27]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer ships without a pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [28]:
# Tokenise the 'summary' column of each split (truncated to 128 tokens per text).
train_dataset_tokens = tokenize_dataset(train_dataset)
test_dataset_tokens = tokenize_dataset(test_dataset)

In [29]:
# Dataset.from_dict takes a column -> values mapping, which is exactly what
# tokenize_dataset returns, so the intermediate DataFrame is unnecessary.
train_dataset_ = Dataset.from_dict(train_dataset_tokens)
test_dataset_ = Dataset.from_dict(test_dataset_tokens)

In [30]:
# Pretrained causal language model that will be fine-tuned on the article text.
model_text_gen = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [31]:
# Batch collator for language modelling; mlm=False disables masked-LM and selects the causal objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [32]:
!pip install accelerate -U




In [33]:
# Training configuration. output_dir points at Drive; the same path is used
# later in the notebook to reload the fine-tuned model.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/model/gpt/workshop/text-gen/',
    num_train_epochs=1, #To keep things fast
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16
)

In [34]:
# The Trainer ties together the model, the training arguments, both tokenised
# splits and the collator, and runs the fine-tuning loop.
# NOTE(review): no evaluation setting is passed in training_args, so eval_dataset may go unused during train() — confirm.
trainer = Trainer(
    model=model_text_gen,
    args=training_args,
    train_dataset=train_dataset_,
    eval_dataset=test_dataset_,
    data_collator=data_collator,
)

In [35]:
# Fine-tune, then persist the resulting model.
# NOTE(review): save_model() is called without a path, so it presumably writes to training_args.output_dir — confirm.
trainer.train()
trainer.save_model()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [36]:
# Reload the fine-tuned model from the training output directory and move it to the GPU.
# The device is hard-coded to 'cuda', so this cell requires a GPU runtime.
model_text_gen_ = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/model/gpt/workshop/text-gen/').to('cuda')

In [37]:
def generate_text(prompt, model_, max_length=64):
    """Generate a continuation of `prompt` and trim it to the last full sentence.

    Args:
        prompt: Text the model should continue.
        model_: Causal language model exposing `generate` and `device`.
        max_length: Value passed to `generate` as `max_length` (default 64).

    Returns:
        The decoded text cut just after its final '.', or the whole decoded
        text when it contains no '.' at all.
    """
    # Put the inputs on whatever device the model lives on instead of assuming CUDA.
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(model_.device)
    outputs = model_.generate(inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # rfind returns -1 when there is no period; slicing with -1 + 1 == 0 used to
    # discard the entire generation, so return it untrimmed in that case.
    last_period = generated_text.rfind('.')
    if last_period == -1:
        return generated_text
    return generated_text[:last_period + 1]

In [38]:
# Sample a continuation from the fine-tuned model for a one-word prompt.
generate_text('Bacteria', model_text_gen_)

'Bacteria can survive in the environment and can survive in the body. The body is a part of the environment. The body is a part of the environment. The environment is a part of the body. The environment is a part of the body. The body is a part of the environment.'

GPT-3.5 Turbo

In [41]:
import locale
# Monkey-patch locale.getpreferredencoding so that it always reports UTF-8.
# NOTE(review): presumably a workaround so the shell-based `!pip` cells that follow run on this runtime — confirm.
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [43]:
!pip install langchain
!pip install openai
!pip3 install flask-sqlalchemy
!pip install chromadb
!pip install tiktoken
!pip install unstructured

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.1
Collecting flask-sqlalchemy
  Downloading flask_sqlalchemy-3.1.1-py3-none-any.whl (25 kB)
Installing collected packages: flask-sqlalchemy
Successfully installed flask-sqlalchemy-3.1.1
Collecting chromadb
  Downloading chromadb-0.4.14-py3-none-any.whl (448 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.1/448.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.103.2-py3

In [44]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chat_models import ChatOpenAI
import os

In [45]:
# Never store a real API key in the notebook: take it from the environment when
# it is already set, otherwise prompt for it without echoing the input.
key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')

In [46]:
# Expose the key through the environment variable for the OpenAI-backed components used below.
os.environ['OPENAI_API_KEY'] = key

Create context

In [49]:
# Concatenate all articles into one corpus string. A blank line separates
# consecutive articles; joining with '' fused the last word of one article
# with the first word of the next.
context = '\n\n'.join(content)

In [50]:
# Target file for the combined corpus. Note that it lives inside the dataset directory itself.
context_file_path = base_path + "context.txt"

In [51]:
# Display the resolved path as a sanity check.
context_file_path

'/content/drive/MyDrive/Colab Notebooks/dataset/medical/context.txt'

In [53]:
# Persist the combined corpus as a single UTF-8 text file (overwriting any previous version).
with open(context_file_path, 'w',encoding='utf8') as f:
    f.write(context)

In [54]:
# Load the combined corpus file and build a vector-store index over it.
# NOTE(review): VectorstoreIndexCreator is used with its defaults, which presumably call OpenAI embeddings via OPENAI_API_KEY — confirm.
loader = TextLoader(context_file_path, encoding='utf8')

index = VectorstoreIndexCreator().from_loaders([loader])



In [55]:
# Ask a question against the index; the answer is produced by ChatOpenAI with its default settings.
print(index.query('What is depression?', llm=ChatOpenAI()))

Depression is a mood disorder characterized by persistent feelings of sadness and a loss of interest in activities. It is different from normal sadness or the occasional "blues" because it lasts for longer periods of time and significantly impacts a person's ability to function. Depression can be caused by various factors, including genetic features, changes in neurotransmitter levels in the brain, environmental factors, psychological and social factors, and the presence of other medical conditions. It is a common condition, affecting millions of people worldwide. Symptoms of depression can include a depressed mood, loss of interest or pleasure in activities, changes in appetite and weight, sleep disturbances, fatigue, restlessness, and slowed movement and speech. Depression is treatable, and management typically involves a combination of support, therapy, and sometimes medication.
