In [1]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [2]:
import pandas as pd
import numpy as np
import math
import re
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import os

## To follow up on the previous model on detecting Covid-19 on Chest X-Rays and trying to come up with other helpful solutions to improve people's lives during these hard times, we will focus on another aspect of this Pandemic - Mental Health. The intent here is to build a question and answer transformer model to answer people's questions in  regards to mental health. Mental Health is another crucial component of overall well being and many people are likely to show symptoms or exacerbate existing symptoms during the pandemic due to periods of paranoia, extended isolation, etc. It would be helpful to have an easily accessible chatbot or question and answer model, possibly through a website interface, that can provide interactive answers to mental health questions from users. The key of this project is to show the potential of a transformer solution for this problem, for which a real solution would have to be vetted by a mental health/medical and data science teams. The project uses the paper "Attention Is All You Need" by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin or ArXiv.org submitted on 12 Jun 2017 (v1), last revised 6 Dec 2017, v5 to create the question and answer model. <br><br>

## The Dataset consists of 98 question and answer pairs and was prepared by https://www.kaggle.com/narendrageek with the following acknowledgements:
* https://www.thekimfoundation.org/faqs/
* https://www.mhanational.org/frequently-asked-questions
* https://www.wellnessinmind.org/frequently-asked-questions/
* https://www.heretohelp.bc.ca/questions-and-answers

In [3]:
# mount drive

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# It's a zip file so we will use the appropriate python tools

import zipfile

with zipfile.ZipFile("/gdrive/MyDrive/archive.zip","r") as zip_ref:
   zip_ref.extractall('/gdrive/MyDrive/archive/Mental_Health_FAQ.csv')

In [5]:
data_path = '/gdrive/MyDrive/archive/Mental_Health_FAQ.csv/'
fileName = 'Mental_Health_FAQ.csv'

In [6]:
# Use OS library to open

df = pd.read_csv(os.path.join(data_path, fileName), encoding = None)
 

In [7]:
# Examine first five entries

df.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,2110618,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,6361820,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,9434130,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,7657263,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [8]:
# Check file types, number of rows, etc.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question_ID  98 non-null     int64 
 1   Questions    98 non-null     object
 2   Answers      98 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [9]:
# Take a look at the answers which have more content

df['Answers'][0]

'Mental illnesses are health conditions that disrupt a personâ€™s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a personâ€™s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condition due to the brainâ€™s biology.\nSimilarly to how one would treat diabetes with medication and i

In [10]:
# There are some encoding errors we are going to need to fix


df['Answers'] = df['Answers'].map(lambda x: x.encode('ascii', errors = 'replace').decode('utf-8'))
df['Answers'][0]

'Mental illnesses are health conditions that disrupt a person???s thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life.\nMental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital.\nIt is important to know that mental illnesses are medical conditions that have nothing to do with a person???s character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condition due to the brain???s biology.\nSimilarly to how one would treat diabetes with medication and i

In [11]:
# Now that the characters are utf-8, let's correct the errors by replacing the strings

df['Answers'] = df['Answers'].map(lambda x: x.replace('\n', ' '))

In [12]:
df['Answers'] = df['Answers'].map(lambda x: x.replace("???", "'"))

In [13]:
df['Answers'][0]

"Mental illnesses are health conditions that disrupt a person's thoughts, emotions, relationships, and daily functioning. They are associated with distress and diminished capacity to engage in the ordinary activities of daily life. Mental illnesses fall along a continuum of severity: some are fairly mild and only interfere with some aspects of life, such as certain phobias. On the other end of the spectrum lie serious mental illnesses, which result in major functional impairment and interference with daily life. These include such disorders as major depression, schizophrenia, and bipolar disorder, and may require that the person receives care in a hospital. It is important to know that mental illnesses are medical conditions that have nothing to do with a person's character, intelligence, or willpower. Just as diabetes is a disorder of the pancreas, mental illness is a medical condition due to the brain's biology. Similarly to how one would treat diabetes with medication and insulin, m

In [14]:
df['Questions'][10]

'How can I find a mental health professional right for my child or myself?'

In [15]:
# function to format characters and punctuation for processing

def preprocess_sentence(sentence):

    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r"[^a-z?.!,']+", " ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    
    return sentence.strip()

In [16]:
# Add start and end tokens to sentences

df['Questions'] = ["<start> " + preprocess_sentence(sentence) + " <end>" for sentence in df['Questions'].values.tolist()]
df['Answers'] = ["<start> " + preprocess_sentence(sentence) + " <end>" for sentence in df['Answers'].values.tolist()]

In [18]:
df['Questions'][0]

'<start> what does it mean to have a mental illness ? <end>'

In [19]:
df['Answers'][0]

"<start> mental illnesses are health conditions that disrupt a person's thoughts , emotions , relationships , and daily functioning . they are associated with distress and diminished capacity to engage in the ordinary activities of daily life . mental illnesses fall along a continuum of severity some are fairly mild and only interfere with some aspects of life , such as certain phobias . on the other end of the spectrum lie serious mental illnesses , which result in major functional impairment and interference with daily life . these include such disorders as major depression , schizophrenia , and bipolar disorder , and may require that the person receives care in a hospital . it is important to know that mental illnesses are medical conditions that have nothing to do with a person's character , intelligence , or willpower . just as diabetes is a disorder of the pancreas , mental illness is a medical condition due to the brain's biology . similarly to how one would treat diabetes with 

In [20]:
questions = df['Questions'].values.tolist()

In [21]:
# We will tokenize all sentences (convert to numbers so that we can use them for prediction)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None, filters='', # list of characters
                                                  lower=True)                 # to filter is empty
tokenizer.fit_on_texts(questions)                                                # string

questions_sequence = tokenizer.texts_to_sequences(questions)

In [22]:
len(tokenizer.word_counts)

266

In [None]:
questions_sequence[10]

[1, 7, 4, 3, 9, 6, 8, 16, 68, 119, 18, 22, 49, 19, 120, 2]

In [None]:
len(questions)

98

In [23]:
VOCAB_SIZE_QUESTIONS = len(tokenizer.word_counts) + 2

In [24]:
answers = df['Answers'].values.tolist()

In [25]:
# Tokenizer for answers

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None, filters='', # list of characters
                                                  lower=True)                 # to filter is empty
tokenizer.fit_on_texts(answers)                                                # string

answers_sequence = tokenizer.texts_to_sequences(answers)

In [26]:
len(tokenizer.word_counts)

2706

In [27]:
VOCAB_SIZE_ANSWERS = len(tokenizer.word_counts) + 2

In [31]:
answers_sequence[10]

[42,
 201,
 601,
 17,
 6,
 121,
 7,
 9,
 13,
 173,
 15,
 278,
 17,
 15,
 602,
 3,
 6,
 1194,
 8,
 13,
 60,
 1,
 220,
 6,
 121,
 49,
 185,
 801,
 13,
 279,
 23,
 470,
 39,
 165,
 1,
 43]

In [29]:
len(answers)

98

In [32]:
# Create padding so that we keep the sequences at the same length and establish a max length
MAX_LENGTH = 20

questions = tf.keras.preprocessing.sequence.pad_sequences(questions_sequence,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
answers = tf.keras.preprocessing.sequence.pad_sequences(answers_sequence,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [33]:
answers[0]

array([1579,   60,    2,   24,   15,  119,  254,   12,    5,  111,   17,
         20,   59,    3,   22,  932,    4, 1580,    1,   43], dtype=int32)

In [35]:
# Create the dataset, batch size and improve accessibility to data during training

BUFFER_SIZE = 200
BATCH_SIZE = 16
dataset = tf.data.Dataset.from_tensor_slices((questions, answers))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [41]:
dataset.element_spec

(TensorSpec(shape=(None, 20), dtype=tf.int32, name=None),
 TensorSpec(shape=(None, 20), dtype=tf.int32, name=None))

In [42]:
path = os.path.join('/gdrive/My Drive/', "attention_data")

In [43]:
tf.data.experimental.save(dataset, path)

In [56]:


new_dataset = tf.data.experimental.load(
    path=path, compression=None, element_spec=dataset.element_spec)

In [58]:
new_dataset.element_spec

(TensorSpec(shape=(None, 20), dtype=tf.int32, name=None),
 TensorSpec(shape=(None, 20), dtype=tf.int32, name=None))