Data Cleaning & Preparation

In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
# Read the CSV file "dataset.csv" for further processing in the notebook
df = pd.read_csv('dataset.csv')

In [None]:
# Replace NaN values with empty strings
df = df.fillna('')

In [None]:
# Dependent and independent variables.
# The dependent variable (target) is the column named in "target_columns".
# The independent variables (features) are the columns listed in "feature_columns":
# Symptom_1 ... Symptom_17.
target_columns = 'Disease'
feature_columns = [f'Symptom_{number}' for number in range(1, 18)]

In [None]:
# Lower-case every value of the target column (the remaining columns are already lower-case)
def str_lower(s):
    """Return the string `s` converted to lower case."""
    return s.lower()

df[target_columns] = df[target_columns].map(str_lower)

In [None]:
# Remove superfluous spaces (the function is not applied where spaces are required)
def remove_space(s):
    """Return `s` with every space character removed."""
    pieces = s.split(' ')
    return ''.join(pieces)

# Strip the spaces from every value of each feature column.
for col in feature_columns:
    df[col] = df[col].apply(remove_space)

In [None]:
# Replace underscores with spaces
def remove_punct(s):
    """Return `s` with every underscore replaced by a single space."""
    pieces = s.split('_')
    return ' '.join(pieces)

# Apply the underscore replacement to every value of each feature column.
for col in feature_columns:
  df[col] = df[col].apply(remove_punct)

In [None]:
# Write the DataFrame to a JSON file; orient='index' stores one object per row, keyed by the row index
df.to_json('symptoms.json', orient='index')

In [None]:
# Read the exported symptom table back in as a plain dict
import json

# The file is only read here, so open it read-only ('r+' would also demand write permission).
with open('symptoms.json', 'r') as f:
  symptoms = json.load(f)

In [None]:
# Convert every entry of `symptoms` into a list of its non-empty values (original value order kept)
sympt = [
    [value for value in row.values() if value != '']
    for row in symptoms.values()
]

In [None]:
# Build the "ask for <disease>" intents.
# Every entry of `sympt` is used as [disease, symptom_1, ..., symptom_n]:
#   * one pattern per symptom ("The patient is having <symptom>")
#   * one response per symptom that asks about all the *other* symptoms of that entry
all_data = []
for s_list in sympt:
    if not s_list:
        # Entry without any non-empty value: nothing to build an intent from.
        continue

    disease, row_symptoms = s_list[0], s_list[1:]

    responses = []
    for position in range(len(row_symptoms)):
        # Join everything except the symptom at `position`. Building the list first
        # avoids a dangling ", " when the excluded symptom is the first or the last one.
        others = row_symptoms[:position] + row_symptoms[position + 1:]
        responses.append('Are there further symptoms like ' + ', '.join(others) + '?')

    all_data.append({
        'tag': 'ask for ' + disease,
        'patterns': ['The patient is having ' + symptom for symptom in row_symptoms],
        'responses': responses,
    })

# Show only a small sample instead of dumping the whole list into the cell output.
all_data[:3]

In [None]:
# Build the "respond for <disease>" intents.
# Every entry of `sympt` is used as [disease, symptom_1, ..., symptom_n]; for each symptom one
# pattern is generated that names it "in combination with" all other symptoms of the entry.
for sy_list in sympt:
    if not sy_list:
        # Entry without any non-empty value: nothing to build an intent from.
        continue

    disease, row_symptoms = sy_list[0], sy_list[1:]

    patterns = []
    for position, symptom in enumerate(row_symptoms):
        # Join everything except the current symptom. Building the list first avoids a
        # dangling ", " when the current symptom is the first or the last one.
        others = row_symptoms[:position] + row_symptoms[position + 1:]
        patterns.append(
            'The patient is experiencing ' + symptom
            + ' in combination with ' + ', '.join(others) + '.'
        )

    all_data.append({
        'tag': 'respond for ' + disease,
        'patterns': patterns,
        'responses': ['The patient might be sick of ' + disease],
    })

In [None]:
# Greeting intent
dict_greetings = {
    'tag': 'greetings',
    'patterns': ['Hello', 'Hi there', 'Good morning', 'Good afternoon'],
    'responses': [
        'Hi, I\'m Botmedix. How can I help you?',
        'My name is Botmedix, what can I do for you?',
    ],
}
all_data.append(dict_greetings)

In [None]:
# Farewell intent
dict_go = {
    'tag': 'going off',
    'patterns': ['Bye', 'I\'m going, see you!', 'Talk to you tomorrow'],
    'responses': ['Bye, have a nice rest of the day'],
}
all_data.append(dict_go)

In [None]:
# Fallback intent: it has no patterns, only responses
dict_nothing = {
    'tag': 'unknown',
    'patterns': [],
    'responses': ['Sorry, I can\'t understand you.', 'Could you repeat, please?'],
}
all_data.append(dict_nothing)

In [None]:
# Wrap the intents under a single top-level key in preparation for the JSON export
data = {'intents': all_data}

In [None]:
# Write the intents to a JSON file
with open('dataset.json', 'w') as out:
  json.dump(data, out)

AI – Training the intent classification model

In [None]:
# Import NLTK and download the resources this notebook relies on
import nltk

nltk.download('wordnet')    # used by WordNetLemmatizer
nltk.download('punkt')      # used by nltk.word_tokenize
nltk.download('stopwords')  # used by stopwords.words('english') in the cells below

In [None]:
from nltk.stem import WordNetLemmatizer
import numpy as np
import tensorflow as tf
import json
import pickle
import random

In [None]:
# Load the intents
import json

# The file is only read here, so open it read-only ('r+' would also demand write permission).
with open('dataset.json', 'r') as f:
  data = json.load(f)

In [None]:
# Prepare the vocabulary: tokenise every pattern and remember which tag it belongs to
lemmatizer = WordNetLemmatizer()
ignore_letters = ['?', '!', '.', ',']

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

words, classes, documents = [], [], []

for intent in data['intents']:
  tag = intent['tag']
  for pattern in intent['patterns']:
    tokens = nltk.word_tokenize(pattern)
    words += tokens
    documents.append((tokens, tag))
    # A tag is only registered once it contributes at least one pattern.
    if tag not in classes:
      classes.append(tag)

In [None]:
# Normalise the vocabulary and export it to pickle files.
# Tokens are lower-cased before lemmatising because the training bags are built from
# lower-cased tokens as well; a capitalised vocabulary entry could never be matched there.
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_letters and token.lower() not in stop_words
})
classes = sorted(set(classes))

# Context managers make sure both files are flushed and closed.
with open('words.pkl', 'wb') as words_file:
  pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
  pickle.dump(classes, classes_file)

In [None]:
# Spot check: display the intent stored at position 11500 of data['intents'].
# Raises IndexError if the list holds fewer than 11501 intents.
data['intents'][11500]

In [None]:
# Encode the training data: a bag-of-words vector per pattern and a one-hot vector per tag
training = []
output_empty = [0] * len(classes)
class_position = {tag: position for position, tag in enumerate(classes)}

for doc in documents:
  # doc is (tokens, tag); tokens are lower-cased and lemmatised before the lookup
  word_patterns = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}
  bag = [1 if word in word_patterns else 0 for word in words]

  output_row = list(output_empty)
  output_row[class_position[doc[1]]] = 1
  training.append([bag, output_row])

random.shuffle(training)

# Split the pairs directly. Converting the list of [bag, output_row] pairs into one NumPy
# array is ragged (len(words) != len(classes)) and raises ValueError on NumPy >= 1.24.
X = [bag for bag, _ in training]
y = [output_row for _, output_row in training]

In [None]:
# Number of training samples (one per pattern)
len(training)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adamax, Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Build the neural network: four hidden ReLU layers, each followed by 50 % dropout,
# and a softmax output with one unit per class
model = Sequential([
    Dense(128, input_shape=(len(X[0]),), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(y[0]), activation='softmax'),
])

In [None]:
# Build the neural network
# Instantiate the optimizer and the callbacks
# Adamax optimizer with its default settings
adamax = Adamax()
# Stop training when the 'accuracy' metric has not improved by at least min_delta
# for 20 consecutive epochs
early_stopping = EarlyStopping(monitor='accuracy', min_delta=1e-7, patience=20)

In [None]:
# Compile the network with categorical cross-entropy loss, the Adamax optimizer and accuracy as metric
model.compile(loss='categorical_crossentropy', optimizer=adamax, metrics=['accuracy'])

In [None]:
# Train the network; early stopping may end the run before the epoch limit is reached.
hist = model.fit(
    np.array(X),
    np.array(y),
    epochs=800,
    batch_size=64,
    verbose=1,
    callbacks=[early_stopping],  # Keras documents `callbacks` as a list of Callback instances
)

In [None]:
# Hyper-parameter fine-tuning
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adamax
from sklearn.model_selection import GridSearchCV
# NOTE(review): keras.wrappers.scikit_learn is not available in newer Keras/TensorFlow
# releases - confirm that the installed version still ships it.
from keras.wrappers.scikit_learn import KerasClassifier

# Define the model architecture
def create_model(num_units1=128, dropout1=0.5, num_units2=64, dropout2=0.5,
                 num_units3=32, dropout3=0.5, num_units4=64, dropout4=0.5,
                 activation='relu', optimizer='adamax', loss='categorical_crossentropy'):
    """Build and compile one candidate network for the grid search.

    The network has four hidden Dense layers (`num_units1` .. `num_units4` units, all using
    `activation`), each followed by a Dropout layer (`dropout1` .. `dropout4`), and a softmax
    output layer. Input and output sizes are taken from the notebook-level `X` and `y`.

    Returns the compiled model (compiled with `loss`, `optimizer` and the accuracy metric).
    """
    model = Sequential()
    model.add(Dense(num_units1, input_shape=(len(X[0]),), activation=activation))
    model.add(Dropout(dropout1))
    model.add(Dense(num_units2, activation=activation))
    model.add(Dropout(dropout2))
    model.add(Dense(num_units3, activation=activation))
    model.add(Dropout(dropout3))
    model.add(Dense(num_units4, activation=activation))
    model.add(Dropout(dropout4))
    model.add(Dense(len(y[0]), activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

# Wrap the model function in a KerasClassifier.
# It gets its own name so that the trained notebook-level `model` (saved in a later cell)
# is not overwritten by the wrapper.
grid_estimator = KerasClassifier(build_fn=create_model, verbose=0)

# Hyper-parameter grid to search.
# 'sparse_categorical_crossentropy' is left out: it expects integer class labels, while
# `y` is one-hot encoded, so every fit using it would fail.
# NOTE(review): the grid spans 279,936 combinations (x3 folds) - consider narrowing it
# before running this cell.
param_grid = {
    'num_units1': [64, 128, 256],
    'dropout1': [0.2, 0.3, 0.5],
    'num_units2': [32, 64, 128, 256],
    'dropout2': [0.2, 0.3, 0.5],
    'num_units3': [16, 32, 64, 128],
    'dropout3': [0.2, 0.3, 0.5],
    'num_units4': [32, 64, 128],
    'dropout4': [0.2, 0.3, 0.5],
    'activation': ['relu', 'elu', 'tanh', 'selu'],
    'optimizer': ['adamax', 'adam', 'sgd'],
    'loss' : ['categorical_crossentropy', 'binary_crossentropy']
}

# Create the GridSearchCV object and fit it on the training data
grid = GridSearchCV(estimator=grid_estimator, param_grid=param_grid, cv=3)
grid_result = grid.fit(np.array(X), np.array(y))

# Print the best hyper-parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Save the neural network for later predictions.
# Only the file path is passed: a second positional argument is interpreted by
# Model.save as the `overwrite` flag, so the training history does not belong here.
model.save('botmedix.h5')

Load pretrained model

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Load the intents; the file is only read, so open it read-only
# ('r+' would also demand write permission).
with open('dataset.json', 'r') as f:
  data = json.load(f)

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Load the vocabulary, the class labels and the trained model.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open('words.pkl', 'rb') as words_file:
  words = pickle.load(words_file)
with open('classes.pkl', 'rb') as classes_file:
  classes = pickle.load(classes_file)
botmedix = load_model('botmedix.h5')

In [None]:
# Print the layer overview of the loaded model
botmedix.summary()

In [None]:
# Evaluate the loaded model on the training data.
# X and y are Python lists; convert them to arrays exactly as is done for model.fit.
botmedix.evaluate(np.array(X), np.array(y))

In [None]:
# Tokenise the user input
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['The', 'patient', 'suffering'])
def clean_up_sentence(sentence, stop_words=stop_words):
  """Tokenise `sentence` and return its lemmatised, lower-cased tokens.

  Tokens are lower-cased first because the training bags were built from lower-cased
  tokens; without this, capitalised user input would not match the learned features.
  Tokens contained in `stop_words` (after lower-casing) are dropped.
  """
  lowered_tokens = (word.lower() for word in nltk.word_tokenize(sentence))
  return [lemmatizer.lemmatize(token) for token in lowered_tokens if token not in stop_words]

In [None]:
# Build the bag-of-words vector from the tokens
def bag_of_words(sentence):
  """Return a 0/1 NumPy vector with one entry per vocabulary word.

  An entry is 1 when the vocabulary word at that position occurs among the
  cleaned-up tokens of `sentence`, otherwise 0.
  """
  present = set(clean_up_sentence(sentence))
  return np.array([1 if word in present else 0 for word in words])

In [None]:
# Take the user input -> compute its bag of words -> predict and prepare the result for the chatbot answer
def predict_class(sentence):
  """Return the predicted intents for `sentence`, most probable first.

  Only classes whose predicted score exceeds ERROR_THRESHOLD are kept. Each result is a
  dict with the keys 'intent' (class label) and 'probability' (score as a string).
  """
  ERROR_THRESHOLD = 0.25
  scores = botmedix.predict(np.array([bag_of_words(sentence)]))[0]

  ranked = sorted(
      ([position, score] for position, score in enumerate(scores) if score > ERROR_THRESHOLD),
      key=lambda pair: pair[1],
      reverse=True,
  )
  return [{'intent': classes[position], 'probability': str(score)} for position, score in ranked]

In [None]:
# Return the chatbot's answer
def get_response(intents_list, intents_json):
  """Pick a response for the predicted intents.

  Parameters:
    intents_list: list of dicts with the keys 'intent' and 'probability' (a string),
      as produced by predict_class.
    intents_json: dict with the key 'intents', a list of dicts with 'tag' and 'responses'.

  Returns:
    A list of response strings when more than one intent was predicted, otherwise a
    single response string. Responses that do not start with the follow-up question
    prefix get the probability appended.

    When no intent was predicted, or the single predicted tag has no usable response,
    a response of the 'unknown' intent is returned (or a fixed apology if that intent
    is missing as well) instead of raising.
  """
  QUESTION_PREFIX = 'Are there further symptoms like'
  FALLBACK_RESPONSE = 'Sorry, I can\'t understand you.'

  list_of_intents = intents_json['intents']

  def _render(tag, probability):
    # First intent with a matching tag wins; None signals "nothing to say".
    match = next((intent for intent in list_of_intents if intent['tag'] == tag), None)
    if match is None or not match['responses']:
      return None
    response_ = random.choice(match['responses'])
    if probability is None or response_.startswith(QUESTION_PREFIX):
      return response_
    return response_ + ' with the probability of ' + probability

  if len(intents_list) > 1:
    rendered = (_render(item['intent'], item['probability']) for item in intents_list)
    return [text for text in rendered if text is not None]

  if intents_list:
    text = _render(intents_list[0]['intent'], intents_list[0]['probability'])
    if text is not None:
      return text

  # Nothing predicted above the threshold, or the tag is not known.
  return _render('unknown', None) or FALLBACK_RESPONSE

In [None]:
# Backend smoke test: chat with the bot on the console
print("Go! Botmedix is working")

while True:
  try:
    message = input("")
  except (EOFError, KeyboardInterrupt):
    # End of input or a kernel interrupt ends the chat cleanly instead of with a traceback.
    break
  print(message)
  ints = predict_class(message)
  res = get_response(ints, data)
  # get_response yields a list for several predicted intents and a string for a single one.
  if isinstance(res, list):
    for r in res:
      print(r)
  elif isinstance(res, str):
    print(res)