<a href="https://colab.research.google.com/github/danielsyahputra13/ml_capstone/blob/master/notebooks/Prediksi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the project
# files used by the following cells are read from that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os

# Switch to the shared project folder so the relative paths used later
# ("assets", "model", "dataset") resolve against it.
os.chdir("/content/drive/Shareddrives/ML-Capstone/")
# Echo the working directory as the cell output to confirm the switch.
os.getcwd()

'/content/drive/Shareddrives/ML-Capstone'

In [None]:
import re
from typing import List

import spacy
from spacy.tokens import Doc
from tqdm import tqdm


class SpacyPreprocessor:
    """
    Cleans raw text with a spaCy pipeline.

    Tokens are filtered according to the flags given at construction time,
    optionally lemmatized, joined with single spaces and lower-cased.
    """

    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: Loaded spaCy pipeline to use; when omitted (or falsy),
            ``en_core_web_sm`` is loaded with its default components
        :param remove_numbers: Whether to remove number-like and currency tokens
        :param remove_stopwords: Whether to remove stopwords from text
        :param remove_special: Whether to replace every character that is not an
            ASCII letter or an apostrophe (this includes digits) with a space
        :param pos_to_remove: list of PoS tags to remove
        :param lemmatize:  Whether to apply lemmatization
        """

        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize

        # Fall back to the small English pipeline when no model is supplied.
        self.model = spacy_model or spacy.load("en_core_web_sm")

    @staticmethod
    def download_spacy_model(model="en_core_web_sm"):
        """
        Download a spaCy model package through ``spacy.cli.download``
        :param model: Name of the spaCy model to download
        """
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="en_core_web_sm"):
        """
        Load a spaCy model with the ``ner`` and ``parser`` components disabled
        :param model: Name of the spaCy model to load
        :return: the loaded spaCy pipeline
        """
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        return [token.text for token in self.model(text)]

    def preprocess_text(self, text) -> str:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: str, clean text
        """
        return self.__clean(self.model(text))

    def preprocess_text_list(self, texts: List[str]) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of text.
        Leverages spaCy's `pipe` for faster batch processing and wraps the
        iteration in a tqdm progress bar.
        :param texts: List of texts to clean
        :return: List of clean texts
        """
        # `texts` used to be declared as `texts=List[str]`, which made the typing
        # object the default value; it is now a proper annotation.
        return [self.__clean(doc) for doc in tqdm(self.model.pipe(texts))]

    def __clean(self, doc: "Doc") -> str:
        """
        Filter the tokens of a processed document and rebuild a clean string
        :param doc: spaCy ``Doc`` (any iterable of tokens works)
        :return: str, lower-cased text with single spaces between words
        """
        tokens = doc

        # POS Tags removal
        if self._pos_to_remove:
            tokens = [token for token in tokens if token.pos_ not in self._pos_to_remove]

        # Remove Numbers
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove Stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]

        # Remove punctuation, whitespace, quotes, brackets and empty tokens, plus
        # the literal token "amp" (presumably a leftover of the "&amp;" entity).
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
            and token.text.strip() != ""
            and token.text != "amp"
        ]

        # Lemmatize
        text = " ".join(
            token.lemma_ if self._lemmatize else token.text for token in tokens
        )

        if self._remove_special:
            # Replace everything that is not an ASCII letter or apostrophe with a space
            text = re.sub(r"[^a-zA-Z\']", " ", text)
        # Remove non-ASCII characters
        text = re.sub(r"[^\x00-\x7F]+", "", text)
        # Collapse whitespace runs to ONE space. The previous code replaced them
        # with "", which glued neighbouring words together whenever a removed
        # character left more than one space behind (e.g. "took 9 pills" became
        # "tookpills"). Also trim the edges.
        # NOTE(review): a model trained on text cleaned by the old behaviour may
        # see slightly different tokens after this fix — confirm with the
        # training pipeline.
        text = re.sub(r"\s+", " ", text).strip()

        return text.lower()

In [None]:
# Load the default model through load_model(), which disables the "ner" and
# "parser" components.
spacy_model = SpacyPreprocessor.load_model()
# Cleaning configuration for prediction: lemmatize, drop number-like tokens and
# stopwords; remove_special keeps its default (True).
preprocessor = SpacyPreprocessor(spacy_model=spacy_model, lemmatize=True, remove_numbers=True, remove_stopwords=True)

In [None]:
# Clean one sample review; `pred` is the cleaned string fed to the tokenizer below.
# NOTE(review): the sample still contains the HTML entity "&#039;" and is passed
# in without unescaping — presumably this mirrors the raw training data; confirm.
pred = preprocessor.preprocess_text("""I have been on this birth control for one cycle. After reading some of the reviews on this type and similar birth controls I was a bit apprehensive to start. Im giving this birth control a 9 out of 10 as I have not been on it long enough for a 10. So far I love this birth control! My side effects have been so minimal its like Im not even on birth control! I have experienced mild headaches here and there and some nausea but other than that ive been feeling great! I got my period on cue on the third day of the inactive pills and I had no idea it was coming because I had zero pms! My period was very light and I barely had any cramping! I had unprotected sex the first month and obviously didn&#039;t get pregnant so I&#039;m very pleased! Highly recommend""")

## Load Tokenizer


In [None]:
os.listdir("assets")

['tokenizer.json',
 'encoder.npy',
 'tokenizer_with_counts_100.json',
 'encoder_with_count_100.npy',
 'medicine.pkl',
 'tokenizer_counts_1000.json',
 'encoder_counts_1000.npy']

In [None]:
import tensorflow as tf
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Rebuild the Keras tokenizer from its saved JSON file.
with open('assets/tokenizer_with_counts_100.json') as f:
    # The decoded value is handed straight to tokenizer_from_json.
    # presumably the file stores the string produced by Tokenizer.to_json() — TODO confirm
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

In [None]:
# Text-encoding settings. Only max_length, trunc_type and padding_type are used
# by the visible cells (in pad_sequences); vocab_size, embedding_dim and oov_tok
# are not referenced here — presumably kept to mirror the training notebook.
vocab_size = 5000
embedding_dim = 100
max_length = 100  # every sequence is padded/truncated to this many token ids
trunc_type = 'post'  # cut overly long sequences at the end
padding_type = 'post'  # append zeros after the tokens
oov_tok = '<OOV>'

In [None]:
# Convert the cleaned review into token ids; the input is wrapped in a list, so
# this is a batch of one text.
pred_sequences = tokenizer.texts_to_sequences([pred])
# Pad/truncate at the end to max_length so the batch has a fixed width.
pred_padded = pad_sequences(pred_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
print(len(pred_sequences))
print(pred_padded.shape)

1
(1, 100)


In [None]:
pred_padded

array([[  53,   35,  280,  114,  120,  342,  834,   53,   35,  142, 3108,
           9,    1,   50,   53,   35,   47,   73,   82,   53,   35,   10,
         540,   18,    1,   53,   35,   30,  194,   62,   79,   44,    5,
          42,   15,   21,    1,    2, 2665,   20,  653,   58,  779,   21,
         105,  411,  275,    1,   77,    4, 1155,   41,  157,   11,  813,
         295,   70,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

## Load model

In [None]:
os.listdir("model")

['model_daniel.h5',
 'model_mardi1.h5',
 'model_daniel_ver2.h5',
 'model_daniel_ver3.h5',
 'model_daniel_GRU.h5',
 'model_daniel_GRU_ver2.h5',
 'model_laras.h5',
 'GRU_dense.h5',
 'GRU_dense_with_count_100.h5',
 'sentiment_model.h5']

In [None]:
model = tf.keras.models.load_model('model/GRU_dense_with_count_100.h5')

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         500000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        63744     
 l)                                                              
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 185)               23865     
                                                                 
Total params: 633,177
Trainable params: 633,177
Non-trainable params: 0
__________________________________________________

In [None]:
# The batch holds a single review, so keep only the first row of the prediction:
# one score per class.
result = model.predict(pred_padded)[0]

In [None]:
result

array([9.57393886e-14, 1.52874261e-03, 3.78343393e-05, 2.42825089e-17,
       1.32882042e-19, 1.46924614e-16, 4.54121468e-15, 4.55215424e-17,
       1.33681938e-13, 4.22728390e-20, 1.02434875e-11, 2.90451121e-14,
       9.00340685e-18, 2.06006256e-14, 3.39943962e-09, 3.69539555e-09,
       4.90933738e-10, 5.42407739e-17, 3.87635989e-14, 9.16492793e-13,
       7.06215626e-21, 1.18189204e-17, 2.20384515e-13, 3.32279018e-12,
       3.94589339e-17, 3.03995956e-12, 4.32627718e-12, 8.32797447e-16,
       2.00583278e-12, 4.11488275e-17, 9.41923275e-11, 2.14502527e-10,
       2.21158514e-09, 9.94843960e-01, 1.81847993e-19, 3.87775966e-13,
       2.04929907e-16, 1.33452993e-12, 1.02256583e-16, 5.69150660e-15,
       2.68719317e-17, 3.73559875e-24, 8.64731531e-15, 3.12079984e-12,
       1.14662024e-20, 2.02695846e-12, 5.38853684e-12, 1.09035256e-16,
       4.14468165e-28, 6.48392527e-13, 3.12967832e-14, 4.22328290e-14,
       2.34001351e-18, 6.97482570e-20, 5.63866326e-17, 7.41066809e-17,
      

In [None]:
# Sort class indices by score, flip to descending order and keep the best three.
idx = result.argsort()[::-1][:3]
print("Penyakit ke -", idx)

Penyakit ke - [ 33 105   1]


In [None]:
result[idx[0]]

0.99484396

## Load encoder

In [None]:
os.listdir("assets")

['tokenizer.json',
 'encoder.npy',
 'tokenizer_with_counts_100.json',
 'encoder_with_count_100.npy',
 'medicine.pkl',
 'tokenizer_counts_1000.json',
 'encoder_counts_1000.npy']

In [None]:
# Restore the label encoder by assigning the saved class array directly instead
# of refitting it.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# SECURITY NOTE: allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load this file from a trusted location.
encoder.classes_ = np.load('assets/encoder_with_count_100.npy', allow_pickle=True)

In [None]:
encoder.inverse_transform(idx)

array(['Birth Control', 'Menstrual Disorders',
       'Abnormal Uterine Bleeding'], dtype=object)

In [None]:
penyakits = encoder.inverse_transform(idx)

In [None]:
penyakits

array(['Birth Control', 'Menstrual Disorders',
       'Abnormal Uterine Bleeding'], dtype=object)

Deskripsi

In [None]:
os.listdir("dataset")

['drugsComTrain_raw.tsv',
 'drugsComTest_raw.tsv',
 'train.csv',
 'test.csv',
 'data.csv',
 'train1.csv',
 'test1.csv',
 'train_cleaned.csv',
 'test_cleaned.csv',
 'drugsComTest_raw (1).gsheet',
 'drugsComTest_raw.gsheet',
 'condition.csv',
 'inquirerbasic.csv',
 'condition.gsheet',
 'condition.xlsx']

In [None]:
# Load the condition table and keep only the condition name and its description
# ("Deskripsi") columns.
description = pd.read_excel("dataset/condition.xlsx")[["Condition", "Deskripsi"]]

In [None]:
description.to_json(orient="records")

'[{"Condition":"ADHD","Deskripsi":"ADHD atau attention deficit hyperactivity disorder adalah gangguan mental yang menyebabkan anak sulit memusatkan perhatian, serta memiliki perilaku impulsif dan hiperaktif, Kondisi ini dapat berdampak pada prestasi anak di sekolah."},{"Condition":"Birth Control","Deskripsi":"Pengaturan kelahiran atau bisa disebut juga kontrasepsi adalah upaya yang diambil dalam mengatur dan mengontrol angka kelahiran bayi dalam masyarakat. Dalam hal ini kontrasepsi ditujukan untuk membentuk keluarga yang bahagia dan sejahtera dengan terpenuhinya semua kebutuhan kesehatan anak-anak dan anggota keluarga lainnya. "},{"Condition":"Opiate Dependence","Deskripsi":"Opioid atau opiat adalah salah satu jenis narkotika yang bersifat depresan, yang berfungsi mengurangi aktifitas fungsional tubuh. Jenis ini membuat pemakainya merasa tenang, pendiam, dan bahkan membuat tidur dan tidak sadarkan diri. Meski disebut sebagai narkotik, obat ini aman digunakan selama dengan resep dokter

In [None]:
# Sketch of the intended prediction response (one entry per predicted condition):
# {
#     "result": [
#                   {"name": "Birth Control",
#                    "Description": "",
#                    "Medicine": [XXX, XXX]},
#                   {"name": "Menstrual Disorders",
#                     "Description": ""},
#                   {"name": "Abnormal Uterine Bleeding",
#                     "Description": ""}
#         ]
# }

In [None]:
description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Condition  185 non-null    object
 1   Deskripsi  180 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB


In [None]:
# Replace missing values in place with the placeholder text "Belum ada deskripsi"
# (Indonesian for "no description yet").
description.fillna("Belum ada deskripsi", inplace=True)

In [None]:
# description.to_pickle("dataset/description.pkl")

In [None]:
# Convert the predicted condition names to a plain list. list() accepts both the
# NumPy array returned by inverse_transform and an existing list, so re-running
# this cell no longer raises AttributeError the way `.tolist()` did.
penyakits = list(penyakits)

AttributeError: ignored

In [None]:
description[description['Condition'].isin(penyakits)]['Deskripsi']

1     Pengaturan kelahiran atau bisa disebut juga ko...
57    Gangguan menstruasi adalah kelainan yang terja...
61    Perdarahan uterus abnormal adalah adanya perda...
Name: Deskripsi, dtype: object

In [None]:
# Print the description of each predicted condition, in prediction order.
for penyakit in penyakits:
    matches = description.loc[description["Condition"] == penyakit, "Deskripsi"]
    # A predicted label may have no row in the table; print the same placeholder
    # used for missing descriptions instead of failing with IndexError.
    print(matches.iloc[0] if not matches.empty else "Belum ada deskripsi")

Pengaturan kelahiran atau bisa disebut juga kontrasepsi adalah upaya yang diambil dalam mengatur dan mengontrol angka kelahiran bayi dalam masyarakat. Dalam hal ini kontrasepsi ditujukan untuk membentuk keluarga yang bahagia dan sejahtera dengan terpenuhinya semua kebutuhan kesehatan anak-anak dan anggota keluarga lainnya. 
Gangguan menstruasi adalah kelainan yang terjadi pada siklus menstruasi. Gangguan menstruasi bisa berupa perdarahan menstruasi yang terlalu banyak atau terlalu sedikit, siklus menstruasi tidak teratur, menstruasi terjadi lebih dari 7 hari, tidak menstruasi lebih dari 3 bulan, atau bahkan tidak pernah haid sama sekali. Gangguan menstruasi juga bisa disertai dengan keluhan berat, seperti nyeri dan kram parah, hingga depresi menjelang menstruasi.
Perdarahan uterus abnormal adalah adanya perdarahan hebat atau perdarahan yang tidak biasanya dari uterus keluar melalui Miss V. Perdarahan ini dapat terjadi kapan saja saat siklus menstruasi atau di luar siklus menstruasi.


## TEST

In [None]:
!pip install translators

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import translators as ts


Using United States server backend.


In [None]:
ts.google("saya pusing mual-mual muntah", to_language='en')

"I'm dizzy nausea and vomiting"