In [1]:
import joblib
import pandas as pd
import glob

# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

from helpers.preprocessing import PreProcessing
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier

## Proprocessing and Save Vectorizer

In [None]:
data = pd.read_excel("data/symp.xlsx")
symptoms = data['q_11'].to_numpy()

processing = PreProcessing()
symptoms_preprocess = processing.transform(symptoms)
symptoms_preprocess

In [9]:
vectorizer = CountVectorizer(
    max_df=len(symptoms_preprocess), 
    min_df=0,
    decode_error='ignore',
    binary=True
)

data_vectorizer = pd.DataFrame(
    vectorizer.fit_transform(symptoms_preprocess).toarray(), 
    columns=vectorizer.get_feature_names_out()
)

data_vectorizer['disorder'] =  pd.read_excel('data/data_klasifikasi.xlsx')['disorder']

data_vectorizer.to_csv('data/data_vectorizer.csv', index=False)
joblib.dump(vectorizer, 'modeling/vectorizer.save') 

['modeling/vectorizer.save']

## Make Simple Model

In [2]:
data = pd.read_csv('data/data_vectorizer.csv')

target_column = 'disorder'
X = data.drop(target_column, axis=1).to_numpy()

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[target_column])

In [3]:
files = glob.glob("report/*")
report = pd.concat([pd.read_csv(file, index_col=0) for file in files]).reset_index(drop=True)
report.columns = report.columns.str.replace('_test', '')

selected_columns = ['clf_name', 'params', 'mean_recall', 'mean_precision', 'mean_f1-score', 'mean_accuracy']
best_recall = report.sort_values(['mean_recall', 'mean_f1-score'], ascending=[False, False])[selected_columns].head().copy()
best_f1 = report.sort_values('mean_f1-score', ascending=False)[selected_columns].head().copy()
best_accuracy = report.sort_values('mean_accuracy', ascending=False)[selected_columns].head().copy()
best_precision = report.sort_values('mean_precision', ascending=False)[selected_columns].head().copy()

best_params_f1 = eval(best_f1.iloc[0, 1])
best_params_recall = eval(best_recall.iloc[1, 1])
best_params_precision = eval(best_precision.iloc[0, 1])
best_params_accuracy = eval(best_accuracy.iloc[0, 1])

clf1 = GradientBoostingClassifier(**best_params_f1)
clf2 = GradientBoostingClassifier(**best_params_recall)
clf3 = GradientBoostingClassifier(**best_params_precision)
clf4 = GradientBoostingClassifier(**best_params_accuracy)

model = VotingClassifier(estimators=[
        ('f1_score', clf1), ('recall', clf2), ('precision', clf3), ('accuracy', clf4)], voting='hard')
    
model.fit(X, y)

VotingClassifier(estimators=[('f1_score',
                              GradientBoostingClassifier(learning_rate=0.0801,
                                                         max_depth=2,
                                                         min_samples_leaf=3,
                                                         min_samples_split=8)),
                             ('recall',
                              GradientBoostingClassifier(learning_rate=0.1401,
                                                         max_depth=2,
                                                         min_samples_leaf=3,
                                                         min_samples_split=8,
                                                         n_estimators=400)),
                             ('precision',
                              GradientBoostingClassifier(learning_rate=0.0201,
                                                         max_depth=2,
                                       

In [4]:
# save model dan encoder
joblib.dump(label_encoder, "modeling/label_encoder.save")
joblib.dump(model, 'modeling/decision_tree_model.pkl')

['modeling/decision_tree_model.pkl']

## Try to Predict New Data

In [6]:
symptoms = "I feel very anxious, sometimes I want to suicide"

vectorizer = joblib.load('modeling/vectorizer.save')
label_encoder = joblib.load('modeling/label_encoder.save')
model = joblib.load('modeling/decision_tree_model.pkl')

preprocessing = PreProcessing()
symptoms_preprocess = preprocessing.transform(symptoms)

X = vectorizer.transform(symptoms_preprocess)
prediction = model.predict(X)
prediction = label_encoder.inverse_transform(prediction)[0]
prediction

b'Skipping line 8: expected 46 fields, saw 56\n'
b'Skipping line 8: expected 25 fields, saw 29\n'


'Bipolar'

In [8]:
data = pd.read_csv('data/data_vectorizer.csv')
data.columns, len(data.columns)

(Index(['addict', 'alcohol', 'anger', 'anxious', 'appetite', 'balance',
        'breathe', 'bulimia', 'communicate', 'concentrate', 'confuse', 'cry',
        'delusion', 'depress', 'digestive', 'distrust', 'dizzy', 'drug', 'eat',
        'echolalia', 'emotion', 'empty', 'excess', 'faint', 'fluctuation',
        'forget', 'guilt', 'harm', 'hatred', 'headache', 'heartbeat',
        'hopeless', 'impulsive', 'insomnia', 'lazy', 'libido', 'lonely', 'mood',
        'nausea', 'numb', 'obsessive', 'overreact', 'panic', 'paranoia',
        'respond', 'sad', 'scare', 'stress', 'suicide', 'sweat', 'tire',
        'trauma', 'tremble', 'violence', 'weight', 'withdrawal', 'worry',
        'disorder'],
       dtype='object'),
 58)

In [9]:
data2 = pd.read_excel('data/data_klasifikasi.xlsx')
data2.columns, len(data2.columns)

(Index(['addict', 'alcohol', 'anger', 'anxious', 'appetite', 'balance',
        'breathe', 'bulimia', 'communicate', 'concentrate', 'confuse', 'cry',
        'delusion', 'depress', 'digestive', 'distrust', 'dizzy', 'drug', 'eat',
        'echolalia', 'emotion', 'empty', 'excess', 'faint', 'fluctuation',
        'forget', 'guilt', 'harm', 'hatred', 'headache', 'heartbeat',
        'hopeless', 'impulsive', 'insomnia', 'lazy', 'libido', 'lonely', 'mood',
        'nausea', 'numb', 'obsessive', 'overreact', 'panic', 'paranoia',
        'respond', 'sad', 'scare', 'stress', 'suicide', 'sweat', 'tire',
        'trauma', 'tremble', 'violence', 'weight', 'withdrawal', 'worry',
        'disorder'],
       dtype='object'),
 58)