# 3_classifier_naive_bayes.ipynb
Builds the disease classification model using Reddit data based on Naive Bayes.

The implementation is based on:
- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
- https://github.com/scikit-learn/scikit-learn/tree/master/doc/tutorial/text_analytics

# Load the dataset

In [1]:
import pandas as pd
import numpy as np
import os
from pprint import pprint

def get_target_int(subreddit):
    if subreddit == 'cancer':
        return int(0)
    if subreddit == 'COVID19positive':
        return int(1)
    if subreddit == 'diabetes':
        return int(2)
    if subreddit == 'eczema':
        return int(3)
    if subreddit == 'eyetriage':
        return int(4)
    if subreddit == 'GERD':
        return int(5)
    if subreddit == 'STD':
        return int(6)

    return int(7)

input_data_folder = 'data_combination'
combined_data_file = input_data_folder + '/' + 'Combined.csv'

df = pd.read_csv(combined_data_file)
print(df.shape)

X = (df['Title'] + ' ' + df['Body']).values
X= X.astype(str)

df['Target'] = df.apply(lambda row: get_target_int(row['Subreddit']), axis=1)
pprint(df['Target'])

(43771, 17)
0        0
1        0
2        0
3        0
4        0
        ..
43766    6
43767    6
43768    6
43769    6
43770    6
Name: Target, Length: 43771, dtype: int64


# Extract features from text files
- Use backs of words representation
- Tokenize text
- Convert to frequencies with tf (Term Frequencies) and tf-idf (Term Frequency times Inverse Document Frequenccy"

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

(43771, 55411)

In [3]:
count_vect.vocabulary_.get(u'tooth')

50325

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(43771, 55411)

In [5]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(43771, 55411)

# Train the Naive Bayes classifier

In [6]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, df['Target'])

In [7]:
docs_new = ['I think COVID-19 is likely. Short of breath', 'I need cancer treatment']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print(doc + '->' + str(category))

I think COVID-19 is likely. Short of breath->1
I need cancer treatment->0


In [8]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [9]:
text_clf.fit(X, df['Target'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [10]:
predicted = text_clf.predict(X)
np.mean(predicted == df['Target'])

0.8837586529894222

In [11]:
target_names = ['cancer', 'COVID19positive', 'diabetes', 'eczema', 'eyetriage', 'GERD', 'STD']
from sklearn import metrics
print(metrics.classification_report(df['Target'], predicted,
    target_names=target_names, digits=4))

                 precision    recall  f1-score   support

         cancer     0.9181    0.8825    0.8999      6085
COVID19positive     0.9902    0.7003    0.8204      4921
       diabetes     0.9705    0.8880    0.9274      6695
         eczema     0.9242    0.9353    0.9297      6983
      eyetriage     0.9940    0.3170    0.4807      2082
           GERD     0.7422    0.9963    0.8507     10657
            STD     0.9737    0.9630    0.9683      6348

       accuracy                         0.8838     43771
      macro avg     0.9304    0.8118    0.8396     43771
   weighted avg     0.9040    0.8838    0.8779     43771



In [12]:
metrics.confusion_matrix(df['Target'], predicted)

array([[ 5370,     6,    32,    55,     1,   578,    43],
       [  193,  3446,    15,    44,     1,  1182,    40],
       [   99,    17,  5945,    56,     2,   561,    15],
       [   43,     3,    23,  6531,     0,   368,    15],
       [  111,     1,    87,   321,   660,   856,    46],
       [   10,     2,    11,    10,     0, 10618,     6],
       [   23,     5,    13,    50,     0,   144,  6113]])