In [1]:
import numpy as np
import pandas as pd

In [2]:
# read the data into a pandas dataframe
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfnonpro = data2df('HealthProNonPro/NonPro/', 0) # NEG
dfpro = data2df('HealthProNonPro/Pro/', 1) # POS

df = pd.concat([dfpro, dfnonpro], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
1152,ans1547.txt,There are many possible reasons for coughing o...,1
839,ans307.txt,First generation antihistamines are considered...,1
1344,ans1627.txt,Though there is contraindication to taking the...,1
732,a24564.txt,Children should be at least 3 years old before...,0
584,ans260.txt,Either a tendinitis or a bursitis of the hip m...,1
865,ans1148.txt,Thank you for your question. If you are lookin...,1
1614,a7437.txt,The best diet is the one that works best for y...,0
971,a7336.txt,Gotta whear braces somtime,0
1039,a24768.txt,drink a lot(water),0
20,a31641.txt,bcoz they dont hav enough time 2 spend wid the...,0


In [3]:
dfpro.shape
dfnonpro.shape

(1787, 3)

In [4]:
dfnonpro.shape

(1787, 3)

In [5]:
df.shape

(3661, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3661 entries, 0 to 1786
Data columns (total 3 columns):
file     3661 non-null object
text     3661 non-null object
class    3661 non-null int64
dtypes: int64(1), object(2)
memory usage: 114.4+ KB


In [7]:
# Setup the data for Training/Testing. Use 20% for testing
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [8]:
# Use Spacy to preprocess the data. Explore and pick appropriate preprocessing steps.
def custom_tokenizer(doc):

    # clean up text
    tokens = [token.lemma_.lower() # lemmatize and lower-case 
                        for token in doc 
                               if (
                                    len(token) >= 2 #only preserve tokens that are 2 or more characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve specific pos
                                    #token.text in nlp.vocab and # check if token in vocab
                                    #token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    #not token.is_punct and # get rid of tokens that are punctuations
                                    #not token.is_space and # get rid of tokens that are spaces
                                    #not token.is_stop # get rid of tokens that are stop words
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)


import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtrain))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtrain = pd.Series(clean_corpus)
Xtrain.head(10)

0    carotid artery disease be cause by the build u...
1                                          baking soda
2    of course -pron- could break otherwise how do ...
3    the antibiotic should start show result in 24 ...
4    -pron- could have the male equivalent of yeast...
5    want to travel the entire world and will do -p...
6    -pron- be call histoplasmosis -pron- condition...
7     to find cure for all illness physical and mental
8    honestly hon this be not -pron- fault why shou...
9    speed be an amphetamine psychostimulant which ...
dtype: object

In [9]:
corpus1 = nlp.pipe(list(Xtest))
clean_corpus1 = [custom_tokenizer(doc) for doc in corpus1]
Xtest = pd.Series(clean_corpus1)
Xtest.head(10)

0                                     eat chicken soop
1    thank -pron- for -pron- question now that -pro...
2    tylenol be the safe pain medication -pron- can...
3    try have and orgasm before have actual sex the...
4    -pron- do not need sugar as part of -pron- dai...
5    -pron- be difficult for -pron- to say whether ...
6    the measure that would be helpful in relieve -...
7    -pron- opinion yes that the good answer for th...
8    thank -pron- for -pron- question any type of s...
9                              not possible in week ..
dtype: object

In [10]:
# L1-Normalized Term Frequency
# setup the preprocessing->model pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

transformer = Pipeline(steps = [('tv',TfidfVectorizer(binary=False, norm='l1', # tf - l1 norm
                                                      use_idf=False, smooth_idf=False, # idf - none
                                                      lowercase=True, stop_words='english', 
                                                      min_df=1, max_df=1.0, max_features=None, 
                                                      ngram_range=(1, 1))),
                               ('nb',MultinomialNB(alpha=1.0, # laplace add-one smoothing
                                                   fit_prior=True, # learn class prior-probabilities from data
                                                   class_prior=None # none - go with whatever fit-prior says
                                                  ))])

In [11]:
# setup grid search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'tv__sublinear_tf': ['False', 'True'], # regularization penalty
    'nb__alpha':[0.5,1.0,1.5,2.0,2.5,3.0] # simple imputer strategy
}
gscv = GridSearchCV(transformer, param_grid, cv=4, return_train_score=False)

In [12]:
gscv.fit(Xtrain, ytrain)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tv', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l1', preprocessor=None, smooth_idf=False,
 ...alse,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tv__sublinear_tf': ['False', 'True'], 'nb__alpha': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [13]:
# predict and evaluate combined preprocessing and model pipeline on test data
# predict and evaluate best_estimator_ on test data


ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.931787175989086
[[310  48]
 [  2 373]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.93       358
           1       0.89      0.99      0.94       375

   micro avg       0.93      0.93      0.93       733
   macro avg       0.94      0.93      0.93       733
weighted avg       0.94      0.93      0.93       733



In [14]:
TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
precision0 = TN/(TN+FN)
precision1 = TP/(FP+TP)
recall0 = TN/(TN+FP)
recall1 = TP/(FN+TP)
flscore0 = 2*(precision0 * recall0) /(precision0 + recall0)
flscore1 = 2*(precision1 * recall1) /(precision1 + recall1)

print((TP + TN) / (TP + FP + FN + TN))
print("[[",TN,FP,"]\n","[",FN,"",TP,"]]")
print("                precision             recall            f1-score\n")
print("             ",precision0,recall0,flscore0)
print("             ",precision1,recall1,flscore1)


0.931787175989086
[[ 310 48 ]
 [ 2  373 ]]
                precision             recall            f1-score

              0.9935897435897436 0.8659217877094972 0.9253731343283582
              0.8859857482185273 0.9946666666666667 0.9371859296482412
