In [0]:
'''set the environment'''
#import needed packages
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
import fnmatch

#data path
# NOTE(review): `path` is left as an empty string and is not referenced by any other visible cell.
path = ""


#create and set database
# The DROP/CREATE statements are commented out, so only the USE statement below is executed.
# spark.sql(f"DROP DATABASE IF EXISTS {database} CASCADE")
# spark.sql(f"CREATE DATABASE {database}")
# NOTE(review): neither `spark` nor `database` is assigned in the visible cells -- presumably
# both are provided by the notebook runtime or an earlier setup step; confirm before running.
spark.sql(f"USE {database}")

In [0]:
!pip install gensim

In [0]:
import numpy as np
import pandas as pd
import itertools

import nltk
# Fetch the NLTK 'stopwords' corpus; it is read later via stopwords.words("english").
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#import classifiers from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#import different metrics to evaluate the classifiers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn import metrics

#matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 

In [0]:
# Read the 'textTable' table through the Spark session and convert it to a pandas DataFrame.
# NOTE(review): `spark` is not created in the visible cells -- presumably provided by the notebook runtime.
df = spark.table('textTable').toPandas()
print('textTable shape:', df.shape)
# Last expression of the cell: shows the first rows of the frame.
df.head()

In [0]:
#Keep only the rows whose label is one of the two values listed in `shortlist`.
shortlist = [1, 0]
#Display names for the classes; passed as tick labels to plot_confusion_matrix further down.
#NOTE(review): assumes label 0 means 'no contraindication' and label 1 means 'contraindication' -- confirm against the data.
classes = ['no contraindication', 'contraindication']
df_subset = df[df['label'].isin(shortlist)]
#Last expression of the cell: shows (rows, columns) of the filtered frame.
df_subset.shape

In [0]:
#strip_handles=True removes @-handles from the text before tokens are produced;
#preserve_case=False converts everything to lowercase.
tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)
#English stopword list from NLTK, stored as a set so membership tests are fast.
mystopwords = set(stopwords.words("english"))

#Tokenize every text, then drop stopwords and purely numeric tokens.
#Punctuation and emoticon symbols are deliberately kept.
def preprocess_corpus(texts):
    """Return one list of cleaned tokens per entry of `texts`.

    Each text is tokenized with the module-level `tweeter`; tokens found in the
    module-level `mystopwords` set, or consisting only of digits, are discarded.
    """
    cleaned_docs = []
    for content in texts:
        kept_tokens = []
        for token in tweeter.tokenize(content):
            #Skip stopwords and tokens made up only of digits.
            if token in mystopwords or token.isdigit():
                continue
            kept_tokens.append(token)
        cleaned_docs.append(kept_tokens)
    return cleaned_docs

#df_subset contains only the rows whose label is in `shortlist`.
#NOTE(review): assumes every value in df_subset['text'] is a string -- confirm there are no nulls.
mydata = preprocess_corpus(df_subset['text'])
mycats = df_subset['label']
#Sanity check: one token list per label, so both lengths should match.
print(len(mydata), len(mycats))

In [0]:
#Hold out a test split; random_state fixes the shuffle so the split is repeatable.
train_data, test_data, train_cats, test_cats = train_test_split(
    mydata,
    mycats,
    random_state=1234,
)

#Wrap each training document in the TaggedDocument format expected by Doc2Vec,
#using its position in the training list (as a string) for the tag.
train_doc2vec = [
    TaggedDocument(words=tokens, tags=[str(doc_id)])
    for doc_id, tokens in enumerate(train_data)
]

#Learn document representations from the training split only (never the test split).
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm=1, epochs=100)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

#Persist the trained model so the next cell can reload it from disk.
model.save("d2v.model")
print("Model Saved")

In [0]:
#Infer the feature representation for training and test data using the trained model
model= Doc2Vec.load("d2v.model")
#Run 50 inference passes per document to get a more stable representation.
#gensim 4.0 removed the `steps` keyword of infer_vector in favour of `epochs`,
#so `epochs` is used here to keep the cell working with current gensim releases.
train_vectors =  [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]
test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]

In [0]:
#Evaluate the classifier using various measures

# Function to plot confusion matrix. 
# Ref:http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix values. Rows are labelled 'True label' and columns
        'Predicted label'. Inside this function the name shadows the
        `matplotlib.cm` module imported at the top of the notebook.
    classes : sequence
        Tick labels used for both axes.
    normalize : bool, default False
        If True, each row is divided by its row total before plotting. Rows
        whose total is zero are left as zeros instead of producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used by `imshow`.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure.
    """
    # Accept nested lists as well as ndarrays; ndarrays pass through unchanged.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero and fill the row (and the
        # colour threshold below) with NaN; divide such rows by 1 instead.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Counts are printed as integers, normalized values with two decimals.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so it stays readable on dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label',fontsize=15)
    plt.xlabel('Predicted label',fontsize=15)

In [0]:
#logistic regression
#class_weight="balanced" is used because the classes are not balanced.
lrClass = LogisticRegression(class_weight="balanced")
lrClass.fit(train_vectors, train_cats)

preds = lrClass.predict(test_vectors)

# print classification report and accuracy:
print(classification_report(test_cats, preds))
print("Accuracy: ", accuracy_score(test_cats, preds))

# print the confusion matrix
cnf_matrix = confusion_matrix(test_cats, preds)
plt.figure(figsize=(8,6))
plot_confusion_matrix(cnf_matrix, classes=classes, #normalize=True,
                      title='Confusion matrix with all features')

# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:
#Further information: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# Column 1 of predict_proba is the probability of the second entry of lrClass.classes_
# (presumably label 1, i.e. the positive class -- confirm against the label encoding).
pred_prob = lrClass.predict_proba(test_vectors)[:, 1]
# The printed label previously read "ROC_AOC_Score"; the metric is ROC AUC.
print("ROC_AUC_Score: ", roc_auc_score(test_cats, pred_prob))

In [0]:
#naive bayes
#Gaussian naive Bayes fitted on the inferred Doc2Vec vectors.
nbClass = GaussianNB()
nbClass.fit(train_vectors, train_cats)

preds = nbClass.predict(test_vectors)

# print classification report and accuracy:
print(classification_report(test_cats, preds))
print("Accuracy: ", accuracy_score(test_cats, preds))

# print the confusion matrix
cnf_matrix = confusion_matrix(test_cats, preds)
plt.figure(figsize=(8,6))
plot_confusion_matrix(cnf_matrix, classes=classes, #normalize=True,
                      title='Confusion matrix with all features')

# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:
#Further information: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# Column 1 of predict_proba is the probability of the second entry of nbClass.classes_
# (presumably label 1, i.e. the positive class -- confirm against the label encoding).
pred_prob = nbClass.predict_proba(test_vectors)[:, 1]
# The printed label previously read "ROC_AOC_Score"; the metric is ROC AUC.
print("ROC_AUC_Score: ", roc_auc_score(test_cats, pred_prob))

In [0]:
#support vector machine
#probability=True enables predict_proba; scikit-learn fits it with an internal, randomly
#shuffled cross-validation, so random_state is fixed (same seed as the train/test split)
#to make the reported AUC repeatable between runs.
svcClass = SVC(class_weight='balanced', probability=True, random_state=1234)
svcClass.fit(train_vectors, train_cats)

preds = svcClass.predict(test_vectors)

# print classification report and accuracy:
print(classification_report(test_cats, preds))
print("Accuracy: ", accuracy_score(test_cats, preds))


# print the confusion matrix
cnf_matrix = confusion_matrix(test_cats, preds)
plt.figure(figsize=(8,6))
plot_confusion_matrix(cnf_matrix, classes=classes, #normalize=True,
                      title='Confusion matrix with all features')

# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:
#Further information: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# Column 1 of predict_proba is the probability of the second entry of svcClass.classes_
# (presumably label 1, i.e. the positive class -- confirm against the label encoding).
pred_prob = svcClass.predict_proba(test_vectors)[:, 1]
# The printed label previously read "ROC_AOC_Score"; the metric is ROC AUC.
print("ROC_AUC_Score: ", roc_auc_score(test_cats, pred_prob))

In [0]:
# print('the type of preds:', type(preds))

# value = 1
# correct_count = 0
# incorrect_count = 0
# for i in range(len(preds)):
#   if preds[i] == value:
#     print(f"index:{i}, pred:{preds[i]}, true:{test_cats.iloc[i]}")
#     if preds[i] == test_cats.iloc[i]:
#        correct_count = correct_count + 1 
#     if preds[i] != test_cats.iloc[i]:
#        incorrect_count = incorrect_count + 1 

# print(f"number of predicted {value}:{len(preds[preds == value])}")
# print(f"number of correct {value}:{correct_count}")
# print(f"number of incorrect {value}:{incorrect_count}")