In [0]:
'''set the environment'''
#import needed packages
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
import fnmatch

#data path
# NOTE(review): left as an empty string here and not referenced by any cell
# visible in this notebook -- confirm whether it is still needed.
path = ""


#create and set database
# The two commented-out statements below drop and recreate the database;
# uncomment them only when a clean rebuild is intended (DROP ... CASCADE is destructive).
# spark.sql(f"DROP DATABASE IF EXISTS {database} CASCADE")
# spark.sql(f"CREATE DATABASE {database}")
# NOTE(review): `database` is not assigned anywhere in the visible notebook;
# it must already exist in the session or this line raises NameError -- confirm its source.
spark.sql(f"USE {database}")

In [0]:
# Install gensim, which provides the Doc2Vec model imported in the next cell.
# NOTE(review): the version is not pinned, so the gensim API in use depends on
# install time; consider pinning, and consider `%pip` so the install targets the
# kernel's environment -- confirm how the hosting platform handles `%pip` first.
!pip install gensim

In [0]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [0]:
# Read the Spark table 'textTable' from the current database and convert it
# to a pandas DataFrame; later cells work on this pandas copy.
df = spark.table('textTable').toPandas()
print('textTable shape:', df.shape)
# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [0]:
#separate gold from the rest.
#"gold" rows are those whose label is one of the two shortlist classes;
#every other row goes to "rest".
shortlist = ['contraindication', 'no contraindication']

#compute the membership mask once and reuse it for both halves
is_gold = df['label'].isin(shortlist)
gold = df.loc[is_gold]
rest = df.loc[~is_gold]

print('gold:', gold.shape)
print('rest:', rest.shape)
print('the type:', type(gold))

In [0]:
#Tokenizer configuration:
#  strip_handles=True  -> drops twitter-style @handles from the text
#  preserve_case=False -> lowercases the tokens
tweeter = TweetTokenizer(strip_handles=True, preserve_case=False)
#English stopword list, held as a set for fast membership tests.
mystopwords = set(stopwords.words("english"))


def preprocess_corpus(texts):
    """Tokenize each text and drop stopwords and all-digit tokens.

    Punctuation and emoticon tokens are deliberately kept.

    Args:
        texts: iterable of strings, one document per element.

    Returns:
        A list with one entry per input text, each entry being the list of
        tokens that survived the filtering, in their original order.
    """
    cleaned_corpus = []
    for content in texts:
        kept_tokens = []
        for token in tweeter.tokenize(content):
            #skip stopwords and purely numeric tokens
            if token in mystopwords or token.isdigit():
                continue
            kept_tokens.append(token)
        cleaned_corpus.append(kept_tokens)
    return cleaned_corpus

#Build the train and test sets.
#Train comes from the gold rows, test from all remaining rows; the content is
#the preprocessed token lists and the cats are the matching label columns.
train_content, train_cats = preprocess_corpus(gold['text']), gold['label']
test_content, test_cats = preprocess_corpus(rest['text']), rest['label']

#Sanity check: content and labels must have the same length within each set.
print("length of train data:", len(train_content), len(train_cats))
print("length of test data:", len(test_content), len(test_cats))
print('the type of the content:', type(test_content))
print('the type of the cats:', type(test_cats))

In [0]:
#Spot-check one row of the non-gold data: print the raw row, its label and
#its preprocessed token list.
#The same position i is valid for all three because test_content and test_cats
#were both derived from `rest` in row order.
#NOTE(review): i = 10 assumes `rest` has at least 11 rows; otherwise .iloc raises IndexError.
i = 10
print(rest.iloc[i])
print(test_cats.iloc[i])
print(test_content[i])

In [0]:
#Wrap every training document in the doc2vec input format: the token list
#plus a single tag, which is the document's position rendered as a string.
train_doc2vec = [
    TaggedDocument(words=tokens, tags=[str(doc_id)])
    for doc_id, tokens in enumerate(train_content)
]

#Train a doc2vec model to learn document representations. Use only training data!!
#NOTE(review): no seed is set, so the learned vectors differ from run to run.
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm=1, epochs=100)
model.build_vocab(train_doc2vec)
model.train(
    train_doc2vec,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

#Persist the trained model so the inference cell can reload it.
model.save("d2v.model")
print("Model Saved")

In [0]:
#Infer the feature representation for training and test data using the trained model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

#Reload the model saved by the training cell.
model = Doc2Vec.load("d2v.model")

#Infer with multiple passes to get a more stable representation.
#`epochs` is used instead of `steps`: gensim 4.x removed the `steps` keyword
#(it raises TypeError), while `epochs` is accepted by both gensim 3.8 and 4.x.
train_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_content]
test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_content]

#Use any regular classifier like logistic regression.
#class_weight="balanced" because the classes are not balanced.
myclass = LogisticRegression(class_weight="balanced")
myclass.fit(train_vectors, train_cats)

#Predict a shortlist label for every row of the non-gold data.
preds = myclass.predict(test_vectors)

#The report below stays disabled: test_cats holds labels that are outside the
#shortlist by construction, so they cannot match the predicted classes.
# print(classification_report(test_cats, preds))


In [0]:
print('the type of preds:', type(preds))

#Report every non-gold row that the classifier assigned to `value`.
value = 'contraindication'

#Positions (within preds / rest) whose prediction equals `value`.
matching_positions = [pos for pos, label in enumerate(preds) if label == value]
print(f"number of predicted {value}:{len(matching_positions)}")

#preds is aligned with `rest` row order, so the same position indexes both.
for pos in matching_positions:
    print(f"index:{pos}, filename:{rest['fileName'].iloc[pos]}, pred:{preds[pos]}")
    