In [1]:
import re
import pandas as pd
import numpy as np

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import os
import os.path

path = "/content/sample_data/" 
os.chdir(path)

In [4]:
df = pd.read_csv('lyme_tweets_010110_123121.csv')
len(df)

33996

In [5]:
# These keywords can serve as n-grams that can be vectorized into categorical variables to build text classification model
# More keywords needed from Dr. Elda
# Once text classification is built, test the model against 2021 tweets, and correlate to actual CDC cases of Lyme disease
keywords = ['#lymedisease', 'lymedisease', 'lyme disease', 'neurological lyme', 'neuro lyme', 'long-haul lyme', 'long haul lyme', 
            'have lyme', 'had lyme', 'has lyme', 'may have lyme', 'might have lyme', 'may have had lyme', 'had the lyme', 'having lyme',
            'neurological lyme disease', 'neuro lyme disease', 'long-haul lyme disease', 'long haul lyme disease', 'specialist', 
            'physician', 'doctor', 'neurologist', 'dermatologist', 'skin rash', 'rash', 'flu symptom', 'flu like symptom', 'flu symptoms', 'flu like symptoms', 
            'fever', 'aches', 'joint ache', 'joint pain','hiking', 'hiked in forest', 'forest hikes','bulls eye', 'forest', 'tick bite', 'bitten by tick', 'death', 
            'painful', 'pain', 'ticks', 'tick', 'red color', 'swollen', 'health', 'healthcare', 'medical care', 'med check', 'medical checkup', 'lyme illness',
            'late stage lyme', 'early stage lyme', 'antibiotic', 'inflammation', 'heart', 'lyme treatment', 'lyme awareness', 'lyme prevention', 
            'lyme patient', 'recover']

In [6]:
# Create new column with all tweets in lowercase
lowercase_words = df['text'].str.lower()
df['lowercase_tweets'] = lowercase_words

# Dataframe with tweets relevant to Lyme disease only
# These tweets can be labeled as '1' for having relevance to Lyme disease for text classification model
lyme_disease_tweet_df = df[df.lowercase_tweets.str.contains('|'.join(keywords))]

# Dataframe with tweets not relevant to Lyme disease
# These tweets can be labeled as '0' for having no relevance to Lyme disease for text classification model
non_lyme_disease_tweet_df = df[-df.lowercase_tweets.str.contains('|'.join(keywords))]

In [7]:
lyme_disease_tweet_df['Lyme Cases'] = 1
non_lyme_disease_tweet_df['Lyme Cases'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
combined_df = pd.concat([lyme_disease_tweet_df, non_lyme_disease_tweet_df])

In [9]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
# STOPWORD REMOVAL
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)
    
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [15]:
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
    
combined_df['clean_text'] = combined_df['lowercase_tweets'].apply(lambda x: finalpreprocess(x))
combined_df.head()

KeyboardInterrupt: ignored

In [11]:
# Load CSV with Cleaned Up and Labeled Data
combined_df = pd.read_csv('cleaned_labeled_lyme_tweet.csv')
len(combined_df)

33996

In [12]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(combined_df['clean_text'],combined_df['Lyme Cases'], test_size=0.2, shuffle=True)

#Word2Vec
# Word2Vec runs on tokenized sentences
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]

In [13]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
        return self
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

combined_df['clean_text_tok']=[nltk.word_tokenize(i) for i in combined_df['clean_text']]
model = Word2Vec(combined_df['clean_text_tok'],min_count=1)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))     
modelw = MeanEmbeddingVectorizer(w2v)

# converting text to numerical data using Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_test_tok)



In [14]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3049
           1       0.96      0.94      0.95      3751

    accuracy                           0.95      6800
   macro avg       0.95      0.95      0.95      6800
weighted avg       0.95      0.95      0.95      6800

Confusion Matrix: [[2906  143]
 [ 207 3544]]
AUC: 0.9819027159609957


In [21]:
len(X_train)

27196

In [49]:
# Test if keywords are useful to isolate tweets related to Lyme disease
# relevant_tweet_list = [tweet for tweet in lowercase_words if any(s in tweet for s in keywords)]

In [7]:
# Create CSV with rows relevant to Lyme disease
lyme_disease_tweet_df.to_csv('relevant_lyme_tweet.csv', index=False, header=True)