# Compare classification methods for identifying Covid-19 myths in tweets
## Includes training data balancing

@authors: Alexander Chen, Jaren Haber<br>
@affiliation: Massive Data Institute, McCourt School of Public Policy, Georgetown University<br>
@date: November 4, 2020

'''
Trains classifiers to predict whether a tweet is about a given coronavirus mtyh. Uses preliminary labeled tweet data (440 per myth) to train classifiers. Compares f1_weighted scores of three model structures using 10-Fold Cross Validation: K-Nearest Neighbors, Random Forest, and Decision Tree. Oversamples training data to .5 (1:2 minority:majority class).
'''

## Initialize

In [1]:
######################################################
# Import libraries
######################################################

import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import date

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')

stemmer = WordNetLemmatizer()

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

import joblib
import csv

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

[nltk_data] Downloading package punkt to /home/ac1975/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load & inspect data

In [2]:
######################################################
# Define filepaths
######################################################

thisday = date.today().strftime("%d%m%y")

dis_fp = '../data/myth_disinfectants_sample-440-labeled.csv'
rem_fp = '../data/myth_home_remedies_sample-440-labeled.csv'
wth_fp = '../data/myth_weather_sample-440-labeled.csv'

dis_mod_fp = f'../models/tweet_classifier_disinfectants_{str(thisday)}.joblib'
rem_mod_fp = f'../models/tweet_classifier_home_remedies_{str(thisday)}.joblib'
wth_mod_fp = f'../models/tweet_classifier_weather_{str(thisday)}.joblib'

dis_vec_fp = f'../models/vectorizer_disinfectants_{str(thisday)}.joblib'
rem_vec_fp = f'../models/vectorizer_home_remedies_{str(thisday)}.joblib'
wth_vec_fp = f'../models/vectorizer_weather_{str(thisday)}.joblib'

dis_vec_feat_fp = f'../models/vectorizer_features_disinfectants_{str(thisday)}.csv'
rem_vec_feat_fp = f'../models/vectorizer_features_home_remedies_{str(thisday)}.csv'
wth_vec_feat_fp = f'../models/vectorizer_features_weather_{str(thisday)}.csv'

In [3]:
######################################################
# Load and rename the data
######################################################

dis_df = pd.read_csv(dis_fp, low_memory=False)
rem_df = pd.read_csv(rem_fp, low_memory=False)
wth_df = pd.read_csv(wth_fp, low_memory=False)

# Rename each DF's myth columns for clarity
dis_df.rename(inplace = True, copy = False, 
              columns = {'is_myth': 'dis_is_myth', 'myth_score': 'dis_myth_score', 
                         'is_myth_supports': 'dis_is_myth_supports', 'myth_supports_score': 'dis_myth_supports_score'}
             )
rem_df.rename(inplace = True, copy = False, 
              columns = {'is_myth': 'rem_is_myth', 'myth_score': 'rem_myth_score', 
                         'is_myth_supports': 'rem_is_myth_supports', 'myth_supports_score': 'rem_myth_supports_score'}
             )
wth_df.rename(inplace = True, copy = False, 
              columns = {'is_myth': 'wth_is_myth', 'myth_score': 'wth_myth_score', 
                         'is_myth_supports': 'wth_is_myth_supports', 'myth_supports_score': 'wth_myth_supports_score'}
             )

rem_df.head(10)

Unnamed: 0,tweet_id,text,rem_is_myth,rem_myth_score,rem_is_myth_supports,rem_myth_supports_score
0,1230594710667837440,@USER01 DJI improves temperature-measuring dro...,no,1.0,no,1.0
1,1239221464080871424,Microban 24 Hour Disinfectant Sanitizing Spray...,no,1.0,no,1.0
2,1229926198156808192,Travel Tip: Looks like the #coronavirus can su...,no,0.666667,no,0.666667
3,1236415894894424067,Solution to our global pandemic! \n\nBut then ...,no,0.666667,no,0.666667
4,1238144576910766080,Now it all makes sense why China was spraying ...,no,1.0,no,1.0
5,1234211333937270790,"Fear, distrust and disinfectant in the air ami...",no,1.0,no,1.0
6,1235148391765757952,Can regularly rinsing your nose with saline he...,yes,1.0,unsure,1.0
7,1238600642136100864,"The shelves in Walmart are empty. Water, disin...",no,1.0,no,1.0
8,1237484894772531205,@USER01 Thank you. I’m still using the spray I...,no,1.0,no,1.0
9,1236889906686955520,Would you get a disinfectant service at your h...,no,0.666667,no,0.666667


In [4]:
######################################################
# Summarize the numerical data
######################################################

# Look at the number of instances of each is_myth
# class distribution
print(dis_df.groupby('dis_is_myth').size())
print()
print(rem_df.groupby('rem_is_myth').size())
print()
print(wth_df.groupby('wth_is_myth').size())
print()

# Look at the number of instances of each is_myth_supports class distribution
print(dis_df.groupby('dis_is_myth_supports').size())
print()
print(rem_df.groupby('rem_is_myth_supports').size())
print()
print(wth_df.groupby('wth_is_myth_supports').size())

dis_is_myth
no     413
yes     27
dtype: int64

rem_is_myth
no     406
yes     34
dtype: int64

wth_is_myth
no      21
yes    419
dtype: int64

dis_is_myth_supports
no        434
unsure      4
yes         2
dtype: int64

rem_is_myth_supports
no        417
unsure      9
yes        14
dtype: int64

wth_is_myth_supports
no        195
unsure    106
yes       139
dtype: int64


In [5]:
######################################################
# Summarize the text data
######################################################

# See examples of two tweets
# Note that usernames and URLs have already been replaced, but hashtags remain
print("Example tweet on home remedies 1:\n", rem_df['text'][0])
print()
print("Example tweet on home remedies 2:\n", rem_df['text'][1])
print()

# Look at size of vocabulary
# Add words from each tweet to empty list:
tweet_tokens = []; rem_df['text'].apply(lambda x: tweet_tokens.extend(word_tokenize(x)))
print('Vocab size:', len(set(tweet_tokens)))
print()

# Check out most frequent words in unprocessed text
freq = Counter(tweet_tokens)
print('20 most frequent words in raw home remedies tweets:')
freq.most_common(20)

Example tweet on home remedies 1:
 @USER01 DJI improves temperature-measuring drones with a simple cotton swab
DJI drones have been helping tackle the coronavirus outbreak with temperature screening and disinfectant spray
<em>URL01 Removed</em>
#COVID19
#新冠病毒
#武汉肺炎

Example tweet on home remedies 2:
 Microban 24 Hour Disinfectant Sanitizing Spray Citrus Scent 15 fl oz Surface <em>URL01 Removed</em> #coronavirus #covid_19 #covid19 <em>URL02 Removed</em>

Vocab size: 3275

20 most frequent words in raw home remedies tweets:


[('#', 2159),
 ('<', 706),
 ('>', 706),
 ('.', 422),
 (',', 385),
 ('em', 353),
 ('Removed', 353),
 ('/em', 353),
 ('URL01', 317),
 ('coronavirus', 314),
 ('the', 296),
 ('disinfectant', 293),
 (':', 252),
 ('to', 214),
 ('of', 199),
 ('and', 179),
 ('@', 164),
 ('in', 149),
 ('Disinfectant', 144),
 ('CoronavirusOutbreak', 139)]

## Preprocess tweet text

In [6]:
######################################################
# Tweet Preprocessing
######################################################

def process_tweets(tweet):
    '''
    Preprocesses raw text of a tweet by lower-casing, stripping whitespace, 
    
    args:
        tweet: raw text of a tweet
    '''
    
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove URL signifiers like '<em>URL01 Removed</em>'
    url_pattern = r'<em>url\d{2}\sremoved<\/em>'
    tweet = re.sub(url_pattern, '', tweet)
    
    # Remove username signifiers like '@USER01'
    user_pattern = r'@user\d{2}'
    tweet = re.sub(user_pattern, '', tweet)
        
    # Remove additional white spaces
    whitespace_pattern = r'\s+'
    tweet = re.sub(whitespace_pattern, ' ', tweet) # strip whitespaces in between words
    tweet = tweet.strip() # strip whitespaces at start & end
    
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    # Remove emojis
    emoji_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags = re.UNICODE)
    tweet = re.sub(emoji_pattern, '', tweet)
    
    # Lemmatization
    tweet = tweet.split()
    tweet = ' '.join([stemmer.lemmatize(word) for word in tweet])
        
    return tweet


dis_df['text_cleaned'] = dis_df['text'].apply(lambda x: process_tweets(x))
rem_df['text_cleaned'] = rem_df['text'].apply(lambda x: process_tweets(x))
wth_df['text_cleaned'] = wth_df['text'].apply(lambda x: process_tweets(x))

In [7]:
# Take another look at first two tweets (after preprocessing)
print("Example tweet on home remedies 1 (cleaned):\n", rem_df['text_cleaned'][0])
print()
print("Example tweet on home remedies 2 (cleaned):\n", rem_df['text_cleaned'][1])
print()

# Check out vocab size after cleaning
# Add words from each cleaned tweet to empty list:
tweet_tokens_cleaned_dis = []; dis_df['text_cleaned'].apply(lambda x: tweet_tokens_cleaned_dis.extend(word_tokenize(x))) 
tweet_tokens_cleaned_rem = []; rem_df['text_cleaned'].apply(lambda x: tweet_tokens_cleaned_rem.extend(word_tokenize(x))) 
tweet_tokens_cleaned_wth = []; wth_df['text_cleaned'].apply(lambda x: tweet_tokens_cleaned_wth.extend(word_tokenize(x))) 
print("Vocabulary sizes for preprocessed tweets labeled for each myth:")
print('Disinfectants:', len(set(tweet_tokens_cleaned_dis)))
print('Home remedies:', len(set(tweet_tokens_cleaned_rem)))
print('Weather:', len(set(tweet_tokens_cleaned_wth)))
print()

# Check out most frequent words in preprocessed text
freq = Counter(tweet_tokens_cleaned_rem)
print('20 most frequent words in cleaned home remedies tweets:')
freq.most_common(20)

Example tweet on home remedies 1 (cleaned):
 dji improves temperature-measuring drone with a simple cotton swab dji drone have been helping tackle the coronavirus outbreak with temperature screening and disinfectant spray covid19 新冠病毒 武汉肺炎

Example tweet on home remedies 2 (cleaned):
 microban 24 hour disinfectant sanitizing spray citrus scent 15 fl oz surface coronavirus covid_19 covid19

Vocabulary sizes for preprocessed tweets labeled for each myth:
Disinfectants: 2769
Home remedies: 2716
Weather: 2694

20 most frequent words in cleaned home remedies tweets:


[('disinfectant', 445),
 ('coronavirus', 406),
 ('.', 400),
 (',', 385),
 ('the', 330),
 (':', 252),
 ('to', 223),
 ('of', 203),
 ('a', 198),
 ('and', 182),
 ('in', 155),
 ('coronavirusoutbreak', 145),
 ('spray', 133),
 ('covid19', 121),
 ('virus', 119),
 ('on', 109),
 ('for', 109),
 ('coronaoutbreak', 108),
 ('with', 106),
 ('flu', 100)]

In [8]:
######################################################
# Vectorize text
######################################################

# Use TFIDF weighted DTM because does better overall than unweighted
#vectorizer = CountVectorizer(max_features=10000, min_df=1, max_df=0.8, stop_words=stopwords.words('english')) # DTM
vectorizer = TfidfVectorizer(max_features=10000, min_df=1, max_df=0.8, stop_words=stopwords.words('english')) # TFIDF

# creates sparse DTM X
# use X.toarray() to get with zero representation

dis_tweets, rem_tweets, wth_tweets = [], [], [] # empty list to add tweets to
dis_df['text_cleaned'].apply(lambda x: dis_tweets.append(x)) # add tweet from each row of DF
rem_df['text_cleaned'].apply(lambda x: rem_tweets.append(x)) # add tweet from each row of DF
wth_df['text_cleaned'].apply(lambda x: wth_tweets.append(x)) # add tweet from each row of DF

X_dis = vectorizer.fit_transform(dis_tweets)
joblib.dump(vectorizer, open(dis_vec_fp, "wb"))
with open(dis_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])

X_rem = vectorizer.fit_transform(rem_tweets)
joblib.dump(vectorizer, open(rem_vec_fp, "wb"))
with open(rem_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])

X_wth = vectorizer.fit_transform(wth_tweets)
joblib.dump(vectorizer, open(wth_vec_fp, "wb"))
with open(wth_vec_feat_fp,'w') as f:
    writer = csv.writer(f)
    writer.writerows([vectorizer.get_feature_names()])

print('Number of features in vectorizer (total vocabulary):', len(vectorizer.get_feature_names()))
print()

print(vectorizer.get_feature_names()[::100]) # get every 100th word

Number of features in vectorizer (total vocabulary): 2421

['00', 'agree', 'attributable', 'bu', 'clinics', 'coronaviruspakistan', 'december', 'dude', 'export', 'garlic', 'help', 'indonesia', 'kong', 'magical', 'myanmar', 'page', 'predictions', 'red', 'sauna', 'slows', 'strand', 'tent', 'trustworthy', 'vxx', 'wuhanflu']


In [9]:
######################################################
# Specify the data
######################################################

dis_df_is_myth = dis_df[['text_cleaned','dis_is_myth']]
rem_df_is_myth = rem_df[['text_cleaned','rem_is_myth']]
wth_df_is_myth = wth_df[['text_cleaned','wth_is_myth']]

dis_df_is_myth_supports = dis_df[['text_cleaned','dis_is_myth_supports']].copy()
rem_df_is_myth_supports = rem_df[['text_cleaned','rem_is_myth_supports']].copy()
wth_df_is_myth_supports = wth_df[['text_cleaned','wth_is_myth_supports']].copy()

In [10]:
######################################################
# Convert No/Yes to [0,1]
######################################################

def no_yes_convert(convert_df, column_name, has_unsure = False):
    '''
    args
        convert_df: df containing column to convert
        column_name: column to convert from 'yes','no','unsure' to float. Scoring scheme:
            no: 0
            unsure: 0.5
            yes: 1
        has_unsure: boolean, indicates whether convert_df has 'unsure' in column_name
    '''
    
    # Already converted to float
    if convert_df[column_name].dtype == 'float64':
        return convert_df
    
    new_df = convert_df.loc[:, convert_df.columns != column_name]
    
    for num in range(0,len(new_df)):
        row_index = new_df.index[num]
        
        if convert_df.loc[num,column_name] == 'no':
            new_df.loc[row_index,column_name] = 0.0
            
        elif convert_df.loc[num,column_name] == 'yes':
            new_df.loc[row_index,column_name] = 1.0
            
        elif has_unsure == True and convert_df.loc[num,column_name] == 'unsure':
            new_df.loc[row_index,column_name] = 0.5
            
    return new_df

In [11]:
######################################################
# Convert myData_is_myth to float
######################################################

dis_df_is_myth = no_yes_convert(dis_df_is_myth,'dis_is_myth')
rem_df_is_myth = no_yes_convert(rem_df_is_myth,'rem_is_myth')
wth_df_is_myth = no_yes_convert(wth_df_is_myth,'wth_is_myth')

print("Home remedies categories in raw data:")
print(rem_df['rem_is_myth'].value_counts())
print()
print("Home remedies cleaned up:")
print(rem_df_is_myth['rem_is_myth'].value_counts())

Home remedies categories in raw data:
no     406
yes     34
Name: rem_is_myth, dtype: int64

Home remedies cleaned up:
0.0    406
1.0     34
Name: rem_is_myth, dtype: int64


## Balance x_train, y_train

In [12]:
######################################################
# Balance x_train, y_train
######################################################

def resample_data(X_train, Y_train, undersample, sampling_strategy):
    """
    args
        X_train: X training data
        Y_train: Y training data
        undersmample: boolean for over or undersampling
        sampling_strategy: strategy for resampled distribution
            if oversample: 'majority' makes minority = to majority
            if undersample: 'minority' makes majority = to minority
    """
    
    if undersample == True:
        undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
        X_balanced, Y_balanced = undersample.fit_resample(X_train, Y_train)
    else:
        oversample = RandomOverSampler(sampling_strategy=sampling_strategy)
        X_balanced, Y_balanced = oversample.fit_resample(X_train, Y_train)
    
    print(f'Y_train: {Counter(Y_train)}\nY_resample: {Counter(Y_balanced)}')
    
    return X_balanced, Y_balanced

## Evaluate algorithms: Disinfectants myths

In [13]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

valueArray = dis_df_is_myth.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.2
seed = 3
X_train, X_validate, Y_train, Y_validate = train_test_split(X_dis, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring='f1_weighted'

Y_train Distribution: [(0.0, 331), (1.0, 21)]


In [14]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .5
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({0.0: 331, 1.0: 21})
Y_resample: Counter({0.0: 331, 1.0: 165})


### 10-Fold Cross Validation: Disinfectants myths

In [15]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9643, (0.0291)
RF: 1.0, (0.0)
DT: 0.9879, (0.0161)


### KNN: Disinfectants myths

In [16]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_dis = KNeighborsClassifier()
knn_dis.fit(X_train, Y_train)
knn_predictions = knn_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_dis = KNeighborsClassifier()
knn_dis.fit(X_balanced, Y_balanced)
knn_predictions = knn_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9659090909090909
[[82  0]
 [ 3  3]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        82
         1.0       1.00      0.50      0.67         6

    accuracy                           0.97        88
   macro avg       0.98      0.75      0.82        88
weighted avg       0.97      0.97      0.96        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9318181818181818
[[77  5]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96        82
         1.0       0.50      0.83      0.62         6

    accuracy                           0.93        88
   macro avg       0.74      0.89      0.79        88
weighted avg       0.95      0.93      0.94        88



### Random Forest: Disinfectants myths

In [17]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_dis = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_dis.fit(X_train, Y_train) 
rf_predictions = rf_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################
rf_dis = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_dis.fit(X_balanced, Y_balanced) 
rf_predictions = rf_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9545454545454546
[[82  0]
 [ 4  2]]
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98        82
         1.0       1.00      0.33      0.50         6

    accuracy                           0.95        88
   macro avg       0.98      0.67      0.74        88
weighted avg       0.96      0.95      0.94        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9772727272727273
[[82  0]
 [ 2  4]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        82
         1.0       1.00      0.67      0.80         6

    accuracy                           0.98        88
   macro avg       0.99      0.83      0.89        88
weighted avg       0.98      0.98      0.98        88



### Decision Tree: Disinfectants myths

In [18]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_dis = DecisionTreeClassifier()
dt_dis.fit(X_train, Y_train)
dt_predictions = dt_dis.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################
dt_dis = DecisionTreeClassifier()
dt_dis.fit(X_balanced, Y_balanced)
dt_predictions = dt_dis.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 331), (1.0, 21)]
0.9886363636363636
[[82  0]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        82
         1.0       1.00      0.83      0.91         6

    accuracy                           0.99        88
   macro avg       0.99      0.92      0.95        88
weighted avg       0.99      0.99      0.99        88


Balanced Classifier [(0.0, 331), (1.0, 165)]
0.9886363636363636
[[82  0]
 [ 1  5]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        82
         1.0       1.00      0.83      0.91         6

    accuracy                           0.99        88
   macro avg       0.99      0.92      0.95        88
weighted avg       0.99      0.99      0.99        88



In [19]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_dis, dis_mod_fp)

## Evaluate algorithms: Home remedies myths

In [20]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

valueArray = rem_df_is_myth.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.2
seed = 3
X_train, X_validate, Y_train, Y_validate = train_test_split(X_rem, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring = 'f1_weighted'

Y_train Distribution: [(0.0, 324), (1.0, 28)]


In [21]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .5
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({0.0: 324, 1.0: 28})
Y_resample: Counter({0.0: 324, 1.0: 162})


### 10-Fold Cross Validation: Home remedies myths

In [22]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9241, (0.0531)
RF: 0.9979, (0.0062)
DT: 0.9636, (0.0216)


### KNN: Home remedies myths

In [23]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
knn_rem = KNeighborsClassifier()
knn_rem.fit(X_train, Y_train)
knn_predictions = knn_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################
# Make predictions on validation dataset
knn_rem = KNeighborsClassifier()
knn_rem.fit(X_balanced, Y_balanced)
knn_predictions = knn_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.9318181818181818
[[81  1]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        82
         1.0       0.50      0.17      0.25         6

    accuracy                           0.93        88
   macro avg       0.72      0.58      0.61        88
weighted avg       0.91      0.93      0.92        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.8522727272727273
[[71 11]
 [ 2  4]]
              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92        82
         1.0       0.27      0.67      0.38         6

    accuracy                           0.85        88
   macro avg       0.62      0.77      0.65        88
weighted avg       0.92      0.85      0.88        88



### Random Forest: Home remedies myths

In [24]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

rf_rem = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_rem.fit(X_train, Y_train) 
rf_predictions = rf_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################

rf_rem = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_rem.fit(X_balanced, Y_balanced) 
rf_predictions = rf_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.9431818181818182
[[82  0]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        82
         1.0       1.00      0.17      0.29         6

    accuracy                           0.94        88
   macro avg       0.97      0.58      0.63        88
weighted avg       0.95      0.94      0.92        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.9431818181818182
[[82  0]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        82
         1.0       1.00      0.17      0.29         6

    accuracy                           0.94        88
   macro avg       0.97      0.58      0.63        88
weighted avg       0.95      0.94      0.92        88



### Decision Tree: Home remedies myths

In [25]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################

dt_rem = DecisionTreeClassifier()
dt_rem.fit(X_train, Y_train)
dt_predictions = dt_rem.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################

dt_rem = DecisionTreeClassifier()
dt_rem.fit(X_balanced, Y_balanced)
dt_predictions = dt_rem.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(0.0, 324), (1.0, 28)]
0.8977272727272727
[[78  4]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95        82
         1.0       0.20      0.17      0.18         6

    accuracy                           0.90        88
   macro avg       0.57      0.56      0.56        88
weighted avg       0.89      0.90      0.89        88


Balanced Classifier [(0.0, 324), (1.0, 162)]
0.8863636363636364
[[77  5]
 [ 5  1]]
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94        82
         1.0       0.17      0.17      0.17         6

    accuracy                           0.89        88
   macro avg       0.55      0.55      0.55        88
weighted avg       0.89      0.89      0.89        88



In [26]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_rem, rem_mod_fp)

## Evaluate algorithms: Weather myths

In [27]:
######################################################
# Prepare training and validation data
######################################################

# Separate training and final validation data set. First remove class
# label from data (X). Setup target class (Y)
# Then make the validation set 10% of the entire
# set of labeled data (X_validate, Y_validate)

valueArray = wth_df_is_myth.values
Y = valueArray[:,1]
Y = Y.astype('float')
test_size = 0.5
seed = 15
X_train, X_validate, Y_train, Y_validate = train_test_split(X_wth, Y, test_size=test_size, random_state=seed)

print(f'Y_train Distribution: {Counter(Y_train).most_common()}')

# Setup 10-fold cross validation to estimate the accuracy of different models
# Split data into 10 parts
# Test options and evaluation metric
num_folds = 10
# num_instances = len(X_train)
seed = 7
scoring = 'f1_weighted'

Y_train Distribution: [(1.0, 207), (0.0, 13)]


### 10-Fold Cross Validation: Weather myths

In [28]:
######################################################
# Use different algorithms to build models
######################################################

# Add each algorithm and its name to the model array
models = []
models.append(('KNN',KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, random_state=0)))
models.append(('DT', DecisionTreeClassifier()))

# Evaluate each model, add results to a results array,
# Print the accuracy results (remember these are averages and std)
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_balanced, Y_balanced, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {round(cv_results.mean(),4)}, ({round(cv_results.std(),4)})')

KNN: 0.9241, (0.0531)
RF: 0.9979, (0.0062)
DT: 0.9656, (0.0201)


In [29]:
######################################################
# Undersample to minority size
######################################################
sampling_strategy = .5
undersample = False

X_balanced, Y_balanced = resample_data(X_train, Y_train, undersample=undersample, sampling_strategy=sampling_strategy)

Y_train: Counter({1.0: 207, 0.0: 13})
Y_resample: Counter({1.0: 207, 0.0: 103})


### KNN: Weather myths

In [30]:
######################################################
# Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
# from sklearn.ensemble import BaggingClassifier # improves estimates but hard with so little data
#knn_wth = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

knn_wth = KNeighborsClassifier()
knn_wth.fit(X_train, Y_train)
knn_predictions = knn_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))

######################################################
# Balanced: Compare algorithms on validation test: KNN
######################################################

# Make predictions on validation dataset
# from sklearn.ensemble import BaggingClassifier # improves estimates but hard with so little data
#knn_wth = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

knn_wth = KNeighborsClassifier()
knn_wth.fit(X_balanced, Y_balanced)
knn_predictions = knn_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, knn_predictions))
print(confusion_matrix(Y_validate, knn_predictions))
print(classification_report(Y_validate, knn_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220


Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9181818181818182
[[  1   7]
 [ 11 201]]
              precision    recall  f1-score   support

         0.0       0.08      0.12      0.10         8
         1.0       0.97      0.95      0.96       212

    accuracy                           0.92       220
   macro avg       0.52      0.54      0.53       220
weighted avg       0.93      0.92      0.93       220



  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest: Weather myths

In [31]:
######################################################
# Compare algorithms on validation test: Random Forest
######################################################

#rf_wth = BaggingClassifier(RandomForestClassifier(n_estimators=1000, random_state=0), max_samples=0.5, max_features=0.5)

rf_wth = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_wth.fit(X_train, Y_train) 
rf_predictions = rf_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Random Forest
######################################################

#rf_wth = BaggingClassifier(RandomForestClassifier(n_estimators=1000, random_state=0), max_samples=0.5, max_features=0.5)

rf_wth = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_wth.fit(X_balanced, Y_balanced) 
rf_predictions = rf_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, rf_predictions))
print(confusion_matrix(Y_validate, rf_predictions))
print(classification_report(Y_validate, rf_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220



  _warn_prf(average, modifier, msg_start, len(result))



Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9636363636363636
[[  0   8]
 [  0 212]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      1.00      0.98       212

    accuracy                           0.96       220
   macro avg       0.48      0.50      0.49       220
weighted avg       0.93      0.96      0.95       220



  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree: Weather myths

In [32]:
######################################################
# Compare algorithms on validation test: Decision Tree
######################################################
#dt_wth = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)

dt_wth = DecisionTreeClassifier()
dt_wth.fit(X_train, Y_train)
dt_predictions = dt_wth.predict(X_validate)

print()
print(f'Unbalanced Classifier {Counter(Y_train).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))

######################################################
# Balanced: Compare algorithms on validation test: Decision Tree
######################################################
#dt_wth = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)

dt_wth = DecisionTreeClassifier()
dt_wth.fit(X_balanced, Y_balanced)
dt_predictions = dt_wth.predict(X_validate)

print()
print(f'Balanced Classifier {Counter(Y_balanced).most_common()}')
print(accuracy_score(Y_validate, dt_predictions))
print(confusion_matrix(Y_validate, dt_predictions))
print(classification_report(Y_validate, dt_predictions))


Unbalanced Classifier [(1.0, 207), (0.0, 13)]
0.9363636363636364
[[  0   8]
 [  6 206]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         8
         1.0       0.96      0.97      0.97       212

    accuracy                           0.94       220
   macro avg       0.48      0.49      0.48       220
weighted avg       0.93      0.94      0.93       220


Balanced Classifier [(1.0, 207), (0.0, 103)]
0.9090909090909091
[[  4   4]
 [ 16 196]]
              precision    recall  f1-score   support

         0.0       0.20      0.50      0.29         8
         1.0       0.98      0.92      0.95       212

    accuracy                           0.91       220
   macro avg       0.59      0.71      0.62       220
weighted avg       0.95      0.91      0.93       220



In [33]:
######################################################
# Save best model
######################################################

# joblib.dump(rf_wth, wth_mod_fp)