# Baseline

In [4]:
#Baseline model: focuses on feature engineering with traditional NLP techniques such as tf-idf, sentiment and subjectivity.
#The full list can be found in the feature_engineering script. These features feed a range of classifiers - Random Forest, SVM,
#Gradient Boost, Naive Bayes and K-Nearest Neighbour - details of the code can be found in the ml script. Uses a cross-validation
#approach with five folds on the first annotated set of data. The second set of annotated data is used as a holdout for pure
#testing purposes. We also filter tweets that have emoticons and compare the performance with the same tweets with the emoticons removed.

In [None]:
import sys
sys.path.insert(0, 'scripts/')

In [None]:
import utility
import evaluation1 as ev
import feature_engineering1 as fe1
import feature_engineering as fe

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score, train_test_split, cross_val_predict, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import string
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob as tb
import emoji
from emoji import UNICODE_EMOJI
import spacy
from spacymoji import Emoji
%matplotlib inline


# Porter stemmer instance (not referenced again in the visible cells).
ps = nltk.PorterStemmer()
# Fetch the NLTK resources needed for tokenising and for the stop-word list below.
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# NOTE(review): 'en' looks like the spaCy 2.x shortcut model name — confirm against the installed spaCy version.
nlp = spacy.load('en')
# NOTE: this rebinds `emoji`, shadowing the `emoji` module imported above.
emoji = Emoji(nlp)
# Register the spacymoji component at the front of the spaCy pipeline.
nlp.add_pipe(emoji, first=True)

The abbreviation list was compiled manually and contains just under 600 abbreviations. This cell also sets the suite of classifiers used in the baseline model. To add a classifier, add its name to the methods list and a corresponding function to the ML class.

In [None]:
# Manually compiled abbreviations, coerced to str and stripped of surrounding whitespace.
abbreviations = [
    str(entry).strip()
    for entry in pd.read_csv('data/other/abbreviations.csv')['Abbreviation'].tolist()
]
# Names of the classifiers in the baseline suite; each one maps to a get<Name>() method on the ML class.
methods = ['RandomForest', 'GradientBoost', 'KNN', 'SVM', 'NaiveBayes']

## Data (Already preprocessed using 01 preprocessing notebook)

In [None]:
# First annotated set (cross-validation) and second annotated set (hold-out for testing).
fileName, fileName2 = (
    'data/baseline/dataOut/annSchiz1.csv',
    'data/baseline/dataOut/annSchiz2.csv',
)
socialDf = pd.read_csv(fileName, encoding='utf-8')
socialDf2 = pd.read_csv(fileName2, encoding='utf-8')

## Feature Engineering

These functions generate the features using the FeatureEngineering class in the fe script, which contains the code for the tf-idf model (getTFIDF) and the descriptive features (getFeatures) such as sentiment.

In [None]:
def getFeatures(socialDf):
    '''
    get descriptive features such as sentiment from the
    FeatureEngineering class inside the fe script

    socialDf: dataframe holding the tweets in a 'Tweet' column
    returns: the result of FeatureEngineering.getFeatures('Tweet')
             (used by the callers as a dataframe of tweets plus feature columns)
    '''
    # The original bound the object to `featuresEm` but then read `features`,
    # which raised NameError on a fresh kernel (or silently reused a stale global).
    featureBuilder = fe.FeatureEngineering(socialDf)
    return featureBuilder.getFeatures('Tweet')

def getFeatureColumns(tweets):
    '''
    Split the dataframe's column names into the two sets used downstream.

    returns: (fCols, cols) where fCols excludes both "Tweet" and
             "Classification" and cols excludes only "Classification"
    '''
    def _columnsWithout(excluded):
        # Select the surviving columns, then read their names back off the frame.
        kept = tweets.columns.difference(excluded)
        return tweets[kept].columns

    featureOnly = _columnsWithout(["Tweet", "Classification"])
    withText = _columnsWithout(["Classification"])
    return featureOnly, withText

def getTFIDF(tweets, fCols, cols):
    '''
    Build the tf-idf representation through fe.gettfidfVectors and pair it
    with the labels.

    tweets: dataframe with the feature columns and a 'Classification' column
    fCols: feature column names forwarded to fe.gettfidfVectors
    cols: accepted for symmetry with getFeatureColumns; not used here
    returns: (tfidfAll, xVectAll, tfidf, labels) - the three values produced
             by fe.gettfidfVectors followed by tweets['Classification']
    '''
    vectorOutputs = fe.gettfidfVectors(tweets, fCols)
    tfidfAll, xVectAll, tfidf = vectorOutputs
    return tfidfAll, xVectAll, tfidf, tweets['Classification']

## ML classifier class 

This class contains the code to train the classifiers listed in methods above. To add a classifier, simply wrap its
scikit-learn constructor in a method named get<Name>() (e.g. getRandomForest()) that returns the classifier object. This class can also perform cross-validation as well as training and predicting.

In [None]:
class baseML():
    '''
    Ml class contains logic for classification across a range of classifiers.

    Each name in a `methods` list is resolved to the factory method
    get<Name>() on this class (e.g. 'KNN' -> getKNN), so adding a classifier
    means adding one more get<Name>() method that returns an unfitted
    scikit-learn estimator.
    '''
    def __init__(self, features, labels):
        '''
        features: feature matrix handed to the estimators
        labels: target labels aligned with the rows of features
        '''
        self.features = features
        self.labels = labels
        # Filled in by getAllPredictions / trainAllClassifiers.
        self.classifiers = None

    def getCrossValidation(self, clf, k_fold=2):
        '''
        get cross validation predictions for one classifier, k_fold=2 by
        default (pass k_fold=5 for a five-fold run)
        '''
        return cross_val_predict(clf, self.features, self.labels, cv=k_fold, n_jobs = -1)

    def getAllPredictions(self, methods=['RandomForest']):
        '''
        get cross-validated predictions for all classifiers named in methods
        and return their evaluation metrics
        '''
        self.classifiers = [getattr(self, 'get' + f)() for f in methods]
        # Materialise a list: on Python 3 map() is a lazy iterator, so the
        # len() call inside getEval raised TypeError.
        predictions = [self.getCrossValidation(clf) for clf in self.classifiers]
        return self.getEval(predictions, methods)

    def trainAllClassifiers(self, methods=['RandomForest', 'GradientBoost', 'KNN', 'SVM', 'NaiveBayes']):
        '''
        train the classifiers named in methods on the class features and
        labels and return the fitted estimators in the same order
        '''
        self.classifiers = [getattr(self, 'get' + f)() for f in methods]
        classifiersTrained = [clf.fit(self.features, self.labels) for clf in self.classifiers]
        return classifiersTrained

    def predictAllClassifiers(self, classifiers, features):
        '''
        return one prediction array per fitted classifier for the passed
        features, in the same order as classifiers
        (called by the hold-out test cell but previously missing from the class)
        '''
        return [clf.predict(features) for clf in classifiers]

    def getEval(self, predict, methods=['RandomForest', 'GradientBoost', 'KNN', 'SVM', 'NaiveBayes']):
        '''
        initiate evaluation object from ev script and get evaluation summary

        predict: one prediction array per classifier (any iterable)
        methods: classifier names, forwarded to Evaluation.getSummary
        '''
        # Accept any iterable (including a map object) by materialising it once.
        predict = list(predict)
        # The labels are repeated so every prediction array gets its own copy.
        evalObj = ev.Evaluation([self.labels]*len(predict), predict)
        return evalObj.getSummary(methods)

    def getRandomForest(self, n=150):
        '''
        return random forest classifier object with n trees
        '''
        # n was previously ignored in favour of a hard-coded 150.
        randFor = RandomForestClassifier(n_estimators=n, max_depth=None, n_jobs=-1, random_state=0)
        return randFor

    def getGradientBoost(self, n=150):
        '''
        return gradient boost classifier object with n boosting stages
        '''
        gradBoost = GradientBoostingClassifier(n_estimators=n, max_depth=None,  random_state=0)
        return gradBoost

    def getKNN(self, n=3):
        '''
        return k-NN classifier object using n neighbours
        '''
        knn = KNeighborsClassifier(n_neighbors=n)
        return knn

    def getSVM(self, kFunc='linear'):
        '''
        return SVM classifier object with kernel kFunc and probability
        estimates enabled
        '''
        svm = SVC(kernel=kFunc, probability=True)
        return svm

    def getNaiveBayes(self):
        '''
        return naive bayes classifier object
        '''
        nb = GaussianNB()
        return nb

## Analysis

### Baseline - all tweets

In [None]:
def allMain(socialDf):
    '''
    Run the baseline on every tweet in socialDf: build the features, build
    the tf-idf matrix, then cross-validate and return the evaluation results.

    NOTE(review): getAllPredictions() is called with its default method list;
    confirm that matches the classifier names later passed to utility.getEvalDf.
    '''
    tweets = fe.FeatureEngineering(socialDf).getFeatures('Tweet')
    fCols, cols = getFeatureColumns(tweets)
    # Only the feature matrix and the labels are needed for cross-validation.
    _, xVectAll, _, labels = getTFIDF(tweets, fCols, cols)
    return baseML(xVectAll, labels).getAllPredictions()

Execute baseline model for all tweets and save in the following path 'data/results/baseline/all'

In [None]:
# Cross-validated baseline results for the first annotated set.
resultsAll = allMain(socialDf)

In [127]:
# Hand the results to utility.getEvalDf together with the output path and the classifier names
# (presumably this also writes them under that path — confirm in the utility script).
results = utility.getEvalDf([resultsAll], 'data/results/baseline/all', 0, methods)

In [None]:
def getImportantFeatures(clf, features):
    '''
    returns important features of classifier

    clf: fitted estimator exposing feature_importances_
    features: dataframe whose columns line up with clf.feature_importances_
    returns: dataframe indexed by 'feature' with a single 'importance'
             column, sorted from most to least important
    '''
    # The original read `rf.feature_importances_`, an undefined name, instead
    # of the classifier that was passed in.
    importantDf = pd.DataFrame({'feature': features.columns,
                                'importance': clf.feature_importances_})
    return importantDf.sort_values('importance', ascending=False).set_index('feature')

### Emoticon Only tweets

Look only at emoticon tweets. This loads the tweets that contain emoticons, plus the same set of tweets with
the emoticons filtered out (there are 56).

In [14]:
# The same tweets twice: emoticons removed (schiz1Em) and emoticons retained (emSchiz1Em).
schiz1Em = pd.read_csv('data/baseline/emoji/schiz1Em.csv', encoding='utf-8')
emSchiz1Em = pd.read_csv('data/baseline/emoji/emSchiz1Em.csv', encoding='utf-8')

In [6]:
def emoticonMain(emSchiz1Em):
    '''
    get baseline features, classify using
    cross-validation and return results
    for emoticon tweets

    emSchiz1Em: dataframe of the tweets that contain emoticons
    returns: evaluation results from baseML.getAllPredictions()
    '''
    features = fe.FeatureEngineering(emSchiz1Em)
    tweets = features.getFeatures('Tweet')
    fCols, cols = getFeatureColumns(tweets)
    # getTFIDF returns (tfidfAll, xVectAll, tfidf, labels); unpacking it into
    # two names raised ValueError.
    _tfidfAll, xVectAll, _tfidf, labels = getTFIDF(tweets, fCols, cols)
    ml = baseML(xVectAll, labels)
    return ml.getAllPredictions()

execute to get performance for tweets containing emoticons only

In [133]:
# Cross-validated baseline on the tweets that contain emoticons, then passed to
# utility.getEvalDf with the output path and the classifier names.
resultsEm = emoticonMain(emSchiz1Em)
results = utility.getEvalDf([resultsEm], 'data/results/baseline/emoticon', 0, methods)

In [134]:
results

Unnamed: 0,accuracy,f1,precision,recall,roc_auc
RandomForest,0.806061,0.848095,0.783333,0.938095,0.908571
GradientBoost,0.809091,0.797483,0.87619,0.757143,0.850476
KNN,0.645455,0.635198,0.760952,0.661905,0.674286
SVM,0.736364,0.704139,0.863095,0.695238,0.879048
NaiveBayes,0.609091,0.630556,0.660476,0.614286,0.58381


In [16]:
def noEmoticonMain(schiz1Em):
    '''
    get baseline features, classify using
    cross-validation and return results
    for emoticon tweets, without emoticons

    schiz1Em: dataframe of the emoticon tweets with the emoticons removed
    returns: evaluation results from baseML.getAllPredictions()
    '''
    features = fe.FeatureEngineering(schiz1Em)
    tweets = features.getFeatures('Tweet')
    fCols, cols = getFeatureColumns(tweets)
    # getTFIDF returns (tfidfAll, xVectAll, tfidf, labels); unpacking it into
    # two names raised ValueError.
    _tfidfAll, xVectAll, _tfidf, labels = getTFIDF(tweets, fCols, cols)
    noMl = baseML(xVectAll, labels)
    return noMl.getAllPredictions()

execute to get performance for tweets containing emoticons only, with emoticons removed

In [17]:
# Cross-validated baseline on the same tweets with the emoticons removed, then passed to
# utility.getEvalDf with the output path and the classifier names.
noResultsEm=noEmoticonMain(schiz1Em)
results = utility.getEvalDf([noResultsEm], 'data/results/baseline/noEmoticon', 0, methods)


In [18]:
results

Unnamed: 0,accuracy,f1,precision,recall,roc_auc
RandomForest,0.822727,0.851941,0.816667,0.909524,0.922857
GradientBoost,0.809091,0.817483,0.862857,0.785714,0.830952
KNN,0.645455,0.633333,0.753333,0.661905,0.684286
SVM,0.75303,0.739472,0.840952,0.72381,0.864762
NaiveBayes,0.592424,0.624149,0.642857,0.614286,0.57381


### Uppercase Only

This analysis is not included in the baseline study for the paper

In [372]:
def getLowerText(sentence, abbreviations, strip=False):
    '''
    Filter a tweet on whether it contains an upper-case token.

    A token counts as upper-case when str.isupper() is true for it, it is
    not the marker 'RT' and it is not one of the known abbreviations.

    sentence: tweet text (split on whitespace) or an iterable of tokens;
              missing values (None / NaN) are returned as NaN
    abbreviations: collection of abbreviation strings to ignore
    strip: False keeps only tweets WITH an upper-case token,
           True keeps only tweets WITHOUT one
    returns: sentence unchanged when it is kept, otherwise np.nan
    '''
    # Already-missing tweets cannot be iterated; pass them through as NaN.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return np.nan
    # Compare whole tokens: iterating a str yields single characters, which
    # can never equal 'RT' or match a multi-letter abbreviation.
    tokens = sentence.split() if isinstance(sentence, str) else sentence
    hasUpper = any(tok.isupper() and tok != 'RT' and tok not in abbreviations for tok in tokens)
    if hasUpper == strip:
        return np.nan
    return sentence

In [373]:
# Work on a copy: `tokens = socialDf` only aliased the frame, so the filter
# below overwrote the 'Tweet' column that later cells (e.g. testMain) read.
tokens = socialDf.copy()
tokens['Tweet'] = socialDf['Tweet'].apply(lambda x: getLowerText(x, abbreviations))
# NOTE(review): filtered-out tweets are NaN at this point — confirm the fe
# script copes with missing tweets, or drop them before building features.
tweets = getFeatures(tokens)
fCols, cols = getFeatureColumns(tweets)
# getTFIDF returns four values (tfidfAll, xVectAll, tfidf, labels).
tfidfAll, xVectAll, tfidf, labels = getTFIDF(tweets, fCols, cols)

In [383]:
# `xVectAllEm` is never defined in this notebook; the feature matrix built by
# the preceding cell is `xVectAll`.
ml = baseML(xVectAll, labels)
resultsLower = ml.getAllPredictions()

In [None]:
# NOTE(review): this cell repeats the upper-case feature build above and looks
# like a leftover from an older getTFIDF that also returned a train/test split.
# Copy the frame so the filter does not overwrite socialDf['Tweet'] in place.
tokens = socialDf.copy()
tokens['Tweet'] = socialDf['Tweet'].apply(lambda x: getLowerText(x, abbreviations))
tweets = getFeatures(tokens)
fCols, cols = getFeatureColumns(tweets)
# getTFIDF returns exactly four values; unpacking six names
# (xTrain/xTest/yTrain/yTest included) raised ValueError.
tfidfAll, xVectAll, tfidf, labels = getTFIDF(tweets, fCols, cols)

# Testing

this part looks at getting classification for the testing data

In [37]:
def testMain(socialDf, socialDf2):
    '''
    Train every classifier on the first annotated set and evaluate it on the
    hold-out set.

    The tf-idf vectorizer fitted on the training tweets is applied to the
    test tweets, the descriptive features are generated for the test set and
    both are combined through fe.getFeatureArray.

    socialDf: training dataframe ('Tweet' and 'Classification' columns)
    socialDf2: hold-out dataframe with the same columns
    returns: (results, cm) - the summary from the ev script's Evaluation
             class and one confusion matrix per classifier
    '''
    # Training side: descriptive features plus fitted tf-idf model.
    featuresTrain = fe.FeatureEngineering(socialDf)
    tweetsTrain = featuresTrain.getFeatures('Tweet')
    fCols, cols = getFeatureColumns(tweetsTrain)
    tfidfAll, xVectAllTrain, tfidf, labelsTrain = getTFIDF(tweetsTrain, fCols, cols)

    # Test side: reuse the fitted vectorizer rather than refitting on test data.
    featuresTest = fe.FeatureEngineering(socialDf2)
    tweetsTest = featuresTest.getFeatures('Tweet')
    fCols, cols = getFeatureColumns(tweetsTest)
    tfidfY = tfidf.transform(socialDf2['Tweet'])
    labelsTest = tweetsTest['Classification']
    # NOTE(review): if tfidfAll is a scikit-learn vectorizer, get_feature_names()
    # only exists on older releases — confirm against the installed version.
    xVectAllTest = fe.getFeatureArray(tweetsTest, fCols, tfidfY, tfidfAll.get_feature_names())

    ml = baseML(xVectAllTrain, labelsTrain)
    classifiers = ml.trainAllClassifiers()
    # baseML defines no predictAllClassifiers method, so predict directly with
    # each fitted estimator.
    predictions = [clf.predict(xVectAllTest) for clf in classifiers]

    labelsPerClassifier = [labelsTest] * len(predictions)
    # Labels first, predictions second, matching baseML.getEval.
    # NOTE(review): the original passed them the other way round here —
    # confirm the parameter order in the ev script.
    e = ev.Evaluation(labelsPerClassifier, predictions)
    results = e.getSummary(['RandomForest', 'GradientBoost', 'KNN', 'SVM', 'NaiveBayes'])

    cm = [confusion_matrix(l, p) for l, p in zip(labelsPerClassifier, predictions)]

    return results, cm
      

In [43]:
# Train on the first annotated set, evaluate on the second (hold-out) set.
x, cm = testMain(socialDf, socialDf2)

In [44]:
cm

[array([[152,  15],
        [ 47, 116]]), array([[141,  26],
        [ 42, 121]]), array([[103,  64],
        [ 87,  76]]), array([[150,  17],
        [ 39, 124]]), array([[145,  22],
        [ 35, 128]])]

In [33]:
# Persist the hold-out evaluation summary returned by testMain.
x.to_csv('data/results/baseline/test.csv')