In [1]:
import re
import nlp516
import nlp516.model
import nltk
import numpy as np
import pandas as pd
import sklearn
from types import SimpleNamespace
from sklearn.feature_extraction.text import CountVectorizer

## Dataset

In [4]:
#dataset = nlp516.data.PublicTrialRaw()
language = 'english'
# Load the raw dataset that matches `language`.
if language == 'spanish':
    raw = nlp516.data.PublicSpanishDataset()
elif language == 'english':
    raw = nlp516.data.PublicEnglishDataset()
else:
    # Without this branch `raw` would stay undefined (or keep a stale value
    # from an earlier run of the cell), and the error would only surface in a
    # later cell.
    raise ValueError(
        "unsupported language: {!r} (expected 'spanish' or 'english')"
        .format(language))

## Preprocessing

In [5]:
# Language-dependent text maps shared by the preprocessing pipeline below.
# All three are built from the same `language` setting so that switching the
# notebook to another language cannot leave the tokenizer on English.
tokenizer_map = nlp516.data.Tokenizer(language)
stopwords_map = nlp516.data.RemoveStopWords(language)
stemmer_map = nlp516.data.Stemmer(language)

def preprocess(dataset):
    """Run the text-cleaning pipeline over the train and valid splits.

    Args:
        dataset: object exposing ``train`` and ``valid`` attributes; each is
            passed to ``nlp516.data.map_column`` together with the column
            name ``'text'``.

    Returns:
        SimpleNamespace with ``train`` and ``valid`` attributes holding the
        processed splits.
    """
    # Steps are applied to the 'text' column in exactly this order.
    steps = (
        nlp516.data.remove_urls_map,
        tokenizer_map,
        nlp516.data.user_camelcase_map,
        nlp516.data.hashtag_camelcase_map,
        nlp516.data.to_lowercase,
        stopwords_map,
        # stemmer_map is deliberately not part of the pipeline at the moment.
        nlp516.data.remove_words_with_numbers,
        nlp516.data.remove_punctuation,
    )

    def run(data):
        # Feed the output of each step into the next one.
        for step in steps:
            data = nlp516.data.map_column(data, 'text', step)
        return data

    processed_train = run(dataset.train)
    processed_valid = run(dataset.valid)
    return SimpleNamespace(train=processed_train, valid=processed_valid)
# Apply the pipeline to the raw splits; later cells read `dataset.train` and
# `dataset.valid`.
dataset = preprocess(raw)

In [6]:
# Show one training example before and after preprocessing.
row = 25
print('Original: {}'.format(raw.train.iloc[row].text))
print('Tokens: {}'.format(dataset.train.iloc[row].text))

Original: @WattersWorld @JesseBWatters @BillSpadea How about discussing this? #JoeBiden's "black box" of American success. #WhiteGenocide #ImmigrationInvasion #JesseWatters #WatersWorldhttps://t.co/k8bWjj7tnp
Tokens: ['watters', 'world', 'jesse', 'b', 'watters', 'bill', 'spadea', 'discussing', 'joe', 'bidens', 'black', 'box', 'american', 'success', 'white', 'genocide', 'immigration', 'invasion', 'jesse', 'watters', 'waters', 'world']


In [7]:
# Show one training example before and after preprocessing.
row = 1
print('Original: {}'.format(raw.train.iloc[row].text))
print('Tokens: {}'.format(dataset.train.iloc[row].text))

Original: Why would young fighting age men be the vast majority of the ones escaping a war &amp; not those who cannot fight like women, children, and the elderly?It's because the majority of the refugees are not actually refugees they are economic migrants trying to get into Europe.... https://t.co/Ks0SHbtYqn
Tokens: ['would', 'young', 'fighting', 'age', 'men', 'vast', 'majority', 'ones', 'escaping', 'war', 'cannot', 'fight', 'like', 'women', 'children', 'elderly', 'majority', 'refugees', 'actually', 'refugees', 'economic', 'migrants', 'trying', 'get', 'europe']


In [8]:
# Show one training example before and after preprocessing.
row = 26
print('Original: {}'.format(raw.train.iloc[row].text))
print('Tokens: {}'.format(dataset.train.iloc[row].text))

Original: It is a cruel irony that sikhs often **look** more exotic to American eyes, even though they tend to be among the most deeply assimilated of immigrant groups. of course the people they're mistaken for also don't deserve to be attacked. https://t.co/57NitaFqce
Tokens: ['cruel', 'irony', 'sikhs', 'often', 'look', 'exotic', 'american', 'eyes', 'even', 'though', 'tend', 'among', 'deeply', 'assimilated', 'immigrant', 'groups', 'course', 'people', 'theyre', 'mistaken', 'also', 'deserve', 'attacked']


## Train

In [9]:
# Short aliases for the two preprocessed splits. The splits are used in their
# stored order: no shuffling is applied in this notebook.
# Note that `test` refers to the validation split (`dataset.valid`).
train, test = dataset.train, dataset.valid

In [10]:
# SVM on the HS label: fit on the training split, then report the score on
# the validation split.
# NOTE(review): 100 is passed straight to SVMModel; presumably a feature or
# vocabulary size - confirm against nlp516.model.
model = nlp516.model.SVMModel(100)
model.fit(dataset.train.text, dataset.train.HS)
valid_score = model.score(dataset.valid.text, dataset.valid.HS)
print('test: {}'.format(valid_score))

score: 0.7616666666666667
test: 0.708


In [11]:
# SVM on the TR label: fit on the training split, then report the score on
# the validation split.
model = nlp516.model.SVMModel(100)
# Labels come from `dataset.train`, like the features, so this cell does not
# depend on the `train` alias created in another cell.
model.fit(dataset.train.text, dataset.train.TR)
print('test: {}'.format(model.score(dataset.valid.text, dataset.valid.TR)))

score: 0.8845555555555555
test: 0.822


In [12]:
# SVM on the AG label: fit on the training split, then report the score on
# the validation split.
# NOTE(review): 100 is passed straight to SVMModel; presumably a feature or
# vocabulary size - confirm against nlp516.model.
model = nlp516.model.SVMModel(100)
model.fit(dataset.train.text, dataset.train.AG)
valid_score = model.score(dataset.valid.text, dataset.valid.AG)
print('test: {}'.format(valid_score))

score: 0.8451111111111111
test: 0.803


In [13]:
#[model.vectorizer.id2word(id) for id in model.vectorizer.feature_id]

In [14]:
# Baseline model from nlp516.model; rebinds `model`, replacing the SVM fitted
# in the previous cells.
model = nlp516.model.MajorityBaseline()

In [15]:
# Fit the baseline on the AG labels of the training split.
model.fit(dataset.train.text, dataset.train.AG)

score: 0.8266666666666667


In [16]:
# Precision of the baseline, measured on the training split itself (not on
# dataset.valid).
# NOTE(review): the recorded result is 0.0 together with a precision warning -
# presumably the baseline never predicts the positive class; confirm.
model.precision_score(dataset.train.text, dataset.train.AG)

  'precision', 'predicted', average, warn_for)


0.0

In [27]:
import sklearn.tree
import sklearn.naive_bayes

In [28]:
import itertools
def subtask_dataset(dataset, task):
    """Build an x/y view of ``dataset`` for a single label column.

    Args:
        dataset: object with ``train`` and ``valid`` attributes, each exposing
            ``text`` and an attribute named by ``task``.
        task: attribute name of the label column to use (e.g. ``'AG'``).

    Returns:
        SimpleNamespace with ``train`` and ``valid``, each a SimpleNamespace
        with ``x`` (the split's ``text``) and ``y`` (the split's ``task``
        attribute).
    """
    def as_xy(split):
        # Pair the text with the requested label attribute of one split.
        return SimpleNamespace(x=split.text, y=getattr(split, task))

    train_xy = as_xy(dataset.train)
    valid_xy = as_xy(dataset.valid)
    return SimpleNamespace(train=train_xy, valid=valid_xy)

def eval_metrics(model, dataset):
    """Fit ``model`` on the training split and score it on the valid split.

    Args:
        model: object providing ``fit``, ``score``, ``precision_score``,
            ``recall_score`` and ``f1_score``, each taking ``(x, y)``.
        dataset: object with ``train`` and ``valid`` attributes, each exposing
            ``x`` and ``y``.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``, holding the values returned by the corresponding model
        methods on the valid split.
    """
    model.fit(dataset.train.x, dataset.train.y)

    valid = dataset.valid
    # Metric name -> model method that computes it; evaluated in this order.
    metric_methods = (
        ('accuracy', 'score'),
        ('precision', 'precision_score'),
        ('recall', 'recall_score'),
        ('f1', 'f1_score'),
    )
    metrics = {}
    for metric_name, method_name in metric_methods:
        metrics[metric_name] = getattr(model, method_name)(valid.x, valid.y)
    return metrics

def instantiate_models(classifiers, vectorizers):
    """Create one model per classifier/vectorizer pair, plus the baseline.

    Args:
        classifiers: mapping from a name to a zero-argument callable that
            returns a classifier.
        vectorizers: mapping from a name to a zero-argument callable that
            returns a vectorizer.

    Returns:
        dict keyed by ``(classifier_name, vectorizer_name)`` holding
        ``nlp516.model.MlModel`` instances, together with a
        ``('MajorityBaseline', '-')`` entry holding a
        ``nlp516.model.MajorityBaseline``.
    """
    models = {('MajorityBaseline', '-'): nlp516.model.MajorityBaseline()}
    for clf_name in classifiers.keys():
        for vec_name in vectorizers.keys():
            # Call the factories here so every pair gets its own instances.
            models[(clf_name, vec_name)] = nlp516.model.MlModel(
                classifier=classifiers[clf_name](),
                vectorizer=vectorizers[vec_name]())
    return models

def eval_models(classifiers, vectorizers, task, dataset):
    """Fit and score every classifier/vectorizer combination on one subtask.

    Args:
        classifiers: mapping from a name to a zero-argument classifier factory.
        vectorizers: mapping from a name to a zero-argument vectorizer factory.
        task: attribute name of the label column to predict (e.g. ``'AG'``).
        dataset: object with ``train`` and ``valid`` splits, each exposing
            ``text`` and the ``task`` attribute.

    Returns:
        pandas.DataFrame with one row per model key and one column per metric
        returned by ``eval_metrics``.
    """
    models = instantiate_models(classifiers, vectorizers)
    # The x/y view depends only on (dataset, task), so build it once instead
    # of once per model.
    task_data = subtask_dataset(dataset, task)
    results = {key: eval_metrics(model, dataset=task_data)
               for key, model in models.items()}
    return pd.DataFrame(results).transpose()



In [33]:
# No cell imports these two submodules explicitly, and `import sklearn` alone
# does not guarantee they are loaded, so import them before they are used.
import sklearn.linear_model
import sklearn.svm

task = 'AG'
# Each entry is a zero-argument factory: instantiate_models calls it once per
# (classifier, vectorizer) pair, so every pair gets a fresh, unfitted object.
classifiers = {'linear': lambda: sklearn.linear_model.LogisticRegression(),
               'svm': lambda: sklearn.svm.SVC(gamma='scale'),
               'tree': lambda: sklearn.tree.DecisionTreeClassifier(),
               'bayes': lambda: sklearn.naive_bayes.GaussianNB()}
# NOTE(review): 100 is passed straight to the vectorizers; presumably the
# number of unigram features kept - confirm against nlp516.vectorizer.
vectorizers = {'frequency': lambda: nlp516.vectorizer.Unigram(100),
               'presence': lambda: nlp516.vectorizer.UnigramPresence(100)}

results = eval_models(classifiers=classifiers, vectorizers=vectorizers,
                      task=task, dataset=dataset)
print(language, task)
display(results)

score: 0.9711111111111111
score: 0.8266666666666667


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


score: 0.966




score: 0.8428888888888889
score: 0.783
score: 0.7743333333333333




score: 0.8408888888888889
score: 0.848
score: 0.8451111111111111
english AG


Unnamed: 0,Unnamed: 1,accuracy,f1,precision,recall
MajorityBaseline,-,0.796,0.0,0.0,0.0
bayes,frequency,0.73,0.43038,0.377778,0.5
bayes,presence,0.729,0.459082,0.387205,0.563725
linear,frequency,0.804,0.257576,0.566667,0.166667
linear,presence,0.8,0.280576,0.527027,0.191176
svm,frequency,0.802,0.238462,0.553571,0.151961
svm,presence,0.803,0.239382,0.563636,0.151961
tree,frequency,0.745,0.316354,0.349112,0.289216
tree,presence,0.744,0.340206,0.358696,0.323529


In [25]:
# TR subtask with presence features only.
# `classifiers` is not defined in this cell; it is reused from the cell that
# evaluates the AG subtask. `vectorizers` is rebound here.
vectorizers = {'presence': lambda: nlp516.vectorizer.UnigramPresence(100)}
results = eval_models(
    task='TR',
    dataset=dataset,
    classifiers=classifiers,
    vectorizers=vectorizers,
)
display(results)



score: 0.881
score: 0.8845555555555555
score: 0.851


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


score: 0.9708888888888889


Unnamed: 0,Unnamed: 1,accuracy,f1,precision,recall
MajorityBaseline,-,0.781,0.0,0.0,0.0
linear,presence,0.835,0.510386,0.728814,0.392694
svm,presence,0.822,0.44375,0.70297,0.324201
tree,presence,0.81,0.494681,0.592357,0.424658
