In [None]:
import re
import nlp516
import nlp516.model
import nltk
import numpy as np
import pandas as pd
import sklearn
from types import SimpleNamespace
from sklearn.feature_extraction.text import CountVectorizer

## Dataset

In [None]:
# Alternative source kept for reference: the raw public trial data.
#dataset = nlp516.data.PublicTrialRaw()

# Corpus language; the preprocessing cell reads this name as well.
language = 'english'
if language == 'spanish':
    raw = nlp516.data.PublicSpanishDataset()
elif language == 'english':
    raw = nlp516.data.PublicEnglishDataset()
else:
    # Fail here rather than with a NameError on `raw` in a later cell.
    raise ValueError('unsupported language: {!r}'.format(language))

## Preprocessing

In [None]:
# Language-dependent text transforms used by `preprocess`.
# The tokenizer was pinned to 'english' while the other two followed
# `language`; all three now follow `language` so that switching the corpus
# language switches the whole pipeline.
# NOTE(review): confirm nlp516.data.Tokenizer accepts every value `language`
# can take (e.g. 'spanish').
tokenizer_map = nlp516.data.Tokenizer(language)
stopwords_map = nlp516.data.RemoveStopWords(language)
stemmer_map = nlp516.data.Stemmer(language)

def preprocess(dataset):
    """Run the text-cleaning pipeline over the train and valid splits.

    Args:
        dataset: object exposing ``train`` and ``valid`` attributes; each one
            is passed through ``nlp516.data.map_column`` on its ``'text'``
            column, once per pipeline step.

    Returns:
        SimpleNamespace with ``train`` and ``valid`` holding the results of
        the pipeline for the corresponding input split.
    """
    def run(data):
        # The steps are applied to the 'text' column in exactly this order.
        steps = (
            nlp516.data.remove_urls_map,
            # disabled alternative tokenizer: nlp516.data.casual_tokenize_map
            tokenizer_map,
            nlp516.data.user_camelcase_map,
            nlp516.data.hashtag_camelcase_map,
            nlp516.data.to_lowercase,
            stopwords_map,
            # stemmer_map,  (stemming is currently switched off)
            nlp516.data.remove_words_with_numbers,
            nlp516.data.remove_punctuation,
        )
        for step in steps:
            data = nlp516.data.map_column(data, 'text', step)
        return data

    return SimpleNamespace(train=run(dataset.train), valid=run(dataset.valid))
# Apply the pipeline to the loaded corpus; later cells read `dataset.train`
# and `dataset.valid`.
dataset = preprocess(raw)

In [None]:
# Row 25 of the training split before and after preprocessing, one per line.
print('Original: {}'.format(raw.train.iloc[25].text),
      'Tokens: {}'.format(dataset.train.iloc[25].text),
      sep='\n')

In [None]:
# Row 1 of the training split before and after preprocessing, one per line.
print('Original: {}'.format(raw.train.iloc[1].text),
      'Tokens: {}'.format(dataset.train.iloc[1].text),
      sep='\n')

In [None]:
# Row 26 of the training split before and after preprocessing, one per line.
print('Original: {}'.format(raw.train.iloc[26].text),
      'Tokens: {}'.format(dataset.train.iloc[26].text),
      sep='\n')

## Train

In [None]:
#def random_shuffle(data):
#    idx = np.arange(en.shape[0])
#    shuffle = np.random.shuffle(idx)
#    return data.iloc[idx, :]
#dataset = random_shuffle(en)
# Short aliases for the preprocessed training and validation splits.
train, test = dataset.train, dataset.valid

In [None]:
# HS labels: fit the SVM model on the training split, then print its score on
# the validation split.
# NOTE(review): the meaning of the constructor argument 100 is not visible
# here; confirm against nlp516.model.SVMModel.
model = nlp516.model.SVMModel(100)
model.fit(dataset.train.text, dataset.train.HS)
hs_score = model.score(dataset.valid.text, dataset.valid.HS)
print('test: {}'.format(hs_score))

In [None]:
# TR labels: fit the SVM model on the training split, then print its score on
# the validation split.
# The labels are read from dataset.train directly rather than through the
# `train` alias, so this cell matches the HS and AG cells and no longer depends
# on the aliasing cell having been run first.
model = nlp516.model.SVMModel(100)
model.fit(dataset.train.text, dataset.train.TR)
print('test: {}'.format(model.score(dataset.valid.text, dataset.valid.TR)))

In [None]:
# AG labels: fit the SVM model on the training split, then print its score on
# the validation split.
# NOTE(review): the meaning of the constructor argument 100 is not visible
# here; confirm against nlp516.model.SVMModel.
model = nlp516.model.SVMModel(100)
model.fit(dataset.train.text, dataset.train.AG)
ag_score = model.score(dataset.valid.text, dataset.valid.AG)
print('test: {}'.format(ag_score))

In [None]:
#[model.vectorizer.id2word(id) for id in model.vectorizer.feature_id]

In [None]:
# Replace `model` with the baseline from nlp516.model.
# NOTE(review): presumably predicts the most frequent training label; confirm
# in nlp516.model.MajorityBaseline.
model = nlp516.model.MajorityBaseline()

In [None]:
# Fit the current `model` on the AG labels of the training split.
model.fit(dataset.train.text, dataset.train.AG)

In [None]:
# Precision of the current `model` on the AG labels.
# NOTE(review): this scores on the *training* split, whereas the other cells
# evaluate on dataset.valid; confirm this is intended.
model.precision_score(dataset.train.text, dataset.train.AG)

In [None]:
import sklearn.tree
import sklearn.naive_bayes

In [None]:
import itertools
def subtask_dataset(dataset, task):
    """Reduce a dataset to the inputs and labels of a single task.

    Args:
        dataset: object with ``train`` and ``valid`` attributes, each exposing
            ``text`` and an attribute named by ``task``.
        task: name of the label attribute to expose as ``y`` (e.g. ``'AG'``).

    Returns:
        SimpleNamespace with ``train`` and ``valid``; each is itself a
        SimpleNamespace with ``x`` (the split's ``text``) and ``y`` (the
        split's ``task`` attribute).
    """
    def as_xy(split):
        # Same projection for both splits: text -> x, task labels -> y.
        return SimpleNamespace(x=split.text, y=getattr(split, task))

    return SimpleNamespace(train=as_xy(dataset.train),
                           valid=as_xy(dataset.valid))

def eval_metrics(model, dataset):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: object providing ``fit``, ``score``, ``precision_score``,
            ``recall_score`` and ``f1_score``, each called with ``(x, y)``.
        dataset: object with ``train`` and ``valid`` attributes, each exposing
            ``x`` and ``y``.

    Returns:
        dict with the keys ``'accuracy'`` (from ``model.score``),
        ``'precision'``, ``'recall'`` and ``'f1'``, all computed on
        ``dataset.valid``.
    """
    model.fit(dataset.train.x, dataset.train.y)

    # The scorers are called in the same order as the keys of the result.
    accuracy = model.score(dataset.valid.x, dataset.valid.y)
    precision = model.precision_score(dataset.valid.x, dataset.valid.y)
    recall = model.recall_score(dataset.valid.x, dataset.valid.y)
    f1 = model.f1_score(dataset.valid.x, dataset.valid.y)

    metrics = {}
    metrics['accuracy'] = accuracy
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f1'] = f1
    return metrics

def instantiate_models(classifiers, vectorizers):
    """Build one model per (classifier, vectorizer) pair, plus the baseline.

    Args:
        classifiers: dict mapping a classifier name to a zero-argument
            callable returning a new classifier.
        vectorizers: dict mapping a vectorizer name to a zero-argument
            callable returning a new vectorizer.

    Returns:
        dict keyed by ``(classifier_name, vectorizer_name)`` whose values are
        ``nlp516.model.MlModel`` instances, together with a
        ``('MajorityBaseline', '-')`` entry holding
        ``nlp516.model.MajorityBaseline()``.
    """
    models = {('MajorityBaseline', '-'): nlp516.model.MajorityBaseline()}
    # Classifier-major order; both factories are invoked anew for every pair.
    for clf_name, make_classifier in classifiers.items():
        for vec_name, make_vectorizer in vectorizers.items():
            models[(clf_name, vec_name)] = nlp516.model.MlModel(
                classifier=make_classifier(),
                vectorizer=make_vectorizer())
    return models

def eval_models(classifiers, vectorizers, task, dataset):
    """Evaluate every classifier/vectorizer combination on one task.

    Args:
        classifiers: dict of name -> zero-argument classifier factory, passed
            to ``instantiate_models``.
        vectorizers: dict of name -> zero-argument vectorizer factory, passed
            to ``instantiate_models``.
        task: name of the label attribute to evaluate (e.g. ``'AG'``).
        dataset: object with ``train`` and ``valid`` splits, as accepted by
            ``subtask_dataset``.

    Returns:
        pandas.DataFrame with one row per model key and one column per metric
        returned by ``eval_metrics``.
    """
    models = instantiate_models(classifiers, vectorizers)
    # The x/y view of the task is the same for every model, so build it once
    # instead of once per model.
    task_data = subtask_dataset(dataset, task)
    results = {key: eval_metrics(model, dataset=task_data)
               for key, model in models.items()}
    return pd.DataFrame(results).transpose()



In [None]:
# `import sklearn` alone is not guaranteed to load its subpackages; import the
# two used below explicitly instead of relying on another module having
# imported them as a side effect.
import sklearn.linear_model
import sklearn.svm

task = 'AG'
# Zero-argument factories: instantiate_models calls them once per
# (classifier, vectorizer) pair, so no fitted state is shared between models.
classifiers = {'linear': lambda: sklearn.linear_model.LogisticRegression(),
               'svm': lambda: sklearn.svm.SVC(gamma='scale'),
               'tree': lambda: sklearn.tree.DecisionTreeClassifier(),
               'bayes': lambda: sklearn.naive_bayes.GaussianNB()}
# NOTE(review): the meaning of the argument 100 is not visible here; confirm
# against nlp516.vectorizer.
vectorizers = {'frequency': lambda: nlp516.vectorizer.Unigram(100),
               'presence': lambda: nlp516.vectorizer.UnigramPresence(100)}

results = eval_models(classifiers=classifiers, vectorizers=vectorizers,
                      task=task, dataset=dataset)
print(language, task)
display(results)

In [None]:
# Same comparison for the TR labels, restricted to the presence vectorizer.
# `classifiers` is reused from the AG comparison cell; `vectorizers` and
# `results` are overwritten here.
vectorizers = dict(presence=lambda: nlp516.vectorizer.UnigramPresence(100))
results = eval_models(classifiers, vectorizers, 'TR', dataset)
display(results)