In [1]:
with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    labels = [int(l) for l in f.readlines() if l[0].isdigit()]

docs = []
with open("./data/raw/Hygiene/hygiene.dat") as f:
    for i in range(len(labels)):
        docs.append(f.readline().strip("\n"))

with open("./data/processed/train.txt", "w") as f:
    for i, _doc in enumerate(docs[0:int(len(docs)*0.8)]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")

with open("./data/processed/dev.txt", "w") as f:
    for i, _doc in enumerate(docs[int(len(docs)*0.8):int(len(docs)*0.9)]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")
        
with open("./data/processed/test.txt", "w") as f:
    for i, _doc in enumerate(docs[int(len(docs)*0.9):]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")

In [48]:
import pandas as pd

train_fasttext = pd.DataFrame()
train_df = pd.read_csv("~/Downloads/dbpedia_csv/train.csv", header=None)

train_fasttext['text'] = train_df.iloc[:,2]
train_fasttext['label'] = ["__label__" + str(i) for i in train_df.iloc[:,0]]
train_fasttext.to_csv("~/Downloads/dbpedia_csv/train_fasttext.csv", header = False, index = False, sep = ' ')

In [49]:
test_fasttext = pd.DataFrame()
test_df = pd.read_csv("~/Downloads/dbpedia_csv/test.csv", header=None)

test_fasttext['text'] = test_df.iloc[:,2]
test_fasttext['label'] = ["__label__" + str(i) for i in test_df.iloc[:,0]]
test_fasttext.to_csv("~/Downloads/dbpedia_csv/test_fasttext.csv", header = False, index = False,  sep = ' ')

In [72]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    result_stemmed = []
    for token in simple_preprocess(text, min_len = 2):
        result.append(token)
        if token not in STOPWORDS:
            result_stemmed.append(lemmatize_stemming(token))
    
    return (result, result_stemmed)


with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(i) for i in f.readlines()[0:546]]

LABELED_STEMMED_TEXTS = []
with open("./data/raw/Hygiene/hygiene.dat") as f:
    for i in tqdm(range(len(LABELS))):
        _text = f.readline()
        _result, _result_stemmed = preprocess(_text)
        LABELED_STEMMED_TEXTS.append(_result_stemmed)
        
train_text = LABELED_STEMMED_TEXTS[0:400]
train_label = LABELS[0:400]
test_text = LABELED_STEMMED_TEXTS[400:len(LABELS)]
test_label = LABELS[400:len(LABELS)]

100%|██████████| 546/546 [00:10<00:00, 48.46it/s]


In [67]:
_dictionary = corpora.Dictionary(train_text)
train_count = [_dictionary.doc2bow(_doc) for _doc in train_text]
test_count = [_dictionary.doc2bow(_doc) for _doc in test_text]

tfidf_model = models.TfidfModel(train_count)
train_tfidf = [_doc for _doc in tfidf_model[train_count]]
test_tfidf = [_doc for _doc in tfidf_model[test_count]]

In [73]:
labeled_train_count = [(dict(_doc), train_label[i]) for i, _doc in enumerate(train_count)]
labeled_test_count = [dict(_doc) for _doc in test_count]

labeled_train_tfidf = [(dict(_doc), train_label[i]) for i, _doc in enumerate(train_tfidf)]
labeled_test_tfidf = [dict(_doc) for _doc in test_tfidf]

In [75]:
from sklearn import metrics
from nltk.classify import NaiveBayesClassifier

NB_classifier_count = NaiveBayesClassifier.train(labeled_train_count)

predictions = NB_classifier_count.classify_many(labeled_test_count)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[18  7]
 [62 59]]
0.5273972602739726 0.8939393939393939 0.48760330578512395 0.6310160427807486


In [77]:
from nltk.classify import DecisionTreeClassifier

DT_classifier_count = DecisionTreeClassifier.train(labeled_train_count)
predictions = DT_classifier_count.classify_many(labeled_test_count)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[43 41]
 [37 25]]
0.4657534246575342 0.3787878787878788 0.4032258064516129 0.39062499999999994


In [83]:
from nltk.classify.maxent import TypedMaxentFeatureEncoding, MaxentClassifier

encoding = TypedMaxentFeatureEncoding.train(labeled_train_tfidf, 
                                                   count_cutoff=3, 
                                                   alwayson_features=True)
Maxent_classifier_count = MaxentClassifier.train(labeled_train_tfidf, 
                                           bernoulli=False, 
                                           encoding=encoding, 
                                           trace=0)

predictions = Maxent_classifier_count.classify_many(labeled_test_tfidf)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[ 0  0]
 [80 66]]
0.4520547945205479 1.0 0.4520547945205479 0.6226415094339622


In [3]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./data/processed'), 
                                                       test_file='test.txt', 
                                                       dev_file='dev.txt', 
                                                       train_file='train.txt')
word_embeddings = [WordEmbeddings('glove'), 
                   FlairEmbeddings('news-forward-fast'), 
                   FlairEmbeddings('news-backward-fast')]

document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
classifier = TextClassifier(document_embeddings, 
                            label_dictionary = corpus.make_label_dictionary(), 
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)
# trainer.train('./', max_epochs=10)
trainer.train('./data/model',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

  # This is added back by InteractiveShellApp.init_path()


2019-07-20 11:34:49,351 Reading data from data\processed
2019-07-20 11:34:49,353 Train: data\processed\train.txt
2019-07-20 11:34:49,355 Dev: data\processed\dev.txt
2019-07-20 11:34:49,356 Test: data\processed\test.txt


  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,
  max_tokens_per_doc=max_tokens_per_doc,


2019-07-20 11:35:07,820 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to C:\Users\geesi\AppData\Local\Temp\tmp55wazxz2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 160000128/160000128 [08:53<00:00, 299743.68B/s]


2019-07-20 11:44:03,323 copying C:\Users\geesi\AppData\Local\Temp\tmp55wazxz2 to cache at C:\Users\geesi\.flair\embeddings\glove.gensim.vectors.npy
2019-07-20 11:44:03,546 removing temp file C:\Users\geesi\AppData\Local\Temp\tmp55wazxz2
2019-07-20 11:44:05,366 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to C:\Users\geesi\AppData\Local\Temp\tmppzpjk47q


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21494764/21494764 [01:47<00:00, 200485.57B/s]


2019-07-20 11:45:54,267 copying C:\Users\geesi\AppData\Local\Temp\tmppzpjk47q to cache at C:\Users\geesi\.flair\embeddings\glove.gensim
2019-07-20 11:45:54,322 removing temp file C:\Users\geesi\AppData\Local\Temp\tmppzpjk47q
2019-07-20 11:45:54,327 this function is deprecated, use smart_open.open instead
2019-07-20 11:45:56,636 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to C:\Users\geesi\AppData\Local\Temp\tmpyq5_mlws


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19689779/19689779 [01:09<00:00, 285083.93B/s]


2019-07-20 11:47:07,245 copying C:\Users\geesi\AppData\Local\Temp\tmpyq5_mlws to cache at C:\Users\geesi\.flair\embeddings\lm-news-english-forward-1024-v0.2rc.pt
2019-07-20 11:47:07,290 removing temp file C:\Users\geesi\AppData\Local\Temp\tmpyq5_mlws
2019-07-20 11:47:15,050 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to C:\Users\geesi\AppData\Local\Temp\tmptx5hmvus


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19689779/19689779 [01:05<00:00, 299751.78B/s]


2019-07-20 11:48:22,304 copying C:\Users\geesi\AppData\Local\Temp\tmptx5hmvus to cache at C:\Users\geesi\.flair\embeddings\lm-news-english-backward-1024-v0.2rc.pt
2019-07-20 11:48:22,363 removing temp file C:\Users\geesi\AppData\Local\Temp\tmptx5hmvus


  app.launch_new_instance()


2019-07-20 11:48:22,474 {'0', '1'}
2019-07-20 11:48:22,481 ----------------------------------------------------------------------------------------------------
2019-07-20 11:48:22,482 Evaluation method: MICRO_F1_SCORE
2019-07-20 11:48:25,960 ----------------------------------------------------------------------------------------------------


RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 8.00 GiB total capacity; 6.13 GiB already allocated; 57.52 MiB free; 25.69 MiB cached)

In [None]:
from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves('./data/modelloss.tsv')
plotter.plot_weights('./data/model/weights.txt')

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load_from_file('./data/model/best-model.pt')
sentence = Sentence(" ".join(test_text[0]))
classifier.predict(sentence)
print(sentence.labels)