In [9]:
with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    labels = [int(l) for l in f.readlines() if l[0].isdigit()]

docs = []
with open("./data/raw/Hygiene/hygiene.dat") as f:
    for i in range(len(labels)):
        docs.append(f.readline().strip("\n"))

with open("./data/processed/train.txt", "w") as f:
    for i, _doc in enumerate(docs[0:int(len(docs)*0.8)]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")

with open("./data/processed/dev.txt", "w") as f:
    for i, _doc in enumerate(docs[int(len(docs)*0.8):int(len(docs)*0.9)]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")
        
with open("./data/processed/test.txt", "w") as f:
    for i, _doc in enumerate(docs[int(len(docs)*0.9):]):
        f.write("__label__" + str(labels[i]) + " " + _doc + "\n")

### For Bert

In [42]:
import pandas as pd

df_bert_train = pd.DataFrame({'user_id':0,
                              'label':labels[0:int(len(docs)*0.8)],
                              'alpha': 0,
                             'text':docs[0:int(len(docs)*0.8)]})
df_bert_train['text'] = df_bert_train['text'].replace(r'\n', ' ', regex=True).replace(r',', ' ', regex=True)

df_bert_dev = pd.DataFrame({'user_id':0,
                            'label':labels[int(len(docs)*0.8):int(len(docs)*0.9)],
                            'alpha': 0,
                           'text':docs[int(len(docs)*0.8):int(len(docs)*0.9)]})
df_bert_dev['text'] = df_bert_dev['text'].replace(r'\n', ' ', regex=True).replace(r',', ' ', regex=True)

df_bert_test = pd.DataFrame({'user_id':0,
                             'label':labels[int(len(docs)*0.9):],
                             'alpha': 0,
                            'text':docs[int(len(docs)*0.9):]})
df_bert_test['text'] = df_bert_test['text'].replace(r'\n', ' ', regex=True).replace(r',', ' ', regex=True)

df_bert_train.to_csv('./data/processed/train.tsv', sep='\t', index=False, header=False)
df_bert_dev.to_csv('./data/processed/dev.tsv', sep='\t', index=False, header=False)
df_bert_test.to_csv('./data/processed/test.tsv', sep='\t', index=False, header=True)

### For FastText

In [48]:
import pandas as pd

train_fasttext = pd.DataFrame()
train_df = pd.read_csv("~/Downloads/dbpedia_csv/train.csv", header=None)

train_fasttext['text'] = train_df.iloc[:,2]
train_fasttext['label'] = ["__label__" + str(i) for i in train_df.iloc[:,0]]
train_fasttext.to_csv("~/Downloads/dbpedia_csv/train_fasttext.csv", header = False, index = False, sep = ' ')

In [49]:
test_fasttext = pd.DataFrame()
test_df = pd.read_csv("~/Downloads/dbpedia_csv/test.csv", header=None)

test_fasttext['text'] = test_df.iloc[:,2]
test_fasttext['label'] = ["__label__" + str(i) for i in test_df.iloc[:,0]]
test_fasttext.to_csv("~/Downloads/dbpedia_csv/test_fasttext.csv", header = False, index = False,  sep = ' ')

In [72]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from tqdm import tqdm
import numpy as np
import pandas as pd


stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    result_stemmed = []
    for token in simple_preprocess(text, min_len = 2):
        result.append(token)
        if token not in STOPWORDS:
            result_stemmed.append(lemmatize_stemming(token))
    
    return (result, result_stemmed)


with open("./data/raw/Hygiene/hygiene.dat.labels") as f:
    LABELS = [int(i) for i in f.readlines()[0:546]]

LABELED_STEMMED_TEXTS = []
with open("./data/raw/Hygiene/hygiene.dat") as f:
    for i in tqdm(range(len(LABELS))):
        _text = f.readline()
        _result, _result_stemmed = preprocess(_text)
        LABELED_STEMMED_TEXTS.append(_result_stemmed)
        
train_text = LABELED_STEMMED_TEXTS[0:400]
train_label = LABELS[0:400]
test_text = LABELED_STEMMED_TEXTS[400:len(LABELS)]
test_label = LABELS[400:len(LABELS)]

100%|██████████| 546/546 [00:10<00:00, 48.46it/s]


In [67]:
_dictionary = corpora.Dictionary(train_text)
train_count = [_dictionary.doc2bow(_doc) for _doc in train_text]
test_count = [_dictionary.doc2bow(_doc) for _doc in test_text]

tfidf_model = models.TfidfModel(train_count)
train_tfidf = [_doc for _doc in tfidf_model[train_count]]
test_tfidf = [_doc for _doc in tfidf_model[test_count]]

In [73]:
labeled_train_count = [(dict(_doc), train_label[i]) for i, _doc in enumerate(train_count)]
labeled_test_count = [dict(_doc) for _doc in test_count]

labeled_train_tfidf = [(dict(_doc), train_label[i]) for i, _doc in enumerate(train_tfidf)]
labeled_test_tfidf = [dict(_doc) for _doc in test_tfidf]

In [75]:
from sklearn import metrics
from nltk.classify import NaiveBayesClassifier

NB_classifier_count = NaiveBayesClassifier.train(labeled_train_count)

predictions = NB_classifier_count.classify_many(labeled_test_count)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[18  7]
 [62 59]]
0.5273972602739726 0.8939393939393939 0.48760330578512395 0.6310160427807486


In [77]:
from nltk.classify import DecisionTreeClassifier

DT_classifier_count = DecisionTreeClassifier.train(labeled_train_count)
predictions = DT_classifier_count.classify_many(labeled_test_count)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[43 41]
 [37 25]]
0.4657534246575342 0.3787878787878788 0.4032258064516129 0.39062499999999994


In [83]:
from nltk.classify.maxent import TypedMaxentFeatureEncoding, MaxentClassifier

encoding = TypedMaxentFeatureEncoding.train(labeled_train_tfidf, 
                                                   count_cutoff=3, 
                                                   alwayson_features=True)
Maxent_classifier_count = MaxentClassifier.train(labeled_train_tfidf, 
                                           bernoulli=False, 
                                           encoding=encoding, 
                                           trace=0)

predictions = Maxent_classifier_count.classify_many(labeled_test_tfidf)

print(metrics.confusion_matrix(predictions, test_label))
print(metrics.accuracy_score(predictions, test_label),
            metrics.precision_score(predictions, test_label),
            metrics.recall_score(predictions, test_label),
            metrics.f1_score(predictions, test_label))

[[ 0  0]
 [80 66]]
0.4520547945205479 1.0 0.4520547945205479 0.6226415094339622


In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./data/processed'), 
                                                       test_file='test.txt', 
                                                       dev_file='dev.txt', 
                                                       train_file='train.txt')
word_embeddings = [WordEmbeddings('glove'), 
                   FlairEmbeddings('news-forward-fast'), 
                   FlairEmbeddings('news-backward-fast')]

document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
classifier = TextClassifier(document_embeddings, 
                            label_dictionary = corpus.make_label_dictionary(), 
                            multi_label=False)
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

2019-07-19 19:04:23,620 Reading data from data/processed
2019-07-19 19:04:23,623 Train: data/processed/train.txt
2019-07-19 19:04:23,625 Dev: data/processed/dev.txt
2019-07-19 19:04:23,627 Test: data/processed/test.txt


  # This is added back by InteractiveShellApp.init_path()
  max_tokens_per_doc=max_tokens_per_doc,


In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load_from_file('./best-model.pt')
sentence = Sentence(" ".join(test_text[0]))
classifier.predict(sentence)
print(sentence.labels)