## Loading dataset

In [3]:
%%time
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

NONE_REPR = '__None__'                     # representation of none word - at the beginning and end of sentence

token_list = []
lemma_list = []
pos_list = []
is_negation_list = []

def add_NaN():
    token_list.append(NONE_REPR)
    token_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    is_negation_list.append(0)
    is_negation_list.append(0)


root = ET.parse('korpus_scraper/dataset-negiacie-z-korpusu.xml').getroot()

for sentence in root:
    add_NaN()
    
    for token in sentence:
        token_list.append(token.text)
        lemma_list.append(token.attrib['lemma'])
        pos_list.append(token.attrib['pos'])
        is_negation_list.append(1 if token.attrib['pos'].endswith('-') else 0)
    
    add_NaN()

df = pd.DataFrame.from_dict({
    'token': token_list, 
    'lemma': lemma_list, 
    'POS': pos_list, 
    'is_negation': is_negation_list,
})

# Split dataset into train and test sets
split = 0.75
train = df[:int(split*len(df))]
test = df[int(split*len(df)):]
X_train = train[['token', 'lemma', 'POS']]
X_test = test[['token', 'lemma', 'POS']]
y_train = train.is_negation
y_test = test.is_negation

CPU times: user 510 ms, sys: 139 ms, total: 649 ms
Wall time: 885 ms


## Vectorizing

In [5]:
%%time

# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect_lemma = CountVectorizer()
vect_pos = CountVectorizer()

# learn training data vocabulary, then use it to create a document-term matrix
vect_lemma.fit(X_train.lemma)
vect_pos.fit(X_train.POS)

# Prechadzam cely df a tvorim novy

new = []

for index, row in X_train.iterrows():
    dtm_lemma1 = vect_lemma.transform([row['lemma']])
    dtm_pos1 = vect_pos.transform([row['POS']])
    all_things = np.append(dtm_lemma1.toarray(), [dtm_pos1.toarray()])
    new.append(all_things)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [46]:
%%time

# transformuje list do dataframe
column_names = vect_lemma.get_feature_names() + vect_pos.get_feature_names()
df_new = pd.DataFrame(new, columns=column_names)

In [71]:
# Snaha je spojit dva riadky z dataframe

def add_prefix(prefix, iterable):
    for item in iterable:
        yield prefix + "_" + item

l = list(count_vect_df.iloc[0]) + list(count_vect_df.iloc[1])
c = list(add_prefix('slovo1', vect.get_feature_names())) + list(add_prefix('slovo2', vect.get_feature_names()))
pd.DataFrame([l], columns=c)

Unnamed: 0,slovo1_10,slovo1_1000,slovo1_103,slovo1_112,slovo1_12,slovo1_135,slovo1_1415,slovo1_15,slovo1_16,slovo1_17,...,slovo2_živel,slovo2_život,slovo2_životabudič,slovo2_životný,slovo2_živočíšny,slovo2_živý,slovo2_žiť,slovo2_žlto,slovo2_žobrať,slovo2_žrebčín
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
row1 = count_vect_df.iloc[[0]]
row2 = count_vect_df.iloc[[1]]
row1.columns = add_prefix('slovo1', row1.columns)
row2.columns = add_prefix('slovo2', row2.columns)
pd.concat([row1, row2], axis=1)

Unnamed: 0,slovo1_10,slovo1_1000,slovo1_103,slovo1_112,slovo1_12,slovo1_135,slovo1_1415,slovo1_15,slovo1_16,slovo1_17,...,slovo2_živel,slovo2_život,slovo2_životabudič,slovo2_životný,slovo2_živočíšny,slovo2_živý,slovo2_žiť,slovo2_žlto,slovo2_žobrať,slovo2_žrebčín
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training

In [51]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np

#assigning predictor and target variables
#x = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
#Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets 
model.fit(df_new, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
y_train[5]
new[5]

array([0, 0, 0, ..., 0, 0, 0])

In [56]:
#Predict Output 
predicted = model.predict([new[2]])
print(predicted)

[0]
