## Loading dataset

In [1]:
%%time
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

NONE_REPR = '__None__'                     # representation of none word - at the beginning and end of sentence

token_list = []
lemma_list = []
pos_list = []
is_negation_list = []

def add_NaN():
    token_list.append(NONE_REPR)
    token_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    is_negation_list.append(0)
    is_negation_list.append(0)


root = ET.parse('korpus_scraper/dataset-negiacie-z-korpusu.xml').getroot()

for sentence in root:
    add_NaN()
    
    for token in sentence:
        token_list.append(token.text)
        lemma_list.append(token.attrib['lemma'])
        pos_list.append(token.attrib['pos'])
        is_negation_list.append(1 if token.attrib['pos'].endswith('-') else 0)
    
    add_NaN()

df = pd.DataFrame.from_dict({
    'token': token_list, 
    'lemma': lemma_list, 
    'POS': pos_list, 
    'is_negation': is_negation_list,
})

CPU times: user 445 ms, sys: 133 ms, total: 578 ms
Wall time: 832 ms


In [2]:
# Split dataset into train and test sets
split = 0.75
train = df[:int(split*len(df))]
test = df[int(split*len(df)):]
X_train = train[['token', 'lemma', 'POS']]
X_test = test[['token', 'lemma', 'POS']]
y_train = train.is_negation
y_test = test.is_negation

## Working with dataset

In [3]:
X_train.describe()

Unnamed: 0,token,lemma,POS
count,18776,18776,18776
unique,6229,4294,562
top,__None__,__None__,__None__
freq,2962,2962,2962


In [4]:
df.isnull().sum()

POS            0
is_negation    0
lemma          0
token          0
dtype: int64

## Vectorizing

In [5]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train.lemma)
X_train_lemma_dtm = vect.transform(X_train.lemma)

In [6]:
vect.get_feature_names()

['10',
 '1000',
 '103',
 '112',
 '12',
 '135',
 '1415',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1912',
 '1948',
 '1969',
 '1990',
 '2000',
 '2002',
 '2007',
 '2009',
 '27',
 '28',
 '29',
 '30',
 '300',
 '31',
 '49',
 '50',
 '500',
 '5000',
 '529',
 '62',
 '80',
 '86',
 '89',
 '92',
 '94',
 '96',
 '99',
 '__none__',
 'abrosimov',
 'absolútne',
 'absolútny',
 'aby',
 'adekvátny',
 'adelaide',
 'adeptka',
 'adolescent',
 'adresa',
 'advokát',
 'adámika',
 'afinér',
 'afp',
 'agent',
 'agentúra',
 'agresívny',
 'aj',
 'ak',
 'akadémia',
 'akceptovať',
 'akcia',
 'akciový',
 'ako',
 'akoby',
 'akokoľvek',
 'akonáhle',
 'akosi',
 'akreditácia',
 'aktivista',
 'aktivistka',
 'aktivita',
 'aktívny',
 'akútny',
 'aký',
 'akýkoľvek',
 'akýsi',
 'ale',
 'alebo',
 'alena',
 'alex',
 'alexander',
 'alfonz',
 'aliancia',
 'alica',
 'ambulancia',
 'ambícia',
 'americký',
 'amerika',
 'američan',
 'amp',
 'analýza',
 'andrassy',
 'andrew',
 'android',
 'angažovať',
 'anglicky',
 'anglický',
 'ani',
 'ank

In [7]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<3x4250 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [17]:
X_train_lemma_dtm.todense().shape
count_vect_df = pd.DataFrame(X_train_lemma_dtm.todense(), columns=vect.get_feature_names())

In [18]:
count_vect_df

Unnamed: 0,10,1000,103,112,12,135,1415,15,16,17,...,živel,život,životabudič,životný,živočíšny,živý,žiť,žlto,žobrať,žrebčín
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training

In [8]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np

#assigning predictor and target variables
#x = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
#Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

In [9]:
#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets 
model.fit(x, y)

#Predict Output 
predicted = model.predict([['nevyplýva', 'nevyplývať', 'VKesc-']])
print(predicted)

NameError: name 'x' is not defined