## Text preprocessing

In [26]:
%%time
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

NONE_REPR = np.nan                     # representation of none word - at the beginning and end of sentence

token_list = []
lemma_list = []
pos_list = []
is_negation_list = []

def add_NaN():
    token_list.append(NONE_REPR)
    token_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    is_negation_list.append(0)
    is_negation_list.append(0)


root = ET.parse('korpus_scraper/dataset-negiacie-z-korpusu.xml').getroot()

for sentence in root:
    add_NaN()
    
    for token in sentence:
        token_list.append(token.text)
        lemma_list.append(token.attrib['lemma'])
        pos_list.append(token.attrib['pos'])
        is_negation_list.append(1 if token.attrib['pos'].endswith('-') else 0)
    
    add_NaN()

df = pd.DataFrame.from_dict({
    'token': token_list, 
    'lemma': lemma_list, 
    'POS': pos_list, 
    'is_negation': is_negation_list,
})

CPU times: user 224 ms, sys: 4 ms, total: 228 ms
Wall time: 234 ms


In [35]:
# Split dataset into train and test sets
split = 0.75
train = df[:int(split*len(df))]
test = df[int(split*len(df)):]
X_train = train[['token', 'lemma', 'POS']]
X_test = test[['token', 'lemma', 'POS']]
y_train = train.is_negation
y_test = test.is_negation

## Working with dataset

In [6]:
X_train

Unnamed: 0,token,lemma,pos
7595,Discovery,discovera,SSfs2
3032,mediálnej,mediálny,AAfs2x
4706,.,.,Z
9258,",",",",Z
9241,odmieta,odmietať,VKesc+
18656,o,o,Eu4
3190,mestu,mesto,SSns3
249,",",",",Z
19959,tým,to,PFns7
17379,18,18,0


In [7]:
X_train.describe()

Unnamed: 0,token,lemma,pos
count,15776,15776,15776
unique,6261,4313,556
top,",",",",Z
freq,1318,1318,2649


In [6]:
df.isnull().sum()

token          0
lemma          0
pos            0
is_negation    0
dtype: int64

## Vectorizing

In [8]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [10]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

# examine the document-term matrix
X_train_dtm

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [11]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

## Training

In [24]:
#Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np

#assigning predictor and target variables
#x = np.array([[-3,7],[1,5], [1,2], [-2,0], [2,3], [-4,0], [-1,1], [1,1], [-2,2], [2,7], [-4,1], [-2,7]])
#Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

In [25]:
#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets 
model.fit(x, y)

#Predict Output 
predicted = model.predict([['nevyplýva', 'nevyplývať', 'VKesc-']])
print(predicted)

ValueError: could not convert string to float: 'Z'