In [50]:
%%time
import pandas as pd
import xml.etree.ElementTree as ET

# Import dataset and convert it to Pandas dataframe
root = ET.parse('korpus_scraper/dataset-negiacie-z-korpusu.xml').getroot()

df = pd.DataFrame.from_records(
    [(token.text,
      1 if token.text.startswith('ne') else 0, 
      1 if token.attrib['pos'].endswith('-') else 0) for token in root.iter('token')],
    columns=('token', 'starts_with_ne', 'is_negation')
)

# Split to train and test
split = 0.8
train = df[:int(split*len(df))]
test = df[int((1-split)*len(df)):]

CPU times: user 180 ms, sys: 8 ms, total: 188 ms
Wall time: 187 ms


In [51]:
df.head()

Unnamed: 0,token,starts_with_ne,is_negation
0,Pretože,0,0
1,Marekovo,0,0
2,správanie,0,0
3,nebolo,1,1
4,prejavom,0,0


In [35]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = GaussianNB()
X = train[['starts_with_ne']]
y = train['is_negation']

model.fit(X, y)

GaussianNB()

In [37]:
p = model.predict([[1], [0], [0]])
p

array([1, 0, 0])

In [45]:
X_test = test[['starts_with_ne']]
y_test = test['is_negation']
y_pred_class = model.predict(X_test)

print(y_pred_class)

[1 0 0 ..., 0 1 0]


In [46]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.98472874205240957

In [47]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[15669,   156],
       [  101,   903]])

In [48]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[y_test < y_pred_class]

Unnamed: 0,starts_with_ne
4227,1
4263,1
4299,1
4303,1
4351,1
4385,1
4431,1
4463,1
4510,1
4512,1


In [56]:
# row by index
df.iloc[[4227]]

Unnamed: 0,token,starts_with_ne,is_negation
4227,nezabudnuteľný,1,0
