## Loading dataset

In [1]:
%%time
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

NONE_REPR = '__None__'                     # representation of none word - at the beginning and end of sentence

token_list = []
lemma_list = []
pos_list = []
is_negation_list = []


def add_nan():
    token_list.append(NONE_REPR)
    token_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    lemma_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    pos_list.append(NONE_REPR)
    is_negation_list.append(0)
    is_negation_list.append(0)


root = ET.parse('korpus_scraper/dataset-negiacie-z-korpusu.xml').getroot()

for sentence in root:
    add_nan()
    
    for token in sentence:
        token_list.append(token.text)
        lemma_list.append(token.attrib['lemma'])
        pos_list.append(token.attrib['pos'])
        is_negation_list.append(1 if token.attrib['pos'].endswith('-') else 0)
    
    add_nan()

df_start = pd.DataFrame.from_dict({
    'token': token_list, 
    'lemma': lemma_list, 
    'POS': pos_list, 
    'is_negation': is_negation_list,
})

# Split dataset into train and test sets
split = 0.75
train = df_start[:int(split*len(df_start))]
test = df_start[int(split*len(df_start)):]
X_train = train[['token', 'lemma', 'POS']]
X_test = test[['token', 'lemma', 'POS']]
y_train = train.is_negation
y_test = test.is_negation

CPU times: user 430 ms, sys: 111 ms, total: 541 ms
Wall time: 678 ms


## Vectorizing

In [62]:
%%time
from sklearn.feature_extraction.text import CountVectorizer


def add_prefix(prefix, iterable):
    for item in iterable:
        yield prefix + "_" + item


# learn training data vocabulary, then use it to create a document-term matrix
vect_lemma = CountVectorizer()
vect_pos = CountVectorizer()
vect_lemma.fit(X_train.lemma)
vect_pos.fit(X_train.POS)

# create list with the names of columns in dataframe
lemma_feat_names = list(vect_lemma.get_feature_names())
pos_feat_names = list(vect_pos.get_feature_names())
column_names = list(add_prefix('word1', lemma_feat_names)) + \
                list(add_prefix('word2', lemma_feat_names)) + \
                list(add_prefix('word3', lemma_feat_names)) + \
                list(add_prefix('word4', lemma_feat_names)) + \
                list(add_prefix('word5', lemma_feat_names)) + \
                list(add_prefix('word1', pos_feat_names)) + \
                list(add_prefix('word2', pos_feat_names)) + \
                list(add_prefix('word3', pos_feat_names)) + \
                list(add_prefix('word4', pos_feat_names)) + \
                list(add_prefix('word5', pos_feat_names))


def create_features_list(dataframe):
    feautures_list = []
    for index, row in dataframe.iterrows():
        if index >= dataframe.iloc[-4].name:
            break
        # LEMMA
        dtm_lemma1 = vect_lemma.transform([row['lemma']])
        dtm_lemma2 = vect_lemma.transform([dataframe.loc[index+1]['lemma']])
        dtm_lemma3 = vect_lemma.transform([dataframe.loc[index+2]['lemma']])
        dtm_lemma4 = vect_lemma.transform([dataframe.loc[index+3]['lemma']])
        dtm_lemma5 = vect_lemma.transform([dataframe.loc[index+4]['lemma']])
        # POS
        dtm_pos1 = vect_pos.transform([row['POS']])
        dtm_pos2 = vect_pos.transform([dataframe.loc[index+1]['POS']])
        dtm_pos3 = vect_pos.transform([dataframe.loc[index+2]['POS']])
        dtm_pos4 = vect_pos.transform([dataframe.loc[index+3]['POS']])
        dtm_pos5 = vect_pos.transform([dataframe.loc[index+4]['POS']])
        all_things = np.concatenate([
            dtm_lemma1.toarray()[0],
            dtm_lemma2.toarray()[0],
            dtm_lemma3.toarray()[0],
            dtm_lemma4.toarray()[0],
            dtm_lemma5.toarray()[0],
            dtm_pos1.toarray()[0],
            dtm_pos2.toarray()[0],
            dtm_pos3.toarray()[0],
            dtm_pos4.toarray()[0],
            dtm_pos5.toarray()[0],
        ])
        feautures_list.append(all_things)
    return feautures_list


def save_to_csv(filename, header_row, feautures_list):
    with open(filename, 'w') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_MINIMAL)
        wr.writerow(header_row)
        for row in feautures_list:
            wr.writerow(row)

CPU times: user 175 ms, sys: 3.24 ms, total: 179 ms
Wall time: 180 ms


In [64]:
%%time

# First we transform data into lists
X_train_features_list = create_features_list(X_train)
X_test_features_list = create_features_list(X_test)

CPU times: user 1min 28s, sys: 3.8 s, total: 1min 32s
Wall time: 1min 34s


In [65]:
%%time

# Now we save our data to csv files
save_to_csv('train.csv', column_names, X_train_features_list)
save_to_csv('test.csv', column_names, X_test_features_list)

CPU times: user 16min 35s, sys: 9.24 s, total: 16min 44s
Wall time: 16min 53s


In [67]:
%%time

# Load dataset
df_train = pd.DataFrame.from_csv('train.csv')
df_test = pd.DataFrame.from_csv('test.csv')

CPU times: user 4min 50s, sys: 30.4 s, total: 5min 20s
Wall time: 5min 33s


## Training

In [71]:
%%time
# Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB, MultinomialNB

model = MultinomialNB()

# Train the model using the training sets 
model.fit(df_train, y_train[:-4])

CPU times: user 3.95 s, sys: 5.66 s, total: 9.61 s
Wall time: 9.84 s


## Metrics

In [72]:
from sklearn import metrics

y_pred_class = model.predict(df_test)
acc = metrics.accuracy_score(y_test[:-4], y_pred_class)

print('Accuracy: {}'.format(acc))

Accuracy: 0.9649880095923261


## Playground

In [53]:
y_train[5]
new[5]

array([0, 0, 0, ..., 0, 0, 0])

In [56]:
#Predict Output 
predicted = model.predict([new[2]])
print(predicted)

[0]
