In [44]:
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [30]:
# constants
MAPS = {"no": 0, "yes": 1}
MIN_COUNT = 1
VECTOR_SIZE = 100
WINDOW = 5
SG = 1
EPOCHS = 20
N_NEIGHBORS = 20

In [13]:
# read train data and test data + mapping label
# vectorizer = CountVectorizer(token_pattern=TOKEN_PATTERN, min_df=MIN_DF)

data_train = pd.read_csv("../../data_worthcheck/train.csv")
x_train = data_train['text_a'].apply(lambda x: simple_preprocess(x))
y_train = data_train['label'].replace(MAPS)

data_test = pd.read_csv("../../data_worthcheck/test.csv")
x_test = data_test['text_a'].apply(lambda x: simple_preprocess(x))
y_test = data_test['label'].replace(MAPS)

In [51]:
# create word2vec model
start = time.time()
model_word2vec = Word2Vec(
    x_train,
    min_count=MIN_COUNT,
    vector_size=VECTOR_SIZE,
    window=WINDOW
)
end = time.time()
print("elapsed time: {}".format(end - start))

elapsed time: 4.3232080936431885


In [50]:
# create skip-gram model
start = time.time()
model_skipgram = Word2Vec(
    x_train,
    min_count=MIN_COUNT,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    sg=SG
)
end = time.time()
print("elapsed time: {}".format(end - start))

elapsed time: 8.417657613754272


In [53]:
# training models
start = time.time()
model_word2vec.train(x_train, total_examples=len(x_train), epochs=EPOCHS) # CBOW
end = time.time()
print("elapsed time: {}".format(end - start))

elapsed time: 9.490114688873291


In [52]:
start = time.time()
model_skipgram.train(x_train, total_examples=len(x_train), epochs=EPOCHS)
end = time.time()
print("elapsed time: {}".format(end - start))

elapsed time: 30.377943515777588


In [35]:
# predict with word2vec model
x_train_word2vec = np.array([np.mean([model_word2vec.wv[word] for word in words if word in model_word2vec.wv.key_to_index] or [np.zeros(VECTOR_SIZE)], axis=0) for words in x_train])
x_test_word2vec = np.array([np.mean([model_word2vec.wv[word] for word in words if word in model_word2vec.wv.key_to_index] or [np.zeros(VECTOR_SIZE)], axis=0) for words in x_test])

# predict with skip-gram model
x_train_skipgram = np.array([np.mean([model_skipgram.wv[word] for word in words if word in model_skipgram.wv.key_to_index] or [np.zeros(VECTOR_SIZE)], axis=0) for words in x_train])
x_test_skipgram = np.array([np.mean([model_skipgram.wv[word] for word in words if word in model_skipgram.wv.key_to_index] or [np.zeros(VECTOR_SIZE)], axis=0) for words in x_test])

In [38]:
# create KNN model
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# training KNN model
knn_word2vec = knn.fit(x_train_word2vec, y_train)
knn_skipgram = knn.fit(x_train_skipgram, y_train)

# predict with KNN model
y_pred_word2vec = knn_word2vec.predict(x_test_word2vec)
y_pred_skipgram = knn_skipgram.predict(x_test_skipgram)

# evaluate model
precision = precision_score(y_test, y_pred_word2vec)
recall = recall_score(y_test, y_pred_word2vec)
f1 = f1_score(y_test, y_pred_word2vec)
accuracy = accuracy_score(y_test, y_pred_word2vec)

print("Word2Vec")
print(confusion_matrix(y_test, y_pred_word2vec))
print('''Precision: {:.4f}
Recall: {:.4f}
F1 Score: {:.4f}
Accuracy: {:.4f}\n'''.format(
    precision, recall, f1, accuracy))

precision = precision_score(y_test, y_pred_skipgram)
recall = recall_score(y_test, y_pred_skipgram)
f1 = f1_score(y_test, y_pred_skipgram)
accuracy = accuracy_score(y_test, y_pred_skipgram)

print("Skip-gram")
print(confusion_matrix(y_test, y_pred_skipgram))
print('''Precision: {:.4f}
Recall: {:.4f}
F1 Score: {:.4f}
Accuracy: {:.4f}'''.format(
    precision, recall, f1, accuracy))

Word2Vec
[[1958  135]
 [ 351  356]]
Precision: 0.7251
Recall: 0.5035
F1 Score: 0.5943
Accuracy: 0.8264

Skip-gram
[[1917  176]
 [ 247  460]]
Precision: 0.7233
Recall: 0.6506
F1 Score: 0.6850
Accuracy: 0.8489
