In [1]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Load text data from CSV file
data = pd.read_csv('csv/train.csv')

# Preprocess text data (tokenization, lowercasing)
data['Text'] = data['Text'].apply(lambda x: x.lower().split())  # Tokenization and lowercasing

# Build Word2Vec vocabulary
word2vec_model = Word2Vec(sentences=data['Text'].to_list(), vector_size=100, window=5, min_count=1, workers=4)
#word2vec_model.build_vocab(data['Text'])

# Train Word2Vec model
word2vec_model.train(data['Text'], total_examples=word2vec_model.corpus_count, epochs=10)  # Train the model

# Generate Word2Vec embeddings for each document
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

X = [average_word_vectors(words, word2vec_model, word2vec_model.wv.index_to_key, 100) for words in data['Text']]
y = data['Label']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression using Word2Vec embeddings
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predict on test set
y_pred = classifier.predict(X_test)

# Calculate accurary
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7758101142712672


In [2]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')


Precision: 0.7758
Recall: 0.7758
F1-score: 0.7758


In [3]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

  BACKGROUND       0.56      0.43      0.49     38306
 CONCLUSIONS       0.67      0.71      0.69     67498
     METHODS       0.83      0.88      0.85    143047
   OBJECTIVE       0.64      0.58      0.61     37057
     RESULTS       0.85      0.84      0.85    153135

    accuracy                           0.78    439043
   macro avg       0.71      0.69      0.70    439043
weighted avg       0.77      0.78      0.77    439043



In [4]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[[ 16662   9939   3346   7484    875]
 [  4992  47781   3040   2481   9204]
 [  1440   1863 125392   1830  12522]
 [  6230   4939   3861  21450    577]
 [   588   7105  15706    407 129329]]
