In [28]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re
import numpy as np
from sklearn.linear_model import LogisticRegression


In [29]:
# word_tokenize() relies on NLTK's Punkt tokenizer data. Older NLTK releases
# load it from the "punkt" package, while recent releases (3.9+) look for the
# pickle-free "punkt_tab" package instead and raise LookupError without it.
# Fetching both keeps the notebook runnable from a fresh environment either way.
nltk.download("punkt")
nltk.download("punkt_tab")

# Manually labeled dataset (training data): five short reviews, each tagged
# with one of the three sentiment classes Positive / Negative / Neutral.
labeled_data = pd.DataFrame({
    "text": [
        "I love this product, it's amazing!",
        "Absolutely terrible, I hate it.",
        "The food was okay, but nothing special.",
        "This is the worst movie I have ever seen!",
        "Great customer service, very friendly staff."
    ],
    "label": ["Positive", "Negative", "Neutral", "Negative", "Positive"]
})
print(labeled_data)

                                           text     label
0            I love this product, it's amazing!  Positive
1               Absolutely terrible, I hate it.  Negative
2       The food was okay, but nothing special.   Neutral
3     This is the worst movie I have ever seen!  Negative
4  Great customer service, very friendly staff.  Positive


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Erenay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# Lowercase each labeled sentence, strip punctuation, and split it into word
# tokens; the result is the (tiny) corpus Word2Vec is trained on.
tokenized_sentences = [
    word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    for text in labeled_data['text']
]

# Train with a fixed seed and a single worker thread. Multi-threaded training
# (the original workers=4) introduces ordering jitter from OS thread
# scheduling, so the embeddings -- and everything derived from them below --
# would differ from run to run. seed=1 is gensim's default, made explicit here.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches -- confirm whether that matters for this setup.
word2vec_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word: the corpus is far too small to prune rare ones
    workers=1,
    seed=1,
)

# Persist the trained model next to the notebook and show one embedding.
word2vec_model.save("word2vec_labeling.model")
print(word2vec_model.wv["love"])

[-0.00713846  0.0012435  -0.00717887 -0.00224578  0.00371868  0.00583313
  0.00119666  0.00210257 -0.00411176  0.00722634 -0.00630901  0.00465065
 -0.00822265  0.00203721 -0.00497833 -0.00424626 -0.00310521  0.00565903
  0.00579819 -0.00497913  0.00077368 -0.00849669  0.00781208  0.00925818
 -0.00274128  0.00080249  0.00074291  0.00547578 -0.00860662  0.0005866
  0.00687291  0.00223417  0.00112475 -0.00932484  0.00848336 -0.00626395
 -0.00299447  0.00349414 -0.00077323  0.00140776  0.00178264 -0.00683217
 -0.00972261  0.00904076  0.00619794 -0.00691516  0.00340173  0.00020311
  0.00475436 -0.00712241  0.00402688  0.00434602  0.00995426 -0.00447361
 -0.00139072 -0.00731981 -0.00969978 -0.00908355 -0.00102616 -0.0065041
  0.00485221 -0.00616474  0.00252195  0.00074225 -0.00339018 -0.00097752
  0.00998255  0.00914521 -0.00446395  0.00908185 -0.0056444   0.00593039
 -0.00310098  0.00343292  0.00302053  0.00690109 -0.00237537  0.00877899
  0.00759061 -0.00954758 -0.0080064  -0.0076394   0.0

In [31]:
def sentence_to_vec(sentence, model):
    """Embed a sentence as the average of its in-vocabulary word vectors.

    The sentence is lowercased, stripped of punctuation and tokenized with
    NLTK's ``word_tokenize``; tokens that are not in the embedding vocabulary
    are ignored.

    Args:
        sentence: Raw text to embed.
        model: A trained gensim ``Word2Vec`` model, or its ``KeyedVectors``
            (i.e. ``model.wv``) passed directly.

    Returns:
        A 1-D NumPy array of length ``vector_size``. If none of the tokens is
        in the vocabulary, an all-zero vector with the same dtype as the
        stored embeddings is returned.
    """
    # Accept either the full model or bare KeyedVectors (which expose no ``.wv``).
    keyed_vectors = getattr(model, "wv", model)

    words = word_tokenize(re.sub(r'[^\w\s]', '', sentence.lower()))
    word_vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]

    if not word_vectors:
        # Use the embeddings' own dtype for the fallback so that stacking it
        # with averaged vectors does not silently upcast the feature matrix.
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)

    return np.mean(word_vectors, axis=0)  # Average word vectors to get sentence vector


In [32]:
# Embed every labeled sentence; row i of X_train belongs to label i of y_train
X_train = np.array([
    sentence_to_vec(sentence, word2vec_model)
    for sentence in labeled_data["text"].tolist()
])
y_train = labeled_data["label"]

# Sanity check: one row per sentence, one column per embedding dimension
print(X_train.shape)

(5, 100)


In [33]:


# Fit a logistic-regression classifier on the averaged Word2Vec sentence vectors.
# fit() hands back the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

print("Weak model trained successfully!")


Weak model trained successfully!


In [34]:
# Sentences without labels that the weak model will annotate
unlabeled_data = pd.Series(
    [
        "The product is good but expensive and so terrible",
        "Terrible experience, I will never come back!",
        "It was fine, nothing too impressive.",
        "Best purchase I've made in a long time.",
        "Not bad, but could be better.",
    ],
    name="text",
).to_frame()

# Embed each sentence with the same Word2Vec averaging used for the training set
X_unlabeled = np.array([
    sentence_to_vec(sentence, word2vec_model)
    for sentence in unlabeled_data["text"].tolist()
])

# Store the classifier's prediction for each sentence in a new column
unlabeled_data["predicted_label"] = model.predict(X_unlabeled)

print(unlabeled_data)

                                                text predicted_label
0  The product is good but expensive and so terrible        Negative
1       Terrible experience, I will never come back!        Negative
2               It was fine, nothing too impressive.        Negative
3            Best purchase I've made in a long time.        Positive
4                      Not bad, but could be better.        Positive
