<a href="https://colab.research.google.com/github/couragedike1/Natural_Language_Processing-/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gensim
!pip install nltk scikit-learn numpy



In [2]:
# Import required libraries
import nltk
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# Download NLTK resources (run once)
# - 'punkt': tokenizer data (unpacked under tokenizers/ in the NLTK data directory)
# - 'stopwords': the corpus read by stopwords.words('english') in Step 2
# The return value of the last call is what the cell echoes as its output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Step 1: Prepare a small synthetic dataset
# Each review is written next to its sentiment label (1 = positive,
# 0 = negative) so a text and its label cannot drift out of alignment.
labelled_reviews = [
    ("This movie is absolutely fantastic and I loved it", 1),
    ("Great film with amazing acting and story", 1),
    ("Terrible movie, very boring and poorly made", 0),
    ("I hated this film, it was awful", 0),
    ("Wonderful movie, really enjoyed the plot", 1),
]

# Split the pairs back into the two parallel lists used by the later steps.
texts = [review for review, _ in labelled_reviews]
labels = [sentiment for _, sentiment in labelled_reviews]

In [5]:
# Step 2: Preprocess the text
# NLTK's English stop-word list, converted to a set so the membership check
# performed for every token in preprocess_text is a hash lookup.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess_text(text):
    """Turn a raw string into a list of cleaned tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; a token is
    kept only if it is purely alphabetic and is not in the module-level
    ``stop_words`` set.

    Args:
        text: The raw input string.

    Returns:
        A list of the surviving lower-case tokens, in their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        # Drop punctuation, numbers and mixed tokens such as "it's" fragments.
        if not candidate.isalpha():
            continue
        # Drop stop words.
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Clean every raw review; the result keeps the same order as `texts`.
processed_texts = list(map(preprocess_text, texts))

In [6]:
# Step 3: Train a Word2Vec model
# Deliberately no try/except here: if training fails, the exception should
# propagate so the notebook shows the full traceback and "Run All" stops at
# this cell. Calling exit() inside a Jupyter/Colab kernel would shut the
# kernel down instead of just halting the cell.
word2vec_model = Word2Vec(
    sentences=processed_texts,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # maximum distance between a target word and its context words
    min_count=1,      # keep every word: the corpus is only a handful of short sentences
    workers=1,        # single thread; multi-threaded training gives different vectors on each run
    seed=1,           # gensim's documented default, spelled out so the run is visibly seeded
)
# NOTE: gensim's documentation says fully deterministic runs additionally
# require a fixed PYTHONHASHSEED for the interpreter.

In [7]:
# Step 4: Create document embeddings
def get_document_embedding(text, model):
    """Average the word vectors of a tokenized document.

    Args:
        text: Iterable of tokens making up the document.
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``. If no token is found, a zero vector of length
        ``model.vector_size`` is returned instead.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in text if token in keyed_vectors]
    if not known:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Feature matrix: one averaged embedding per document, one document per row.
X = np.array([
    get_document_embedding(tokens, word2vec_model)
    for tokens in processed_texts
])

In [8]:
# Step 5: Train a classifier
# Logistic regression with default settings, fitted on the document embeddings.
clf = LogisticRegression()
y = np.array(labels)  # targets as an array, aligned row-for-row with X
clf.fit(X, y)

In [9]:
# Step 6: Test on a new example
# The new review goes through the same tokenize -> embed pipeline as the
# training texts before it is handed to the classifier.
test_text = "This movie was really great and enjoyable"
test_tokens = preprocess_text(test_text)
test_embedding = get_document_embedding(test_tokens, word2vec_model)

# predict() works on a batch, so wrap the single vector and unwrap the single answer.
predicted_batch = clf.predict([test_embedding])
prediction = predicted_batch[0]
sentiment_name = 'Positive' if prediction == 1 else 'Negative'

print(f"Test text: {test_text}")
print(f"Predicted sentiment: {sentiment_name}")

Test text: This movie was really great and enjoyable
Predicted sentiment: Positive


In [10]:
# Step 7: Evaluate
# NOTE: predictions are made on X, the same matrix clf was fitted on, so these
# are in-sample scores rather than an estimate of performance on unseen text.
y_pred = clf.predict(X)
in_sample_accuracy = accuracy_score(y, y_pred)
in_sample_f1 = f1_score(y, y_pred)
print(f"Accuracy: {in_sample_accuracy:.2f}")
print(f"F1-Score: {in_sample_f1:.2f}")

Accuracy: 0.60
F1-Score: 0.75


In [11]:
# Step 8: Explore word similarities
# Look up the three nearest neighbours of 'great' in the learned vector space.
print("\nSimilar words to 'great':")
try:
    similar_words = word2vec_model.wv.most_similar('great', topn=3)
except KeyError:
    # Raised when the query word never made it into the model's vocabulary.
    print("Word 'great' not in vocabulary due to small dataset.")
else:
    for word, score in similar_words:
        print(f"{word}: {score:.2f}")


Similar words to 'great':
wonderful: 0.16
story: 0.15
terrible: 0.07
