In [3]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# --- Imports for your class ---
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# --- Download NLTK data (only needs to run once per environment) ---
# 'punkt' backs word_tokenize on older NLTK releases; newer releases look up
# 'punkt_tab' instead, so both are requested to keep tokenization working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word list and WordNet data used by the pre-processing class.
nltk.download('stopwords')
nltk.download('wordnet')

# --- 1. Your Pre-processing Class ---
class pre_process(BaseEstimator, TransformerMixin):
    """Clean raw text documents and return them as space-joined token strings.

    For every document the transformer strips HTML-like tags, tokenizes the
    text with ``word_tokenize``, keeps only alphabetic tokens that are not
    English stop words, lower-cases and lemmatizes the survivors, and joins
    them back into a single string.
    """

    # Non-greedy match of anything between '<' and '>' on a single line.
    _TAG_PATTERN = re.compile(r'<.*?>')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X, y=None):
        """Return one cleaned, space-separated string per document in ``X``.

        Args:
            X: Iterable of raw text strings.
            y: Ignored; accepted so the signature matches ``fit``.

        Returns:
            A list of strings, in the same order as ``X``. A document with
            no surviving tokens yields an empty string.
        """
        prep_sentences = []
        for text in X:
            # Replace tags with a space rather than nothing: deleting a tag
            # outright glues its neighbours together ("great<br/>movie" ->
            # "greatmovie"), which corrupts or loses both words.
            text = self._TAG_PATTERN.sub(' ', text)

            tokens = word_tokenize(text)

            # Stop words are matched on the lower-cased token, before
            # lemmatization; non-alphabetic tokens (punctuation, numbers)
            # are discarded.
            processed = [
                self.lemmatizer.lemmatize(token.lower())
                for token in tokens
                if token.isalpha() and token.lower() not in self.stop_words
            ]

            # Join back to a string, which the next step (Averager) expects
            prep_sentences.append(" ".join(processed))

        return prep_sentences

# --- 2. The Word2VecAverager Class (Unchanged) ---
class Word2VecAverager(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its words' Word2Vec vectors.

    Args:
        w2v_model: Trained model exposing keyed vectors as ``.wv``; the
            vectors' dimensionality is read from ``.wv.vector_size``.
    """

    def __init__(self, w2v_model):
        self.w2v_model = w2v_model
        # Cached at construction time and used for the all-zero fallback row.
        self.vector_size = w2v_model.wv.vector_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the Word2Vec model is pre-trained."""
        return self

    def transform(self, X):
        """Average the vectors of the in-vocabulary words of every document.

        Args:
            X: Iterable of whitespace-separated (already pre-processed)
                strings.

        Returns:
            A 2-D array with one row per document. A document with no
            in-vocabulary word becomes a row of zeros. Empty ``X`` yields an
            array of shape ``(0, vector_size)``.
        """
        keyed_vectors = self.w2v_model.wv
        avg_vectors = []
        for doc in X:
            # Out-of-vocabulary words are skipped rather than raising.
            doc_vectors = [
                keyed_vectors[word]
                for word in doc.split()
                if word in keyed_vectors
            ]

            if doc_vectors:
                avg_vectors.append(np.mean(doc_vectors, axis=0))
            else:
                avg_vectors.append(np.zeros(self.vector_size))

        if not avg_vectors:
            # np.array([]) would be 1-D with shape (0,); keep the result 2-D
            # so downstream steps always see (n_documents, vector_size).
            return np.empty((0, self.vector_size))

        return np.array(avg_vectors)

# --- 3. Sample Data ---
# Each review is kept next to its sentiment label so the two parallel lists
# built below cannot drift out of alignment.
_labelled_reviews = (
    ("this movie is fantastic and great", "positive"),
    ("I love this film", "positive"),
    ("what a wonderful performance", "positive"),
    ("this movie is terrible", "negative"),
    ("I hate this film", "negative"),
    ("what a boring and awful movie", "negative"),
)
X_train_text = [review for review, _ in _labelled_reviews]
y_train = [label for _, label in _labelled_reviews]

# --- 4. CRITICAL: Train Word2Vec on Processed Text ---
# The embedding vocabulary has to be built from the same cleaned tokens the
# pipeline will later look up, so the raw text goes through the pre-processor
# before Word2Vec ever sees it.

pre_processor = pre_process()
processed_X_train = pre_processor.fit_transform(X_train_text)

# Word2Vec wants a list of token lists; the processor emits joined strings.
tokenized_processed_train = list(map(str.split, processed_X_train))

print(
    "--- Processed Text for W2V Training ---",
    tokenized_processed_train,
    "-----------------------------------------",
    sep="\n",
)

# min_count=1 keeps every word of the training tokens in the vocabulary.
w2v_model = Word2Vec(
    sentences=tokenized_processed_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# --- 5. Build and Use the Full Pipeline ---

# Three stages, each handed over as a ready-made object:
#   raw text -> cleaned strings -> averaged word vectors -> linear SVM
pipeline = Pipeline(steps=[
    ("preprocessor", pre_processor),
    ("vectorizer", Word2VecAverager(w2v_model=w2v_model)),
    ("classifier", SVC(kernel="linear")),
])

# Fit on the *raw* sentences; cleaning happens inside the pipeline.
pipeline.fit(X_train_text, y_train)

# --- 6. Test the Pipeline ---
X_test_text = ["what a great film", "this film was awful"]

predictions = pipeline.predict(X_test_text)
print(f"Test Data: {X_test_text}")
print(f"Predictions: {predictions}")

[['this', 'movie', 'is', 'fantastic', 'and', 'great'], ['I', 'love', 'this', 'film'], ['what', 'a', 'wonderful', 'performance'], ['this', 'movie', 'is', 'terrible'], ['I', 'hate', 'this', 'film'], ['what', 'a', 'boring', 'and', 'awful', 'movie']]
[array([-5.3622725e-04,  2.3643136e-04,  5.1033497e-03,  9.0092728e-03,
       -9.3029495e-03, -7.1168090e-03,  6.4588725e-03,  8.9729885e-03,
       -5.0154282e-03, -3.7633716e-03,  7.3805046e-03, -1.5334714e-03,
       -4.5366134e-03,  6.5540518e-03, -4.8601604e-03, -1.8160177e-03,
        2.8765798e-03,  9.9187379e-04, -8.2852151e-03, -9.4488179e-03,
        7.3117660e-03,  5.0702621e-03,  6.7576934e-03,  7.6286553e-04,
        6.3508903e-03, -3.4053659e-03, -9.4640139e-04,  5.7685734e-03,
       -7.5216377e-03, -3.9361035e-03, -7.5115822e-03, -9.3004224e-04,
        9.5381187e-03, -7.3191668e-03, -2.3337686e-03, -1.9377411e-03,
        8.0774371e-03, -5.9308959e-03,  4.5162440e-05, -4.7537340e-03,
       -9.6035507e-03,  5.0072931e-03, -8.