In [37]:
import pandas as pd
import numpy as np #import numpy to enable array functionality
from sklearn.pipeline import Pipeline #let's import the pipeline functionality
from sklearn.feature_extraction.text import CountVectorizer #and we will import a simple pre-processing method
from sklearn.feature_extraction.text import TfidfTransformer #and a representation learner
from sklearn.neighbors import KNeighborsClassifier #and a simple classifier model
from sklearn.model_selection import StratifiedKFold #cross fold is sometimes called k-fold. Calling the stratified version ensures that classes have equal representation across folds
from sklearn.metrics import accuracy_score #import an accuracy metric to tell us how well the model is doing
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lapos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lapos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df_train = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet")
df_test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/test-00000-of-00001.parquet")
df_unsupervised = pd.read_parquet("hf://datasets/stanfordnlp/imdb/plain_text/unsupervised-00000-of-00001.parquet")

### Explore the data

In [27]:
df_train.shape

(25000, 2)

In [28]:
df_test.shape

(25000, 2)

In [12]:
df_train.columns

Index(['text', 'label'], dtype='object')

In [14]:
df_test.columns

Index(['text', 'label'], dtype='object')

In [17]:
df_train['label'].unique()

array([0, 1])

In [19]:
df_train['text'].sample(10)

15504    One of the better Vance films succeeds more on...
15789    The Finnish version of Robert Altman's "Short ...
587      The producers made a big mistake casting Mark ...
16468    Now my friends, films like "La BÃªte" (aka "The...
20102    Kurt Russell IS Elvis, plain and simple. His d...
7048     I thought that I was never going to find a hor...
1583     STAR RATING: ***** Saturday Night **** Friday ...
16382    Undoubtedly one of the great John Ford's maste...
15755    Coming from the same director who'd done "Cand...
9831     I honestly can't believe what passes for enter...
Name: text, dtype: object

### Create functions to preprocess the text

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin




class pre_process(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        prep_sentences = []

        for text in X:
            # Remove HTML tags
            text = re.sub(r'<.*?>', '', text)

            # Tokenize
            tokens = word_tokenize(text)

            # Remove punctuation, lowercase, remove stopwords, and lemmatize in one pass
            processed = [
                self.lemmatizer.lemmatize(token.lower())
                for token in tokens
                if token.isalpha() and token.lower() not in self.stop_words
            ]

            # Join back to string
            prep_sentences.append(" ".join(processed))

        return prep_sentences


In [36]:
# Apply preprocessing to a sample
preprocessor = pre_process()
sample_data = df_train['text'].sample(10)
processed_sample = preprocessor.transform(sample_data)

# Display results
for i, (original, processed) in enumerate(zip(sample_data, processed_sample)):
    print(f"--- Sample {i+1} ---")
    print(f"Original: {original[:200]}...")  # First 200 chars
    print(f"Processed: {processed[:200]}...")  # First 200 chars
    print()

--- Sample 1 ---

--- Sample 2 ---
Original: I love Seth Green. His appearances on THat 70s' Show is always worth watching but last night, I felt the show needed to overhauled. Four single young guys inherit a New York City apartment that most o...
Processed: love seth green appearance show always worth watching last night felt show needed overhauled four single young guy inherit new york city apartment u would die grandmother must heiress space first plac...

--- Sample 3 ---
Original: I work as a hotel concierge in Washington DC and take my word, there was nothing remotely accurate about the character played by Michael J. Fox- # 1 we simply do not walk around with our pockets burst...
Processed: work hotel concierge washington dc take word nothing remotely accurate character played michael simply walk around pocket bursting theater ticket bill ever let anybody use room delight time fired spot...

--- Sample 4 ---
Original: <br /><br />One would expect a movie with a famous comedian i

In [40]:
text_clf = Pipeline([
  ('prep', pre_process()),
  ('count', CountVectorizer(max_features=300)),
  ('rep', TfidfTransformer()),
  ('mod', MultinomialNB()),
  ])
# X_train: your training features
# y_train: your training labels (target variable)
# X_test, y_test: your separate test set (not used in cross-validation)
X_train = df_train['text']
y_train = df_train['label']
X_test = df_test['text']
y_test = df_test['label']
acc_score = []


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Split the TRAINING data only
for train_idx, val_idx in kf.split(X_train, y_train):
    # Split training data into train and validation folds
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    text_clf.fit(X_train, y_train) #we then only fit the training data (note that we oapply the text_clf pipeline object, rather than having to go through each function separately)
    predictions = text_clf.predict(X_test) #and can predict on the test data (similar to above, we can predict using the pipeline directly)
    acc = accuracy_score(predictions, y_test) #we use the accuracy score we imported to give an idea how well the model is doing
    acc_score.append(acc) #we can append it to our list
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    print(classification_report(y_test, predictions))

print("Accuracy:", np.mean(acc_score)) #we can take the mean to get a good overview


              precision    recall  f1-score   support

           0       0.81      0.79      0.80     12500
           1       0.79      0.81      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     12500
           1       0.79      0.81      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

              precision    recall  f1-score   support

           0       0.81      0.79      0.80     12500
           1       0.79      0.81      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

              preci