In [1]:
# Import necessary libraries
from datasets import load_dataset, concatenate_datasets
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Fetch the IMDb movie-review dataset from the Hugging Face hub
imdb = load_dataset(path='imdb')

In [3]:
# Merge the 'train' and 'test' splits (in that order) into a single dataset
imdb_combined = concatenate_datasets(
    [imdb[split_name] for split_name in ('train', 'test')]
)

In [4]:
# Shuffle the combined IMDb dataset with a fixed seed so the order is reproducible.
# NOTE: this rebinds imdb_combined to the shuffled result, so re-running only
# this cell shuffles the already-shuffled dataset again; re-run the
# concatenation cell first to get the same order back.
imdb_combined = imdb_combined.shuffle(seed=9)

In [5]:
# Convert the shuffled Hugging Face dataset to a pandas DataFrame
# (the preview below shows two columns: 'text' and 'label')
imdb_df = imdb_combined.to_pandas()

In [6]:
# Preview the first 5 reviews and their labels
print(imdb_df.head(5))

                                                text  label
0  Wow what a great premise for a film : Set it a...      0
1  Why watch this? There is only one reason and t...      0
2  Some people think this was a rather bad TV ser...      1
4  I saw this movie in the theater when it came o...      1


In [7]:
# Show the DataFrame dimensions as (rows, columns)
print(imdb_df.shape)

(50000, 2)


In [8]:
# Separate the review text (model input) from the sentiment label (target)
features, labels = imdb_df['text'], imdb_df['label']

In [9]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=9,
)

In [10]:
# Initialize a TfidfVectorizer that drops English stop words and ignores
# terms occurring in more than 40% of the documents (max_df=0.4).
# Nothing is fitted yet at this point.
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.4)

# Fit the vectorizer on the train set only, then reuse the learned vocabulary
# and IDF weights to transform the test set
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

# Display DataFrame of transformed train set.
# The sparse matrix is wrapped directly instead of calling .toarray(): a dense
# copy stores every (document, term) cell explicitly, which for 40,000
# documents times the whole vocabulary can exhaust kernel memory just to
# print a truncated preview.
print(pd.DataFrame.sparse.from_spmatrix(tfidf_train, columns=tfidf_vect.get_feature_names_out()))

        00  000  00000000000  00000001  00001  00015  000dm  000s  001  006  \
0      0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
1      0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
2      0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
3      0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
4      0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
...    ...  ...          ...       ...    ...    ...    ...   ...  ...  ...   
39995  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
39996  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
39997  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
39998  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   
39999  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0   

       ...  über  übermensch  überwoman  üvegtigris

In [11]:
# Train a logistic regression classifier on the TF-IDF train matrix
# (fit returns the fitted estimator, so construction and fitting are chained)
log_reg = LogisticRegression(random_state=9).fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = log_reg.predict(tfidf_test)
acc_score = accuracy_score(y_test, y_pred)
print('Accuracy Score: {:.1%}'.format(acc_score))

Accuracy Score: 89.5%


In [12]:
# Compare the first 10 true labels (top row) with the first 10 predictions (bottom row)
print(y_test.values[:10], y_pred[:10], sep='\n')

[0 1 0 1 0 0 1 0 1 0]
[1 1 0 1 0 0 1 0 1 0]


In [13]:
# Print the confusion matrix of true labels against predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[4365  590]
 [ 461 4584]]


In [14]:
# Print per-class precision, recall, F1-score and support
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4955
           1       0.89      0.91      0.90      5045

    accuracy                           0.89     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.89      0.89     10000



In [15]:
# Let's test it out with a review of our own.
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# sample is encoded with the same vocabulary the model was trained on.
text = ['That movie was so bad, not good at all.']
text_vect = tfidf_vect.transform(text)

In [16]:
# Predict the label for the custom review; a result of 0 means negative
log_reg.predict(text_vect)

array([0])