In [1]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the dataset
imdb = load_dataset('imdb')

In [3]:
# Combine train and test split
imdb_combined = concatenate_datasets([imdb['train'], imdb['test']])

In [4]:
# Shuffle the combined IMDB dataset
imdb_combined = imdb_combined.shuffle(seed=9)

In [5]:
# Convert to pandas DataFrame
imdb_df = imdb_combined.to_pandas()

In [6]:
# Display first 5 rows
print(imdb_df.head())

                                                text  label
0  Wow what a great premise for a film : Set it a...      0
1  Why watch this? There is only one reason and t...      0
2  Some people think this was a rather bad TV ser...      1
4  I saw this movie in the theater when it came o...      1


In [7]:
print(imdb_df.shape)

(50000, 2)


In [8]:
# Get labels and features
features = imdb_df['text']
labels = imdb_df['label']

In [9]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=9)

In [10]:
# Initialize a TfidfVectorizer instance
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.75)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)

In [11]:
# Create DataFrame using the Tfidf vectorizer and transformed train dataset
print((pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vect.get_feature_names_out())).head())

    00  000  00000000000  00000001  00001  00015  000dm  000s  001  006  ...  \
0  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0  ...   
1  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0  ...   
2  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0  ...   
3  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0  ...   
4  0.0  0.0          0.0       0.0    0.0    0.0    0.0   0.0  0.0  0.0  ...   

   über  übermensch  überwoman  üvegtigris  üzümcü  þorleifsson  þór  \
0   0.0         0.0        0.0         0.0     0.0          0.0  0.0   
1   0.0         0.0        0.0         0.0     0.0          0.0  0.0   
2   0.0         0.0        0.0         0.0     0.0          0.0  0.0   
3   0.0         0.0        0.0         0.0     0.0          0.0  0.0   
4   0.0         0.0        0.0         0.0     0.0          0.0  0.0   

   żmijewski  יגאל  כרמון  
0        0.0   0.0    0.0  
1        0.0   0.0    0.0  
2 

In [12]:
# Initialize a PassiveAggressiveClassifier instance and fit to training set
# pac = PassiveAggressiveClassifier(max_iter=50, random_state=9)
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=9)
log_reg.fit(tfidf_train, y_train)

# Predict on the test set and caculate accuracy
y_pred = log_reg.predict(tfidf_test)
acc_score = accuracy_score(y_test, y_pred)
print('Accuracy Score: {:.2f}%'.format(acc_score))

Accuracy Score: 0.89%


In [13]:
print(y_test.values[:10])
print(y_pred[:10])

[0 1 0 1 0 0 1 0 1 0]
[1 1 0 1 0 0 1 0 1 0]
