### <b>IMDB Decision Tree Classifier</b> by [codyh587](https://github.com/codyh587)

#### Data Setup and Vectorization

In [15]:
import gc

import matplotlib.pyplot as plt
import numpy as np

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Single seed reused below for the dataset shuffle, the train/validation
# split, and the decision tree's random_state, so reruns are reproducible.
seed = 1234
# Also seed NumPy's global RNG for any code that draws from it implicitly.
np.random.seed(seed)

In [16]:
# for low memory VMs, rerunning this may require restarting the kernel
def load_imdb_dataset(valid_size=0.25):
    """Load the IMDB reviews and return TF-IDF train/validation/test splits.

    The Hugging Face "imdb" dataset is shuffled with the notebook-level
    ``seed``, the review text is TF-IDF vectorized with a vocabulary learned
    from the training split, and a stratified validation subset is then
    held out of the training rows.

    Args:
        valid_size: Portion of the training split to hold out for
            validation; passed to ``train_test_split`` as ``test_size``.
            Defaults to 0.25, the value this notebook has always used.

    Returns:
        A 6-tuple ``(train_texts, train_labels, valid_texts, valid_labels,
        test_texts, test_labels)``. The ``*_texts`` entries are TF-IDF
        document-term matrices sharing one vocabulary; the ``*_labels``
        entries are NumPy arrays built from the dataset's "label" column.
    """
    # Free anything left over from a previous call before loading again.
    gc.collect()

    dataset = load_dataset("imdb").shuffle(seed=seed)

    # fit_transform learns the vocabulary/IDF weights and vectorizes the
    # training text in one pass; it is equivalent to fit() followed by
    # transform() but avoids tokenizing (and materializing) the corpus twice.
    # NOTE(review): the vectorizer is fitted before the validation rows are
    # held out, so its IDF statistics include validation documents — confirm
    # that is acceptable for this analysis.
    vectorizer = TfidfVectorizer()
    train_texts = vectorizer.fit_transform(dataset["train"]["text"])
    train_labels = np.array(dataset["train"]["label"])

    test_texts = vectorizer.transform(dataset["test"]["text"])
    test_labels = np.array(dataset["test"]["label"])

    # The raw dataset and the fitted vectorizer are no longer needed; drop
    # them before the split allocates more memory.
    del dataset, vectorizer
    gc.collect()

    # Stratify so both subsets keep the same label balance.
    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        train_texts,
        train_labels,
        test_size=valid_size,
        random_state=seed,
        stratify=train_labels
    )

    return (
        train_texts,
        train_labels,
        valid_texts,
        valid_labels,
        test_texts,
        test_labels
    )

In [17]:
# Label encoding for every *_y array: 0 = negative review, 1 = positive review.
(
    train_X, train_y,
    valid_X, valid_y,
    test_X, test_y,
) = load_imdb_dataset()

In [None]:
# Sanity check: collect one line per split, then emit the report in one call.
shape_report = [
    f"Training texts shape: {train_X.shape}",
    f"Training labels shape: {train_y.shape}",
    f"Valid texts shape: {valid_X.shape}",
    f"Valid labels shape: {valid_y.shape}",
    f"Testing texts shape: {test_X.shape}",
    f"Testing labels shape: {test_y.shape}",
]
print("\n".join(shape_report))

Training texts shape: (18750, 74849)
Training labels shape: (18750,)
Valid texts shape: (6250, 74849)
Valid labels shape: (6250,)
Testing texts shape: (25000, 74849)
Testing labels shape: (25000,)


: 

#### Finding Optimal Depth

In [None]:
depths = [i for i in range(1, 30)]
train_errors = []
test_errors = []

for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=seed)
    clf.fit(train_X, train_y)

    train_pred = clf.predict(train_X)
    test_pred = clf.predict(test_X)

    train_errors.append(1 - accuracy_score(train_y, train_pred))
    test_errors.append(1 - accuracy_score(test_y, test_pred))

In [None]:

plt.plot(depths, train_errors, label='Training Error')
plt.plot(depths, test_errors, label='Testing Error')
plt.title('Decision Tree Error vs. Maximum Tree Depth')
plt.xlabel('Maximum Tree Depth')
plt.ylabel('Error Rate')
plt.legend()
plt.show()