In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import string

In [None]:
# Load the dataset workbook ("temizlenmis_veriler" is Turkish for "cleaned data").
# pd.read_excel reads the first sheet by default.
# NOTE(review): absolute path — looks like a Colab working directory; adjust
# when running elsewhere.
DATA_PATH = "/content/temizlenmis_veriler.xlsx"
df = pd.read_excel(DATA_PATH)

In [None]:
# Inputs are the documents in the "clean" column; targets come from the
# "etiket" column ("etiket" is Turkish for "label").
X = df["clean"].to_numpy()
y = df["etiket"].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
texts_train, texts_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# Fit the TF-IDF vocabulary and weights on the training texts only, then apply
# the same fitted mapping to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [None]:
# Baseline: shallow decision tree (criterion="gini", max_depth=2).
# random_state is fixed (same seed as the train/test split) because
# scikit-learn randomly permutes the features at every split, so ties between
# equally good splits would otherwise be broken differently on each run and
# the reported score would not be reproducible.
clf = DecisionTreeClassifier(criterion="gini", max_depth=2, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test split (shown as the cell output).
clf.score(X_test, y_test)

0.48

In [None]:
# Deeper tree with the information-gain criterion
# (criterion="entropy", max_depth=5).
# random_state is fixed (same seed as the train/test split): scikit-learn
# permutes the features at every split, so without a seed tied splits can be
# resolved differently between runs and this score could not be compared
# reliably with the other configurations.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out test split (shown as the cell output).
clf.score(X_test, y_test)

0.56

In [None]:
# 5-fold cross-validation of the entropy / depth-5 tree on the training split.
# The tree is seeded so that repeated runs yield the same per-fold scores
# (scikit-learn permutes features at each split, which otherwise makes
# tie-breaking between equally good splits run-dependent).
# NOTE: X_train was TF-IDF-transformed before this cell using every training
# document, so each validation fold has already contributed to the vocabulary
# and IDF weights. For strictly leak-free estimates, cross-validate a Pipeline
# (vectorizer + tree) on the raw texts instead.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)
# One accuracy value per fold (shown as the cell output).
scores

array([0.6   , 0.625 , 0.575 , 0.6625, 0.6375])

In [None]:
# Summarise the cross-validation folds: mean accuracy and its spread
# (standard deviation across folds).
cv_mean = scores.mean()
cv_std = scores.std()
print(cv_mean, cv_std)

0.6199999999999999 0.030207614933986437


In [None]:
# Grid search over split criterion and tree depth (2..5) with 5-fold CV.
# "log_loss" is intentionally not listed: in scikit-learn it selects the same
# Shannon information-gain criterion as "entropy", so it only duplicated those
# candidates (and is rejected by scikit-learn versions older than 1.1).
parameters = {"criterion": ["entropy", "gini"],
              "max_depth": range(2, 6)}
# The base estimator is seeded (same seed as the train/test split) so that
# candidates are compared on identical, reproducible fits rather than on
# run-dependent tie-breaking between equally good splits.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), parameters,
                   cv=5, n_jobs=4, verbose=3)
clf.fit(X_train, y_train)
print("Best score:", clf.best_score_, "Best params:", clf.best_params_)
# With the default refit=True, best_estimator_ is the best configuration
# refitted on the whole training split.
tree = clf.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.6224999999999999 Best params: {'criterion': 'gini', 'max_depth': 5}


In [None]:
# Accuracy of the selected tree on the data it was fitted on versus the
# held-out test split; a large gap between the two indicates overfitting.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

0.7275
0.54
