In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import string

In [11]:
# Load the spreadsheet into a DataFrame; later cells read its `clean` (text)
# and `etiket` (label) columns.
df = pd.read_excel(io="/content/etiketlidata.xlsx")

In [14]:
# Remove rows whose text or label is missing before building the arrays.
# Earlier attempts to fix the vectorizer error (str.lower() / astype('U') on
# `clean`) were overwritten by a later assignment to X and never took effect;
# dropping the NaN rows is presumably what resolved the error -- TODO confirm.
df = df.dropna(subset=['clean', 'etiket'])

# Cast to str *after* dropna so missing values are dropped rather than turned
# into the literal string 'nan', and so a non-string cell cannot break the
# vectorizer. No explicit lowercasing is needed: TfidfVectorizer lowercases
# its input by default.
X = df['clean'].astype(str).to_numpy()
y = df['etiket'].to_numpy()

# Hold out 20% of the rows for the final evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(input='content')

# Fit the vocabulary/IDF weights on the training texts only, then reuse them
# for the test texts. From here on X_train / X_test hold the TF-IDF matrices
# instead of the raw strings.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [15]:
# Hyperparams: criterion="gini", max_depth=2
# random_state is fixed because the tree permutes features at every split and
# breaks ties between equally good splits randomly; without a seed the score
# below can change from run to run.
clf = DecisionTreeClassifier(criterion="gini", max_depth=2, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out split (displayed as the cell output).
clf.score(X_test, y_test)

0.8441967530894112

In [16]:
# Hyperparams: criterion="entropy", max_depth=5
# random_state is fixed because the tree permutes features at every split and
# breaks ties between equally good splits randomly; without a seed the score
# below can change from run to run.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out split (displayed as the cell output).
clf.score(X_test, y_test)

0.8500121153380179

In [17]:
# CV with 5-folds
# The estimator is seeded so that the per-fold accuracies are reproducible;
# cross_val_score clones and refits it once per fold.
# NOTE(review): X_train is already TF-IDF transformed with a vocabulary fitted
# on the whole training split, so each validation fold has influenced the
# features -- consider a Pipeline(vectorizer, tree) on the raw texts.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=5)
# Array with one accuracy value per fold (displayed as the cell output).
scores

array([0.8555421 , 0.85095426, 0.85034838, 0.84883369, 0.85216601])

In [18]:
# Summarise the cross-validation folds: average accuracy and its spread.
fold_mean = scores.mean()
fold_std = scores.std()
print(fold_mean, fold_std)

0.8515688856652106 0.002257949244846465


In [19]:
# Grid search over the split criterion and the maximum tree depth (2..5),
# scored with 5-fold cross-validation on the training split.
# NOTE(review): scikit-learn documents "log_loss" and "entropy" as the same
# Shannon-information-gain criterion, so those candidates duplicate each other;
# the grid is kept as-is to preserve the original search space.
parameters = {"criterion": ["entropy", "gini", "log_loss"],
              "max_depth": range(2, 6)}
# The base estimator is seeded so that the candidate ranking (and therefore
# best_params_) is reproducible across runs.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), parameters,
                   cv=5, n_jobs=4, verbose=3)
clf.fit(X_train, y_train)
print("Best score:", clf.best_score_, "Best params:", clf.best_params_)
# best_estimator_ is the winning configuration refit on all of X_train
# (GridSearchCV's default refit behaviour).
tree = clf.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.8540528896498335 Best params: {'criterion': 'gini', 'max_depth': 5}


In [20]:
# Report the selected tree's accuracy on the training split first, then on the
# held-out test split; a large gap between the two would indicate overfitting.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(tree.score(features, labels))

0.8585968738640495
0.8507390356190938
