In [97]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
import pandas as pd

In [61]:
data = pd.read_csv("datasets/processed_dataset.csv")

In [62]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,4,5.0,3.4,1.5,0.2,Iris-setosa


In [63]:
data.drop("Unnamed: 0", axis=1, inplace=True)
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,5.0,3.4,1.5,0.2,Iris-setosa


## Label Encoding 

In [64]:
le = LabelEncoder()

In [65]:
target = le.fit_transform(data["Species"])

In [66]:
data["Species"] = target
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,5.1,3.5,1.4,0.2,0
1,1,4.7,3.2,1.6,0.2,0
2,2,4.9,3.1,1.5,0.1,0
3,3,4.4,2.9,1.4,0.2,0
4,4,5.0,3.4,1.5,0.2,0


## Model oluşturmadan önce kontroller

In [67]:
# Model oluşturulmadan veride nan value ve object olmadığını kontrol etmiş olmamız gerekiyor.

In [68]:
data.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [69]:
data.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int32
dtype: object

In [70]:
#Id kolonuna artık ihtiyacımız kalmadı
data.drop("Id", axis=1, inplace=True)

In [71]:
# Her şey iyi gözüküyor
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0


## Eğitim ve test setlerinin ayarlanması

In [72]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2)

In [73]:
display(X_train.shape)
X_test.shape

(123, 4)

(31, 4)

In [74]:
y_train.value_counts()

1    44
2    42
0    37
Name: Species, dtype: int64

In [75]:
y_test.value_counts()

2    12
0    10
1     9
Name: Species, dtype: int64

## Modelin Oluşturulması

In [76]:
xgb_cls = xgb.XGBClassifier(objective="multi:softmax", num_class=3)

In [77]:
xgb_cls.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [78]:
preds = xgb_cls.predict(X_test)

In [79]:
accuracy_score(y_test, preds)

0.967741935483871

In [96]:
confusion_matrix(y_test, preds)

array([[10,  0,  0],
       [ 0,  8,  1],
       [ 0,  0, 12]], dtype=int64)