In [1]:
# package imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

### DATA LOAD

In [3]:
# Load the pumpkin-seed measurements from the Excel workbook
# (the path is relative to the notebook's working directory).
df = pd.read_excel(io='Pumpkin_Seeds_Dataset.xlsx')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


### DATA PREPROCESSING

In [9]:
# Report the (rows, columns) dimensions of the loaded frame.
print(df.shape)

(2500, 13)


In [7]:
# What kind of features do I have?
# Lists every column label; the last one, 'Class', is the target used later.
print(df.columns)

Index(['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length',
       'Convex_Area', 'Equiv_Diameter', 'Eccentricity', 'Solidity', 'Extent',
       'Roundness', 'Aspect_Ration', 'Compactness', 'Class'],
      dtype='object')


In [10]:
# Missing values: drop every row that contains at least one NaN.
# This mutates df in place, so the raw frame is not recoverable afterwards
# without re-running the load cell.
# NOTE(review): the shape printed before this cell and the Series length
# printed after it are both 2500 on the recorded run, so no rows appear to
# have been removed — confirm if the dataset changes.
df.dropna(inplace = True)

In [12]:
# Inspect the target column; its values are string class labels.
print(df['Class'])

0          Çerçevelik
1          Çerçevelik
2          Çerçevelik
3          Çerçevelik
4          Çerçevelik
            ...      
2495    Ürgüp Sivrisi
2496    Ürgüp Sivrisi
2497    Ürgüp Sivrisi
2498    Ürgüp Sivrisi
2499    Ürgüp Sivrisi
Name: Class, Length: 2500, dtype: object


In [13]:
# Separate the target column from the predictors.
y = df['Class']                 # labels
X = df.drop(columns='Class')    # every remaining column is a feature

In [19]:
# Preview the feature matrix. A bare last expression uses the notebook's rich
# table display (as the df.head() cell does) instead of the line-wrapped
# plain-text dump that print() produces for a wide DataFrame.
X.head()

    Area  Perimeter  Major_Axis_Length  Minor_Axis_Length  Convex_Area  \
0  56276    888.242           326.1485           220.2388        56831   
1  76631   1068.146           417.1932           234.2289        77280   
2  71623   1082.987           435.8328           211.0457        72663   
3  66458    992.051           381.5638           222.5322        67118   
4  66107    998.146           383.8883           220.4545        67117   

   Equiv_Diameter  Eccentricity  Solidity  Extent  Roundness  Aspect_Ration  \
0        267.6805        0.7376    0.9902  0.7453     0.8963         1.4809   
1        312.3614        0.8275    0.9916  0.7151     0.8440         1.7811   
2        301.9822        0.8749    0.9857  0.7400     0.7674         2.0651   
3        290.8899        0.8123    0.9902  0.7396     0.8486         1.7146   
4        290.1207        0.8187    0.9850  0.6752     0.8338         1.7413   

   Compactness  
0       0.8207  
1       0.7487  
2       0.6929  
3       0.76

In [15]:
# Preview the first five labels of the target Series.
print(y.head())

0    Çerçevelik
1    Çerçevelik
2    Çerçevelik
3    Çerçevelik
4    Çerçevelik
Name: Class, dtype: object


In [20]:
# Sanity check: features and labels must have the same number of rows.
print(X.shape, y.shape, sep='\n')

(2500, 12)
(2500,)


### Train/Test split

In [21]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1002,
)

In [23]:
# Training split: feature-matrix shape, then label-vector shape.
print(X_train.shape, y_train.shape, sep='\n')

(2000, 12)
(2000,)


In [24]:
# Test split: feature-matrix shape, then label-vector shape.
print(X_test.shape, y_test.shape, sep='\n')

(500, 12)
(500,)


In [26]:
# Standardization: rescale each feature with statistics learned from the
# training split only, then apply the same fitted scaler to the test split so
# no test-set information leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# The lowercase name is kept as-is because the prediction cell reads it.
x_test_scaled = scaler.transform(X_test)

### Model selection

In [27]:
# Support vector classifier with an RBF kernel.
# NOTE(review): per the scikit-learn docs, SVC's random_state only affects
# probability estimates and is ignored while probability=False (the default
# used here) — confirm before relying on it for reproducibility.
model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    random_state=1002,
)

#### Training

In [28]:
# Fit the classifier on the scaled training features and their labels.
model.fit(X=X_train_scaled, y=y_train)

#### Prediction

In [29]:
# Predict a class label for every row of the scaled test split.
y_pred = model.predict(X=x_test_scaled)

### Evaluation

In [30]:
# Compare held-out labels with the predictions.
# confusion_mat rows are true classes and columns are predicted classes.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
# acc is the fraction of test rows classified correctly.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

In [33]:
# Show the accuracy on its own line beneath a heading.
print("accuracy: ", acc, sep="\n")

accuracy: 
0.892


In [36]:
# Show the confusion matrix beneath a heading.
print('confusion_matrix:', confusion_mat, sep='\n')

confusion_matrix:
[[242  18]
 [ 36 204]]


In [38]:
# Per-class precision, recall, F1 and support, plus the averaged rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

               precision    recall  f1-score   support

   Çerçevelik       0.87      0.93      0.90       260
Ürgüp Sivrisi       0.92      0.85      0.88       240

     accuracy                           0.89       500
    macro avg       0.89      0.89      0.89       500
 weighted avg       0.89      0.89      0.89       500

