### Checking and cleaning of the dataset

In [74]:
import pandas as pd
import numpy as np

In [75]:
df = pd.read_csv('Raisin_Dataset.csv')

In [76]:
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [77]:
df['Class'] = df['Class'].map({'Kecimen': 0, 'Besni': 1})

In [78]:
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [79]:
df.shape

(900, 8)

In [80]:
df.columns

Index(['Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity',
       'ConvexArea', 'Extent', 'Perimeter', 'Class'],
      dtype='object')

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity     900 non-null    float64
 4   ConvexArea       900 non-null    int64  
 5   Extent           900 non-null    float64
 6   Perimeter        900 non-null    float64
 7   Class            900 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 56.4 KB


In [82]:
df.isnull().sum()

Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64

### Separating feature and target variables and create new DataFrames each

In [83]:
x = df[['Area','MajorAxisLength','MinorAxisLength','Eccentricity','ConvexArea','Extent','Perimeter']]
y = df['Class']

In [127]:
clf_scores = []

### Self defined functions

In [84]:
def percentage(ave):
    per = ave * 100
    return round(per, 2)

In [85]:
from sklearn.model_selection import cross_val_score
def get_scores(classifier, scoring_method):
    ave = cross_val_score(classifier, x, y, cv=10, scoring=scoring_method).mean()
    return percentage(ave)

### Decision Tree

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import recall_score, make_scorer

In [115]:
dtree = DecisionTreeClassifier()

In [128]:
accuracy = get_scores(dtree, 'accuracy')
recall = get_scores(dtree, 'recall')
precision = get_scores(dtree, 'precision')
f1 = get_scores(dtree, 'f1')

specificity = make_scorer(recall_score, pos_label=0)
specificity_score = get_scores(dtree, specificity)

sensitivity = make_scorer(recall_score, pos_label=1)
sensitivity_score = get_scores(dtree, sensitivity)

clf_scores.append(['DT', accuracy, recall, precision, f1, specificity_score, sensitivity_score])
print(clf_scores)

[['DT', 81.33, 80.89, 80.86, 80.29, 82.0, 80.22]]


### Random Forest

In [129]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=1)

In [130]:
accuracy = get_scores(rf, 'accuracy')
recall = get_scores(rf, 'recall')
precision = get_scores(rf, 'precision')
f1 = get_scores(rf, 'f1')

specificity = make_scorer(recall_score, pos_label=0)
specificity_score = get_scores(rf, specificity)

sensitivity = make_scorer(recall_score, pos_label=1)
sensitivity_score = get_scores(rf, sensitivity)

clf_scores.append(['RF', accuracy, recall, precision, f1, specificity_score, sensitivity_score])
print(clf_scores)

[['DT', 81.33, 80.89, 80.86, 80.29, 82.0, 80.22], ['RF', 86.67, 84.44, 88.53, 86.25, 88.89, 84.44]]


### K-nearest Neighbor (kNN)

In [100]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)

In [101]:
ave = cross_val_score(knn, x, y, cv=10, scoring='accuracy').mean()
print(percentage(ave))

83.44


In [102]:
ave = cross_val_score(knn, x, y, cv=10, scoring='recall').mean()
print(percentage(ave))

76.0


In [103]:
ave = cross_val_score(knn, x, y, cv=10, scoring='precision').mean()
print(percentage(ave))

89.47


In [104]:
ave = cross_val_score(knn, x, y, cv=10, scoring='f1').mean()
print(percentage(ave))

82.07


In [105]:
specificity = make_scorer(recall_score, pos_label=0)
ave = cross_val_score(knn, x, y, cv=10, scoring=specificity).mean()
print(percentage(ave))

90.89


In [106]:
sensitivity = make_scorer(recall_score, pos_label=1)
ave = cross_val_score(knn, x, y, cv=10, scoring=sensitivity).mean()
print(percentage(ave))

76.0


### Regularized Logistic Regression

In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

lr2 = LogisticRegression(C=0.01)

In [108]:
scaler = StandardScaler()

pipeline = Pipeline([('transformer', scaler), ('estimator', lr2)])

accuracy = get_scores(pipeline, 'accuracy')
print(accuracy)

86.56


In [109]:
recall = get_scores(pipeline, 'recall')
print(recall)

83.33


In [110]:
precision = get_scores(pipeline, 'precision')
print(precision)

89.14


In [111]:
f1 = get_scores(pipeline, 'f1')
print(f1)

86.02


In [112]:
specificity = make_scorer(recall_score, pos_label=0)
specificity_score = get_scores(pipeline, specificity)
print(specificity_score)

89.78


In [113]:
sensitivity = make_scorer(recall_score, pos_label=1)
sensitivity_score = get_scores(pipeline, sensitivity)
print(sensitivity_score)

83.33


### Support Vector Machine

In [34]:
from sklearn import svm

sv_clf = svm.SVC(kernel='linear', C=1, random_state=1)

In [35]:
ave = cross_val_score(sv_clf, x, y, cv=10, scoring='accuracy').mean()
print(percentage(ave))

85.67


In [36]:
ave = cross_val_score(sv_clf, x, y, cv=10, scoring='recall').mean()
print(percentage(ave))

86.22


In [40]:
ave = cross_val_score(sv_clf, x, y, cv=10, scoring='f1').mean()
print(percentage(ave))

85.63


In [41]:
specificity = make_scorer(recall_score, pos_label=0)
ave = cross_val_score(sv_clf, x, y, cv=10, scoring=specificity).mean()
print(percentage(ave))

85.11


In [42]:
specificity = make_scorer(recall_score, pos_label=0)
ave = cross_val_score(sv_clf, x, y, cv=10, scoring=specificity).mean()
print(percentage(ave))

85.11
