In [1]:
import pandas as pd      
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from scipy.stats import skew

from sklearn.model_selection import cross_validate, cross_val_score
import warnings
warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('display.max_columns', 500) 
pd.set_option('display.max_rows', 500)

In [2]:
df = pd.read_excel("Raisin_Dataset.xlsx")

In [3]:
df.groupby("Class").describe().T

Unnamed: 0,Class,Besni,Kecimen
Area,count,450.0,450.0
Area,mean,112194.788889,63413.466667
Area,std,39229.897872,17727.768795
Area,min,40702.0,25387.0
Area,25%,83598.5,50466.25
Area,50%,104426.5,61420.0
Area,75%,135683.25,75153.0
Area,max,235047.0,180898.0
MajorAxisLength,count,450.0,450.0
MajorAxisLength,mean,509.000652,352.859249


In [4]:
df["class_bin"] = np.where(df.Class == "Kecimen", 0, 1)
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class,class_bin
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen,0


In [5]:
df = df.drop("Class", axis = 1); df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,class_bin
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,0
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,0
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,0
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,0
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,0


In [6]:
X= df.drop(columns="class_bin")
y= df.class_bin
print(X,y)

      Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0    87524       442.246011       253.291155      0.819738       90546   
1    75166       406.690687       243.032436      0.801805       78789   
2    90856       442.267048       266.328318      0.798354       93717   
3    45928       286.540559       208.760042      0.684989       47336   
4    79408       352.190770       290.827533      0.564011       81463   
..     ...              ...              ...           ...         ...   
895  83248       430.077308       247.838695      0.817263       85839   
896  87350       440.735698       259.293149      0.808629       90899   
897  99657       431.706981       298.837323      0.721684      106264   
898  93523       476.344094       254.176054      0.845739       97653   
899  85609       512.081774       215.271976      0.907345       89197   

       Extent  Perimeter  
0    0.758651   1184.040  
1    0.684130   1121.786  
2    0.637613   1208.575  
3  

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [8]:
scaler = StandardScaler()

In [9]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
log_model = LogisticRegression()

In [12]:
log_model.fit(X_train_scaled, y_train)

In [13]:
y_pred = log_model.predict(X_test_scaled)

In [14]:
y_pred_proba = log_model.predict_proba(X_test_scaled)

In [15]:
test_data = pd.concat([X_test, y_test], axis=1)
test_data["pred"] = y_pred
test_data["pred_proba"] = y_pred_proba[:,1]#1 olma olasılığı
test_data.sample(10)

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,class_bin,pred,pred_proba
227,65469,374.466231,227.538997,0.794216,68018,0.714438,1020.029,0,0,0.228485
753,79057,436.390047,236.895393,0.839828,82642,0.654597,1148.146,1,1,0.627172
454,61409,403.701295,209.365889,0.855007,67286,0.597393,1083.477,1,0,0.485899
629,83059,440.559971,243.636512,0.833171,88806,0.685809,1238.163,1,1,0.780822
333,60313,339.603456,227.725924,0.741851,62130,0.72668,946.487,0,0,0.094078
89,49059,301.800398,210.467215,0.716709,50835,0.698607,867.582,0,0,0.040899
319,57303,330.024964,221.919786,0.740158,58973,0.716762,919.044,0,0,0.074011
620,78984,427.494276,236.882575,0.832438,82328,0.698417,1169.476,1,1,0.640883
871,105091,507.685116,268.087343,0.849208,108296,0.700271,1295.591,1,1,0.900593
299,36113,257.641853,181.157079,0.711056,37442,0.694481,737.637,0,0,0.012288


In [16]:
from sklearn.metrics import confusion_matrix, classification_report

In [17]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print("Test_Set")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print()
    print("Train_Set")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

In [18]:
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

Test_Set
[[127  13]
 [ 24 106]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       140
           1       0.89      0.82      0.85       130

    accuracy                           0.86       270
   macro avg       0.87      0.86      0.86       270
weighted avg       0.86      0.86      0.86       270


Train_Set
[[275  35]
 [ 47 273]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       310
           1       0.89      0.85      0.87       320

    accuracy                           0.87       630
   macro avg       0.87      0.87      0.87       630
weighted avg       0.87      0.87      0.87       630



In [19]:
from sklearn.model_selection import cross_validate

In [20]:
model = LogisticRegression()

scores = cross_validate(model, X_train_scaled, y_train, scoring = ['precision','recall','f1','accuracy'], cv = 5)
df_scores = pd.DataFrame(scores, index = range(1, 6))
df_scores

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_accuracy
1,0.007978,0.003997,0.870968,0.84375,0.857143,0.857143
2,0.003998,0.003998,0.918033,0.875,0.896,0.896825
3,0.00599,0.002991,0.866667,0.8125,0.83871,0.84127
4,0.004997,0.004001,0.885246,0.84375,0.864,0.865079
5,0.017004,0.007995,0.887097,0.859375,0.873016,0.873016


In [21]:
df_scores.mean()[2:]

test_precision    0.885602
test_recall       0.846875
test_f1           0.865774
test_accuracy     0.866667
dtype: float64

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
#GridSearchCV ile recall iyileştirmek için parametreleri ve değerleri ayarlayıp  1 skorunun değerini iyileştirip hastalaı yakalama oranını artırmak. 

model = LogisticRegression()

penalty = ["l1", "l2", "elasticnet"]
C = np.logspace(-3,3,7) # katsayı için logspace alınarak değerler verilmesi öneriliyor


param_grid = {"penalty" : penalty,
              "C" : C
             }


grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=10,
                          scoring = "recall", # 1 class ait en iyi skoru döndürecek, tek score verebiliriz
                          n_jobs = -1)

In [24]:
grid_model.fit(X_train_scaled, y_train)

In [25]:
grid_model.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [26]:
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

Test_Set
[[127  13]
 [ 24 106]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       140
           1       0.89      0.82      0.85       130

    accuracy                           0.86       270
   macro avg       0.87      0.86      0.86       270
weighted avg       0.86      0.86      0.86       270


Train_Set
[[275  35]
 [ 47 273]]
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       310
           1       0.89      0.85      0.87       320

    accuracy                           0.87       630
   macro avg       0.87      0.87      0.87       630
weighted avg       0.87      0.87      0.87       630

