In [24]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scikeras.wrappers import KerasClassifier
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

In [9]:
# read data
df = pd.read_csv('/content/riceClassification.csv')
# check for missing values - train
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               18185 non-null  int64  
 1   Area             18185 non-null  int64  
 2   MajorAxisLength  18185 non-null  float64
 3   MinorAxisLength  18185 non-null  float64
 4   Eccentricity     18185 non-null  float64
 5   ConvexArea       18185 non-null  int64  
 6   EquivDiameter    18185 non-null  float64
 7   Extent           18185 non-null  float64
 8   Perimeter        18185 non-null  float64
 9   Roundness        18185 non-null  float64
 10  AspectRation     18185 non-null  float64
 11  Class            18185 non-null  int64  
dtypes: float64(8), int64(4)
memory usage: 1.7 MB


In [10]:
# first, delete the id column
df.drop(['id'], axis=1, inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             18185 non-null  int64  
 1   MajorAxisLength  18185 non-null  float64
 2   MinorAxisLength  18185 non-null  float64
 3   Eccentricity     18185 non-null  float64
 4   ConvexArea       18185 non-null  int64  
 5   EquivDiameter    18185 non-null  float64
 6   Extent           18185 non-null  float64
 7   Perimeter        18185 non-null  float64
 8   Roundness        18185 non-null  float64
 9   AspectRation     18185 non-null  float64
 10  Class            18185 non-null  int64  
dtypes: float64(8), int64(3)
memory usage: 1.5 MB


In [12]:
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.76451,1.440796,1
1,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,1
2,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.46595,1
3,3073,77.033628,51.928487,0.738639,3157,62.5513,0.783529,210.657,0.870203,1.483456,1
4,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.51,1


In [13]:
# drop unwanted columns
df.drop(['Area', 'ConvexArea'], axis=1, inplace=True)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MajorAxisLength  18185 non-null  float64
 1   MinorAxisLength  18185 non-null  float64
 2   Eccentricity     18185 non-null  float64
 3   EquivDiameter    18185 non-null  float64
 4   Extent           18185 non-null  float64
 5   Perimeter        18185 non-null  float64
 6   Roundness        18185 non-null  float64
 7   AspectRation     18185 non-null  float64
 8   Class            18185 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [16]:

# split into X and Y
Y = df['Class']
X = df.drop(['Class'], axis=1)

print(X.shape)
print(Y.shape)

# convert to numpy arrays
X = np.array(X)

(18185, 8)
(18185,)


In [17]:
# build the model
model = Sequential()
model.add(Dense(16, input_shape=(X.shape[1],), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# compile the model
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# early stopping callback
# This callback will stop the training when there is no improvement in
# the validation loss for 10 consecutive epochs.
es = EarlyStopping(monitor='val_accuracy',
                                   mode='max',
                                   patience=10,
                                   restore_best_weights=True)

# now we just update our model fit call
history = model.fit(X,
                    Y,
                    callbacks=[es],
                    epochs=20,
                    batch_size=8,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8938 - loss: 0.3237 - val_accuracy: 0.9203 - val_loss: 0.2524
Epoch 2/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9725 - loss: 0.1071 - val_accuracy: 0.9057 - val_loss: 0.2867
Epoch 3/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9759 - loss: 0.0792 - val_accuracy: 0.4069 - val_loss: 1.6552
Epoch 4/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9780 - loss: 0.0636 - val_accuracy: 0.3492 - val_loss: 2.2171
Epoch 5/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9825 - loss: 0.0508 - val_accuracy: 0.7814 - val_loss: 0.8862
Epoch 6/20
[1m1819/1819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9814 - loss: 0.0487 - val_accuracy: 0.5724 - val_loss: 1.8513
Epoch 7/20
[1m1

In [23]:

history_dict = history.history
# Learning curve/Loss)
# the training and validation loss by epoch

# loss
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

epochs = range(1, len(loss_values) + 1)


In [22]:

# Learning curve/accuracy
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

epochs = range(1, len(acc) + 1)

# this is the max value should correspond to the highest train accuracy
# the HIGHEST train accuracy
np.max(val_acc)

np.float64(0.920263946056366)

In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


model.predict(X) #type (Jasmine or Gonen)
np.round(model.predict(X),0)
Y

preds = np.round(model.predict(X),0)

# confusion matrix
print(confusion_matrix(Y, preds)) (actual, predicted)


print(classification_report(Y, preds))

[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m569/569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[[7910  290]
 [ 481 9504]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      8200
           1       0.97      0.95      0.96      9985

    accuracy                           0.96     18185
   macro avg       0.96      0.96      0.96     18185
weighted avg       0.96      0.96      0.96     18185

