In [22]:
# import required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression


# Import seeds dataset
df = pd.read_table('datasets/seeds_dataset.txt', sep="\t")

# set the column names
df.columns = ["area", "parimeter", "compactness", "length_of_kernal", "width_of_kernal", "asymmetry", 
                "length_of_kernel_groove", "type"]

# check the number of nan values in each column
df.isnull().sum(axis = 0)

# compactness contains one nan value
df['compactness'] = df['compactness'].fillna(0)

# set the input variables
x = df[['area', 'length_of_kernal','width_of_kernal', 'asymmetry', 'length_of_kernel_groove']]

# target value
y = df['type']

# create train, test and split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=.5, random_state=5)

In [23]:
print(x.shape)

(209, 5)


## Logistic regression

In [24]:
# build corralation matrix to check the corralation of each of the inputs
corr_matrix = df.corr()

#print our correlation matrix
print(corr_matrix["type"].sort_values(ascending = False))


# Create logistic regression model and fit X_train and Y_train
model = LogisticRegression(max_iter=100000)
model.fit(X_train, Y_train)

# predict target value
Y_Pred = model.predict(X_test)

# Root Mean Squared Error and r2 score for the model
rmse = np.sqrt(mean_squared_error(Y_test,Y_Pred))
r2 = r2_score(Y_test,Y_Pred)
print()
print ('rmse = ', rmse)
print ('r2 ', r2)

# print the accuracy of our model
score = model.score(X_test, Y_test)
print("accuracy = ", score)

# predict the type with the values for area, length_of_kernal, width_of_kernal, asymmetry, length_of_kernel_groove
predictedType = model.predict([[19.3, 6.3, 3.8 , 3.4, 6.2]])
predictedType2 = model.predict([[11.4, 5.1, 2.7 , 2.2, 5.2]])


# type of seeds 1 = Kama, 2 = Rosa, 3 = Canadian
print('The predicted type is ', predictedType)
print('The predicted type is ', predictedType2)



type                       1.000000
asymmetry                  0.574895
length_of_kernel_groove    0.022141
compactness               -0.112903
parimeter                 -0.129415
length_of_kernal          -0.256465
area                      -0.346487
width_of_kernal           -0.424178
Name: type, dtype: float64

rmse =  0.4879500364742666
r2  0.674962852897474
accuracy =  0.9047619047619048
The predicted type is  [2]
The predicted type is  [3]


## SVM

In [25]:
# import required libraries 
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

# Scale and transform our features between -1 & 1
scaling = MinMaxScaler(feature_range=(-1,1)).fit(x_train)

x_train = scaling.transform(x_train)
x_test = scaling.transform(x_test)

# create our svm model with Regularization, maximum-margin & maximun iteration
svm = LinearSVC(C=100, loss='hinge', max_iter=100000)

# fit the training data to the model
svm.fit(x_train, y_train)

# print the score of our model
print(svm.score(x_test,y_test))

#set the y_pred to our model.predict 
y_pred = svm.predict(x_test)

# create a confusion matrix and pass i y_test and y_pred
cm = pd.DataFrame(confusion_matrix(y_test, y_pred))

# print the confusion matrix
cm


0.9761904761904762


Unnamed: 0,0,1,2
0,14,0,0
1,0,9,0
2,1,0,18


## KNN


In [29]:
# import required libraries
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

# set the target type
y_df = pd.DataFrame(y, columns={"type"})

# set the target values types
y_df.loc[(y_df["type"] == 1), "seed"] = "Kama"
y_df.loc[(y_df["type"] == 2), "seed"] = "Rosa"
y_df.loc[(y_df["type"] == 3), "seed"] = "Canadian"

print(y_df)

# create our KNN model with 7 Neighbors
knn = KNeighborsRegressor(n_neighbors=7)

# fit the training and test data to the model
knn.fit(X_train, Y_train)

# print the score of the KNN model
score = knn.score(X_test, Y_test)
print ("Score", score)



     type      seed
0       1      Kama
1       1      Kama
2       1      Kama
3       1      Kama
4       1      Kama
..    ...       ...
204     3  Canadian
205     3  Canadian
206     3  Canadian
207     3  Canadian
208     3  Canadian

[209 rows x 2 columns]
Score 0.788261515601783


## ensemble

In [27]:
# import required libraries
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor

# create linear, KNN and SVR models
lin_reg = LinearRegression()
knn_reg = KNeighborsRegressor(n_neighbors=5)
svr_reg = SVR(C=10000)

# create our voting regressor
voting_reg = VotingRegressor(estimators=[('lr', lin_reg),('knn', knn_reg), ('svr', svr_reg)])

# for each model fit the training data & compare the results
for reg in (lin_reg, knn_reg, svr_reg, voting_reg):
    reg.fit(X_train, Y_train)
    Y_pred = reg.predict(X_test)
    print(reg.__class__.__name__, r2_score(Y_test, Y_pred))

LinearRegression 0.6794590843495407
KNeighborsRegressor 0.7675334323922733
SVR 0.7944264457279198
VotingRegressor 0.8064516023428128


## Neural Network

In [3]:
# import required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report


# Import seeds dataset
df = pd.read_table('datasets/seeds_dataset.txt', sep="\t")

# set the column names
df.columns = ["area", "parimeter", "compactness", "length_of_kernal", "width_of_kernal", "asymmetry", 
                "length_of_kernel_groove", "type"]

# check the number of nan values in each column
df.isnull().sum(axis = 0)

# compactness contains one nan value
df['compactness'] = df['compactness'].fillna(0)

# set the input variables
x = df[['area', 'length_of_kernal','width_of_kernal', 'asymmetry', 'length_of_kernel_groove']]

# target value
y = df['type']

# create train, test and split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2, random_state=32)

# Scale and transorm x train and test data
scaling = StandardScaler().fit(x_train)

x_train = scaling.transform(x_train)
x_test = scaling.transform(x_test)

# create neural network model
nn_class = MLPClassifier(max_iter=1000)

# fit training data to the model
nn_class.fit(x_train, y_train)

# set y_pred to model.predict
y_pred = nn_class.predict(x_test)

# print classification report
print(classification_report(y_test, y_pred))


predicted = nn_class.predict([[11.4, 5.1, 2.7 , 2.2, 5.2]])
# type of seeds 1 = Kama, 2 = Rosa, 3 = Canadian
print('The predicted type is ', predicted)

              precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       1.00      1.00      1.00         9
           3       0.95      1.00      0.97        19

    accuracy                           0.98        42
   macro avg       0.98      0.98      0.98        42
weighted avg       0.98      0.98      0.98        42

The predicted type is  [2]
