In [224]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [225]:
data = pd.read_csv('fruit_types.csv')
data.head()

Unnamed: 0,fruit_name,fruit_subtype,mass,width,height,color_score
0,apple,granny_smith,192,8.4,7.3,0.55
1,apple,granny_smith,180,8.0,6.8,0.59
2,apple,granny_smith,176,7.4,7.2,0.6
3,mandarin,mandarin,86,6.2,4.7,0.8
4,mandarin,mandarin,84,6.0,4.6,0.79


In [226]:
encoder = LabelEncoder()

fruit_name_encoded = encoder.fit_transform(data.iloc[:,0])
fruit_subtype_encoded = encoder.fit_transform(data.iloc[:,1])

# we are using it on number values because we want a balance in overall dataset
mass_encoded = encoder.fit_transform(data.iloc[:,2])
width_encoded = encoder.fit_transform(data.iloc[:,3])
height_encoded = encoder.fit_transform(data.iloc[:,4])
color_score_encoded = encoder.fit_transform(data.iloc[:,5])

In [227]:
fruit_name_encoded

array([0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [228]:
data = list(zip(fruit_name_encoded, fruit_subtype_encoded, mass_encoded, width_encoded, height_encoded, color_score_encoded))
data

[(0, 3, 30, 20, 8, 0),
 (0, 3, 27, 19, 4, 1),
 (0, 3, 25, 14, 7, 2),
 (2, 4, 3, 4, 3, 15),
 (2, 4, 2, 2, 2, 14),
 (2, 4, 1, 0, 1, 12),
 (2, 4, 1, 1, 1, 16),
 (2, 4, 0, 0, 0, 16),
 (0, 0, 26, 11, 13, 25),
 (0, 0, 23, 14, 5, 24),
 (0, 0, 20, 9, 8, 26),
 (0, 0, 23, 11, 11, 25),
 (0, 0, 14, 10, 6, 23),
 (0, 2, 19, 13, 12, 5),
 (0, 2, 13, 16, 8, 4),
 (0, 2, 15, 17, 6, 4),
 (0, 2, 15, 16, 10, 3),
 (0, 2, 21, 15, 11, 8),
 (0, 1, 18, 15, 6, 18),
 (0, 1, 18, 14, 7, 20),
 (0, 1, 17, 15, 10, 21),
 (0, 1, 15, 14, 9, 19),
 (0, 1, 9, 13, 6, 22),
 (0, 1, 22, 16, 14, 23),
 (3, 7, 37, 21, 22, 10),
 (3, 7, 38, 22, 21, 10),
 (3, 7, 39, 23, 21, 9),
 (3, 5, 34, 15, 21, 12),
 (3, 5, 9, 7, 6, 7),
 (3, 5, 17, 10, 9, 16),
 (3, 5, 16, 11, 10, 14),
 (3, 5, 35, 18, 15, 17),
 (3, 5, 19, 12, 5, 15),
 (3, 8, 29, 15, 16, 9),
 (3, 8, 10, 16, 13, 10),
 (3, 8, 12, 11, 14, 10),
 (3, 8, 17, 11, 11, 11),
 (3, 8, 14, 13, 8, 14),
 (3, 8, 16, 12, 13, 12),
 (3, 8, 11, 8, 9, 10),
 (3, 8, 14, 11, 10, 13),
 (3, 8, 27, 16, 17, 14)

In [229]:
data = pd.DataFrame(data, columns=["fruit_name",	"fruit_subtype",	"mass",	"width",	"height",	"color_score"])

In [230]:
data

Unnamed: 0,fruit_name,fruit_subtype,mass,width,height,color_score
0,0,3,30,20,8,0
1,0,3,27,19,4,1
2,0,3,25,14,7,2
3,2,4,3,4,3,15
4,2,4,2,2,2,14
5,2,4,1,0,1,12
6,2,4,1,1,1,16
7,2,4,0,0,0,16
8,0,0,26,11,13,25
9,0,0,23,14,5,24


In [231]:
data_to_use = data.iloc[:, 1:6]
data_to_target = data.iloc[:, 0]

In [232]:
data_to_target

0     0
1     0
2     0
3     2
4     2
5     2
6     2
7     2
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    3
25    3
26    3
27    3
28    3
29    3
30    3
31    3
32    3
33    3
34    3
35    3
36    3
37    3
38    3
39    3
40    3
41    3
42    3
43    1
44    1
45    1
46    1
47    1
48    1
49    1
50    1
51    1
52    1
53    1
54    1
55    1
56    1
57    1
58    1
Name: fruit_name, dtype: int64

In [233]:
X_train, X_test, y_train, y_test = train_test_split(data_to_use, data_to_target, test_size=0.30, random_state=32)

In [234]:
X_test

Unnamed: 0,fruit_subtype,mass,width,height,color_score
44,6,33,13,27,7
12,0,14,10,6,23
58,9,5,3,16,5
51,9,4,2,10,7
21,1,15,14,9,19
52,9,5,1,15,7
53,9,6,2,18,9
48,6,24,13,24,7
28,5,9,7,6,7
22,1,9,13,6,22


In [235]:
knn_model = KNeighborsClassifier(n_neighbors=5)
#at this point it just saves the data and do no training
knn_model.fit(X_train, y_train)

In [236]:
y_test

44    1
12    0
58    1
51    1
21    0
52    1
53    1
48    1
28    3
22    0
27    3
31    3
8     0
2     0
41    3
57    1
49    1
40    3
Name: fruit_name, dtype: int64

In [237]:
y_pred_knn = knn_model.predict(X_test)
y_pred_knn

array([1, 0, 1, 1, 0, 1, 1, 1, 3, 0, 1, 3, 0, 0, 1, 1, 1, 3])

In [238]:
print('Accuracy', metrics.accuracy_score(y_test, y_pred_knn))

Accuracy 0.8888888888888888


In [239]:
from math import sqrt

In [240]:
print("RMSE", sqrt(metrics.mean_squared_error(y_test, y_pred_knn)))

RMSE 0.6666666666666666


In [None]:
## gini