In [34]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [35]:
df = pd.read_csv('./datasets/breeds_info_clean.csv', quotechar = '"', sep = ",")
df = df.drop(columns = ['breed'])

In [36]:
X = df[df['classification'].isna() == False]
X = X[X['height_low_inches'].isna() == False]
y = X['classification']
#X = X.drop(columns = ['classification'])
X = X.drop(columns = ['classification', 'obey', 'reps_lower', 'reps_upper'])

In [37]:
ss = StandardScaler()

In [38]:
#the subset, by which we should predict classification. besides classification, also no data for the columns 'obey', 'reps_lower', 'reps_upper'
K = df[df['classification'].isna() == True]
K = K.drop(columns = ['classification', 'obey', 'reps_lower', 'reps_upper'])
K = ss.fit_transform(K)

In [39]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [40]:
X_train = ss.fit_transform(X_train)
X_val = ss.fit_transform(X_val)
X_test = ss.fit_transform(X_test)

In [41]:
grid_params = {
    'n_neighbors': [5,10,15],
    'weights': ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan']
}

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose = 1,
    cv = 3,
    n_jobs = -1
    )

gs.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [5, 10, 15],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [42]:
gs.score(X_train, y_train)

1.0

In [43]:
gs.score(X_val, y_val)

0.28

In [44]:
gs.score(X_test, y_test)

0.23076923076923078

In [45]:
gs.best_params_

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}

In [46]:
vc2 = VotingClassifier( 
    [
        ('knn', gs),
        ('lr', LogisticRegression()),
        
    ],
    n_jobs=-1,   
    voting='soft'
)

vc2.fit(X_train, y_train)
vc2.predict(K)

array(['average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'fair working/obedience intelligence',
       'above average working dogs', 'above average working dogs',
       'above average working dogs', 'above average working dogs',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'above average working dogs',
       'average working/obedience intelligence',
       'fair working/obedience intelligence',
       'fair working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
       'average working/obedience intelligence',
     

In [47]:
K_1 = df[df['classification'].isna() == True]
K_1['classification'] = vc2.predict(K)
X_1 = df[df['classification'].isna() == False]
df_1 = pd.concat([K_1,X_1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  K_1['classification'] = vc2.predict(K)


In [48]:
df_1

Unnamed: 0,classification,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs
141,average working/obedience intelligence,,,,27.0,29.0,100.0,150.0
142,average working/obedience intelligence,,,,23.0,28.0,130.0,150.0
143,average working/obedience intelligence,,,,24.0,30.0,100.0,150.0
144,average working/obedience intelligence,,,,24.0,27.0,100.0,120.0
145,average working/obedience intelligence,,,,25.0,29.0,80.0,140.0
...,...,...,...,...,...,...,...,...
136,lowest degree of working/obedience intelligence,0.1,81.0,100.0,26.0,28.0,70.0,100.0
137,lowest degree of working/obedience intelligence,0.1,81.0,100.0,19.0,22.0,45.0,55.0
138,lowest degree of working/obedience intelligence,0.1,81.0,100.0,12.0,16.0,50.0,60.0
139,lowest degree of working/obedience intelligence,0.1,81.0,100.0,17.0,17.0,20.0,22.0


In [49]:
df_1.to_csv('./datasets/breeds_info_clean_classifiers_data.csv', index=False)