In [1]:
####Can we use KNN to predict whether we should play or not?

#Let's import needed libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
from sklearn import preprocessing
import itertools
%matplotlib inline

In [2]:
df = pd.read_csv("C:\\Users\\fabri\\OneDrive\\Documents\\DasText\\csvFiles\\weatherData.csv")
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play?
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,78,False,yes
3,rain,70,96,False,yes
4,rain,68,80,False,yes
5,rain,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rain,75,80,False,yes


In [3]:
df.dtypes

Outlook        object
Temperature     int64
Humidity        int64
Windy            bool
Play?          object
dtype: object

In [4]:
####HOW TO MAP VALUES TO CATEGORICAL VALUES

#Start by making df of just non-numericals you want to map to numerical values
obj_df = df.select_dtypes(include=['object','bool']).copy()
obj_df.head()

Unnamed: 0,Outlook,Windy,Play?
0,sunny,False,no
1,sunny,True,no
2,overcast,False,yes
3,rain,False,yes
4,rain,False,yes


In [5]:
#let's look at the unique values that we are going to map
obj_df['Outlook'].value_counts()

rain        5
sunny       5
overcast    4
Name: Outlook, dtype: int64

In [6]:
#now we map them
cleanup_nums = {"Outlook":     {"sunny": 2, "overcast": 1, "rain": 0},
                "Windy": {True: 1, False: 0},
                "Play?": {"no": 0, "yes": 1}}

In [7]:
#use replace method!
df.replace(cleanup_nums, inplace=True)
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play?
0,2,85,85,0,0
1,2,80,90,1,0
2,1,83,78,0,1
3,0,70,96,0,1
4,0,68,80,0,1


In [8]:
#nice! Let's check the value types again now
df.dtypes

Outlook        int64
Temperature    int64
Humidity       int64
Windy          int64
Play?          int64
dtype: object

In [9]:
#***To use scikit-learn library, we have to convert the Pandas data frame to a Numpy array
X = df[['Outlook','Temperature','Humidity','Windy']].values
print("X data: \n" + str(X[0:5]))
y = df['Play?'].values
print("\nY data: \n"+str(y[0:5]))

X data: 
[[ 2 85 85  0]
 [ 2 80 90  1]
 [ 1 83 78  0]
 [ 0 70 96  0]
 [ 0 68 80  0]]

Y data: 
[0 0 1 1 1]


In [10]:
####Data Standardization give data zero mean and unit variance,
# it is good practice, especially for algorithms such as KNN which is based on distance of cases:
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))
X[0:5]

array([[ 1.18321596,  1.80471534,  0.49715486, -0.8660254 ],
       [ 1.18321596,  1.01515238,  1.02444033,  1.15470054],
       [ 0.        ,  1.48889015, -0.24104478, -0.8660254 ],
       [-1.18321596, -0.56397354,  1.65718288, -0.8660254 ],
       [-1.18321596, -0.87979873, -0.0301306 , -0.8660254 ]])

In [11]:
##train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (11, 4) (11,)
Test set: (3, 4) (3,)


In [12]:
####NOW WE START THE CLASSIIFICATION
from sklearn.neighbors import KNeighborsClassifier #import this library

####Training
#let k = 4
k = 4
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [13]:
####Predicting
yhat = neigh.predict(X_test) #predicted y values
print("Predicted values: " + str(yhat[0:5]))
print("Actual Values: "+str(y_test[0:5]))

Predicted values: [0 0 0]
Actual Values: [1 1 1]


In [14]:
####Accuracy Evaluation - (uses jaccard)

from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.9090909090909091
Test set Accuracy:  0.0


In [15]:
#what does this mean? We are good at predicting data within, but not outside data

In [16]:
#let's test a couple of k values and see if get any better results :/
for k in range(1,11):
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
    yhat = neigh.predict(X_test) #predicted y values
    print("Test set Accuracy for k = "+str(k)+": ", metrics.accuracy_score(y_test, yhat))
    print("\n")

Test set Accuracy for k = 1:  0.6666666666666666


Test set Accuracy for k = 2:  0.0


Test set Accuracy for k = 3:  0.0


Test set Accuracy for k = 4:  0.0


Test set Accuracy for k = 5:  0.6666666666666666


Test set Accuracy for k = 6:  0.3333333333333333


Test set Accuracy for k = 7:  0.3333333333333333


Test set Accuracy for k = 8:  0.3333333333333333


Test set Accuracy for k = 9:  0.6666666666666666


Test set Accuracy for k = 10:  0.3333333333333333




In [17]:
#so for k = 1, 3, and 9, we can predict 2/3 of the time? okay not bad. 
    #I'll just go with k = 5 as being the best one
k = 5
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
yhat = neigh.predict(X_test) #predicted y values

print("y actual: \n" + str(y_test[0:10]))
print("\nY predicted: \n"+ str(yhat[0:10]))

X_test

y actual: 
[1 1 1]

Y predicted: 
[1 1 0]


array([[-1.18321596, -0.87979873, -0.0301306 , -0.8660254 ],
       [-1.18321596, -0.56397354,  1.65718288, -0.8660254 ],
       [ 0.        , -0.24814836,  1.02444033,  1.15470054]])

In [18]:
#Accuracy score for k = 5:
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.8181818181818182
Test set Accuracy:  0.6666666666666666


In [19]:
#Now I'm just having fun to see it tells me with my own made up result:
inputList1 = [0,75,80,0]
inputList2 = [1,81,75,0]
#YOU NEED AT LEAST 2 UNIQUE SETS OF DATA YOU WANT TO PREDICT! 
myInput = np.array([inputList1,inputList2]) 
#standardize if before predicting!
myInput = preprocessing.StandardScaler().fit(myInput).transform(myInput.astype(float))
myInput[0:5]
myPrediction = neigh.predict(np.array(myInput))
myPrediction

array([0, 1], dtype=int64)

In [20]:
#so under the first condition, we should NOT play
#and under the second condition, we SHOULD play!