In [None]:
import csv
import numpy as np
import math

# to display the float numbers with 2 decimal points and supress the use of
# scientific notations for small numbers
np.set_printoptions(precision=2, suppress=True)

# You can use X_COLUMN_NAMES and Y_COLUMN_NAME to extract the relevant information from the CSV files
X_COLUMN_NAMES = [
    "age",
    "Medu",
    "Fedu",
    "traveltime",
    "studytime",
    "failures",
    "famrel",
    "freetime",
    "goout",
    "Dalc",
    "Walc",
    "health",
    "absences",
]
Y_COLUMN_NAME = "G3"

# function to load the student dataset into X and y numpy arrays
def load_student_data(filename):
    """
  filename: string, the path of the student-mat.csv dataset
  RETURN
    X: numpy array: shape = [N, D]
    y: numpy array: shape = [N]
  """
    X, y = [], []
    x_index, y_index = [], [] 
     
    ## start your code here
    with open(filename, newline='') as csv_file:
      csv_reader = csv.reader(csv_file, delimiter = ";")
      header = next(csv_reader)
       
      for idx, col in enumerate(header):
        if col in X_COLUMN_NAMES:
          x_index += [idx,]
        if col in Y_COLUMN_NAME:
          y_index += [idx,]
      
      for row in csv_reader:
        x=[]
        for i, j in enumerate(row):
          if i in x_index:
            x += [float(j),]
          if i in y_index:
            y += [float(j),]
        X.append(x)
    X = np.asarray(X)
    y = np.asarray(y)

    return X,y

In [None]:
# driver program to test the load_student_data() function

filename = "/content/student-mat.csv"

X, y = load_student_data(filename)

print(X[1][10], X[100, 12], X[177][12])
print(y[1], y[100], y[177])
print(X.shape, y.shape)

1.0 14.0 4.0
6.0 5.0 6.0
(395, 13) (395,)


In [None]:
# function to standardize the dataset.
 

def standardizeDataset(X):
    """
  X: numpy array, shape = [N,D]
  RETURN
    Xstd: numpy array, shape = [N,D]
  """
    Xstd = np.zeros_like(X)
    ## start your code here
    for i in range(X.shape[1]):
      arr = X[:,i]
      mean = arr.mean()
      std = arr.std()
      standardX = (arr-mean)/ std
      Xstd[:,i] = standardX
   
    ## end
    return Xstd

In [None]:
# driver code to test the standardizeData() function
Xstd = standardizeDataset(X)
print(Xstd.shape)
print(Xstd[10, 1], Xstd[1, 12], Xstd[177, 12])

(395, 13)
1.1438556741642336 -0.21379576818001317 -0.21379576818001317


In [None]:
# function to compute the Euclidean Distance between two samples in the dataset
def euclideanDist(x1, x2):
    """
  x1: numpy array, shape = [D]
  x2: numpy array, shape = [D]
  RETURN
    dist: float value
  """
    dist = 0
    ## start your code here
    dist = np.sqrt(np.sum(np.square(x1-x2)))
        
    ## end
    return dist

In [None]:
# driver code to test the euclideanDist() function
indx = [1, 10, 20, 60, 80, 90, 110, 140, 160, 169]
for i in indx:
    print(euclideanDist(Xstd[1, :], Xstd[i, :]))

0.0
4.896159058868691
4.6033105421051435
5.750017331257644
4.170247533567128
3.633922810555552
4.861589168992987
5.866767578041756
4.6434620083748746
4.367502847616258


In [None]:
# function to get the most similar K neighbors and its classes
def kNearestNeighbors(X, y, Xtest, K):
    """
  X: numpy array, shape = [N, D]
  y: numpy array, shape = [N]
  Xtest: numpy array, shape = [D]
  K: float value
  RETURN
    Xng: numpy array, shape = [K, D]
    yng: numpy array, shape = [K]
  """
    Xng, yng = None, None
    ## start your code here
      ### calculate the distance between Xtest and every sample in X
    Xng, yng = [], []
    dlist = []
    for i in X:
      dist = euclideanDist(Xtest,i)
      dlist += [dist,]
    
      ### get the first K similar X data and the corresponding class value y
    a = np.argsort(dlist)[:K]
    for i in a: 
      Xng += [X[i],]
      yng += [y[i],]
    Xng = np.array(Xng)
    yng = np.array(yng)
    
    ## end
    return Xng, yng

In [None]:
# driver code to test the getNeighbors() function
K = 5
test = 100
Xtest = Xstd[test]
ytest = y[test]

Xng, yng = kNearestNeighbors(Xstd, y, Xtest, K)

# print the K neighbors X and y values
print(Xng)
print(yng)

[[-0.55  1.14  1.36 -0.64 -1.24 -0.45  0.06  1.77  1.7   3.96  2.11  0.32
   1.04]
 [-0.55  1.14  1.36 -0.64 -0.04 -0.45  0.06  0.77  1.7   3.96  2.11  1.04
   1.29]
 [-0.55  0.23  0.44 -0.64 -0.04  0.9   0.06  1.77  1.7   2.83  1.33  1.04
  -0.21]
 [ 0.24  1.14  1.36 -0.64 -0.04 -0.45  1.18 -0.24  1.7   2.83  2.11 -0.4
   0.91]
 [ 0.24  1.14  0.44  0.79 -0.04 -0.45  0.06  0.77  0.8   2.83  1.33  0.32
  -0.21]]
[ 5. 11. 12. 13.  9.]


In [None]:
# function to implement KNN classifier a given test case, i.e., you will predict
# the grade of a student, given its 13 attributes.


def KNNClassifier(X, y, Xtest, K):
    """
  X: shape = [N, D]
  y: shape = [N]
  Xtest: shape
  K: float value
  RETURN
    output_class: float value from {1, 2, 3}
  """
    output_class = None
    ## start your code here
    Xng,yng = kNearestNeighbors(Xstd,y,Xtest, K)
    output_class = yng[0]
    
    ## end
    return output_class

In [None]:
# load the original training data

X, y = load_student_data(filename)

# standardize the data
Xstd = standardizeDataset(X)

# We shall consider the last 10 data points from the dataset as our test data

# split the X and y from the test data
Xtest = Xstd[-10:, :]
ytest = y[-10:]

# compute final grade for the students in the test data using KNN
K = 3
predictions = np.empty(len(ytest))

for i in range(Xtest.shape[0]):
    output = KNNClassifier(Xstd, y, Xtest[i], K)
    predictions[i] = output

print("Predicted class for test data by KNN: ", predictions)
print("Actual class for test data from dataset: ", ytest)

Predicted class for test data by KNN:  [10.  6.  0.  8.  0.  9. 16.  7. 10.  9.]
Actual class for test data from dataset:  [10.  6.  0.  8.  0.  9. 16.  7. 10.  9.]


In [None]:
# function to calculate the accuaracy of prediction in percentage
def accuracy_percentage(actual_class, predicted_class):
    """
  actual_class: numpy array, shape = [N]
  prediceted_class: numpy array, shape = [N]
  RETURN
    percent: float value
  """
    percent = 0
    ## start your code here
    accurate = 0
    i=0
    while i <len(actual_class):
      if actual_class[i] == predicted_class[i]:
        accurate += 1
      i+=1
    percent = (accurate/ len(predicted_class)) * 100 
    
    
    ## end
    return percent

In [None]:
print("Accuracy {}%".format(accuracy_percentage(predictions, ytest)))

Accuracy 100.0%


In [None]:
# load the original training data
X, y = load_student_data(filename)

# standardize the data
Xstd = standardizeDataset(X)

# randomly choosen data from X and Xstd dataset
# X - dataset that is not standardized
# Xstd - standardized dataset
# In both cases, the class value y is unchanged
random_indx = np.asarray([9, 153, 91, 29, 20, 10, 138, 130, 1, 11, 25, 137, 120])
testX = X[random_indx]
testXstd = Xstd[random_indx]
testy = y[random_indx]


# compute final grade for the students in the test data using KNN
K = 3
predictedNoStd = np.empty(len(testy))
predictedStd = np.empty(len(testy))

# predict the classes for test data with standardization of input and test data
# predictedNoStd - has the classes predicted for test data without standardization
# predictedStd - has the classes predicted for test data with standardization

# call KNN without standardized dataset and test data testX. Record the predicated
# class in predictedNoStd numpy array

# you need to write your code here
count = 0
for test in testX:
    predictedNoStd[count] = KNNClassifier(X, y, test, K)
    count += 1

# call KNN with standardized dataset and test data testXStd. Record the predicated
# class in predictedStd numpy array

# you need to write your code here
count = 0
for test in testXstd:
    predictedStd[count] = KNNClassifier(Xstd, y, test, K)
    count += 1

# print the classes predicted classes and the actual classes for the test data
print("Predicted class with standardization: ", predictedStd)
print("Predicted class without standardization: ", predictedNoStd)
print("Actual class for test data: ", testy)

# print the accuracy of KNN with and without standardizing dataset
# you need to write your code here
print(
    "Accuracy of KNN with standardization: ", accuracy_percentage(testy, predictedStd)
)
print(
    "Accuracy of KNN without standardization: ",
    accuracy_percentage(testy, predictedNoStd),
)

Predicted class with standardization:  [15.  0. 18. 11. 15.  9. 12.  0.  6. 12.  8.  0. 15.]
Predicted class without standardization:  [18.  8.  8.  9. 15.  8.  9.  9.  8. 15.  9. 15.  8.]
Actual class for test data:  [15.  0. 18. 11. 15.  9. 12.  0.  6. 12.  8.  0. 15.]
Accuracy of KNN with standardization:  100.0
Accuracy of KNN without standardization:  7.6923076923076925
