In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 
from sklearn.utils import shuffle 
from collections import Counter 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
import keras
from keras.utils import to_categorical
import os
from keras.layers import *
from keras.models import Sequential, Model
from keras import optimizers

In [None]:
def parse_arff(filename):
    """Read an ARFF file into raw rows and attribute names.

    Lines beginning with ``@attribute`` (any letter case) contribute their
    second whitespace-separated token as a column name. Any other ``@``
    directive, blank or whitespace-only line, and ``%`` comment line is
    skipped. Every remaining line is treated as a data row and split on
    commas; the values are returned as strings, unconverted.

    Args:
        filename: Path of the ARFF file to read.

    Returns:
        A ``(data, columns)`` tuple. ``data`` is a list of rows (each a list
        of strings) and ``columns`` is the list of attribute names, both in
        file order.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    columns = []
    data = []

    # `with` closes the handle even on error. A failed open now propagates
    # instead of printing a hint and then crashing on an unbound variable.
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('%'):
                # Blank line or ARFF comment: carries no data.
                continue
            if line.lower().startswith('@attribute'):
                tokens = line.split()
                if len(tokens) > 1:
                    columns.append(tokens[1])
            elif line.startswith('@'):
                # Other directives (@relation, @data, ...) are ignored.
                continue
            else:
                data.append(line.split(','))

    return data, columns

In [None]:
# Parse the ARFF training set into raw string rows plus column names.
data, columns = parse_arff('drive/MyDrive/content/Training Dataset.arff')
# Build the frame from the raw string cells, then convert explicitly, so the
# int8 cast does not depend on how the constructor treats string input.
df = pd.DataFrame(data, columns = columns).astype(np.int8)
# The last column is the target, all others are features. A fixed random_state
# makes the split (and every score derived from it) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]],
    df[df.columns[-1]],
    random_state=42,
)

In [None]:
# Materialise the four split parts as int8 NumPy arrays.
x_train, y_train, x_test, y_test = (
    np.array(part).astype(np.int8)
    for part in (x_train, y_train, x_test, y_test)
)

In [None]:
# One-hot encode every feature value in both feature arrays.
# NOTE(review): to_categorical is documented for class indices starting at 0.
# If the features contain -1 (the labels do; they are remapped to 0 in a later
# cell), -1 is presumably treated as a negative index and lands in the same
# slot as the largest value, making the two indistinguishable. TODO confirm,
# and consider shifting values to start at 0 with an explicit num_classes.
x_train = to_categorical(x_train)
x_test = to_categorical(x_test)

In [None]:
# Recode the label -1 as 0 in both target vectors; every other label is kept.
y_train, y_test = (
    np.where(target == -1, 0, target)
    for target in (y_train, y_test)
)

In [None]:
# Search space for the Keras experiments.
# Huber loss instance, tried alongside the losses referenced by name.
loss_fn = keras.losses.Huber(delta=1.0)
# SGD with momentum; `learning_rate` replaces the deprecated `lr` keyword.
opt1 = keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
# The custom optimizer and loss are listed as objects: the strings 'opt1' and
# 'loss_fn' are not names Keras can resolve, so compile() would reject them.
# NOTE: this list rebinds the name `optimizers`, which the import cell also
# binds to the keras.optimizers module; `keras.optimizers` itself still works.
optimizers = ['sgd', 'rmsprop', 'adam', 'adadelta', 'adagrad', 'adamax', 'nadam', 'ftrl', opt1]
activation = ['softplus', 'softsign', 'selu', 'elu', 'exponential', 'tanh', 'sigmoid', 'relu']
binary_loss = ['binary_crossentropy', 'categorical_crossentropy', 'hinge', 'squared_hinge', loss_fn]

In [None]:
# One uncompiled network per (first hidden activation, output activation)
# pair: Flatten -> Dense(64) -> Dense(128) -> Dense(256) -> Dropout -> Dense(1).
keras_models = []
for act in activation:
    for acti in activation:
        model = keras.Sequential([
            Flatten(input_shape=(30,2)),
            Dense(64, activation=act),
            Dense(128, activation='relu'),
            Dense(256, activation='relu'),
            Dropout(0.2),
            Dense(1, activation=acti),
        ])
        keras_models.append(model)

In [None]:
# Train every architecture with every optimizer/loss pairing and record the
# per-epoch curves as [loss, val_loss, accuracy, val_accuracy].
model_hist = []
for m in keras_models:
    for opt in optimizers:
        for loss in binary_loss:
            # Train a freshly initialised copy. Re-compiling and re-fitting
            # `m` itself would carry the weights learnt under the previous
            # pairing into the next one and make the runs incomparable.
            candidate = keras.models.clone_model(m)
            # Optimizer instances carry per-model state, so rebuild them from
            # their config for each run; string names are resolved by Keras.
            fresh_opt = opt if isinstance(opt, str) else type(opt).from_config(opt.get_config())
            candidate.compile(loss=loss, optimizer=fresh_opt, metrics=['accuracy'])
            modelhistory = candidate.fit(x_train, y_train, epochs = 10, validation_data=(x_test, y_test))
            curves = modelhistory.history
            acc = [curves['loss'], curves['val_loss'], curves['accuracy'], curves['val_accuracy']]
            model_hist.append(acc)

In [None]:
# Number of (model, optimizer, loss) combinations in the full grid.
len(keras_models)*len(optimizers)*len(binary_loss)

3840

In [None]:
# Number of training runs recorded in model_hist.
len(model_hist)

768

In [None]:
# Each entry is [loss, val_loss, accuracy, val_accuracy]; this reads the
# validation accuracy of run 0 at epoch index 3.
model_hist[0][3][3]

0.7586830854415894

In [None]:
# Validation accuracy of every recorded run at its selected epoch.
# `argmax` was not defined by any cell of this notebook, so the loop raised a
# NameError. It is derived here as the epoch index with the highest validation
# accuracy per run. NOTE(review): assumed to be the original intent; confirm.
argmax = [np.argmax(m[3]) for m in model_hist]
accmax = [m[3][arg] for m, arg in zip(model_hist, argmax)]

In [None]:
# Entry of accmax for the run at position 264.
accmax[264]

0.7673661112785339

In [None]:
# Deeper 32-64-128-256-512-256 network with a single sigmoid output unit.
model = keras.Sequential()
model.add(Flatten(input_shape=(30,2)))
model.add(Dense(32,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(512,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
# `opt2` was listed below but never defined (NameError), while this
# lower-learning-rate SGD was bound to the otherwise unused name `opt`.
# NOTE(review): presumably this optimizer is the intended `opt2`; confirm.
# `learning_rate` replaces the deprecated `lr` keyword.
opt2 = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
opt = opt2  # keep the original name bound as well
activation = ['softplus', 'softsign', 'selu', 'elu', 'exponential', 'tanh', 'sigmoid', 'relu']
# opt1 and loss_fn come from the earlier search-space cell.
optimizers = ['sgd', 'rmsprop', 'adam', 'adadelta', 'adagrad', 'adamax', 'nadam', 'ftrl', opt1, opt2]
binary_loss = ['binary_crossentropy', 'categorical_crossentropy', 'hinge', 'squared_hinge', loss_fn]

In [None]:
# Compact 64-128-128 network with a single softplus output unit.
# Dropout is intentionally left out of this variant.
model = keras.Sequential([
    Flatten(input_shape=(30,2)),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='softplus'),
])

In [None]:
# Sweep every optimizer/loss pairing over the current `model` architecture and
# keep [best training accuracy, best validation accuracy] for each run.
model_hist = []
for opt in optimizers:
    for loss in binary_loss:
        # Start each run from freshly initialised weights. Re-fitting the same
        # `model` would let one diverged run poison every run after it.
        candidate = keras.models.clone_model(model)
        # Optimizer instances carry per-model state, so rebuild them from
        # their config for each run; string names are resolved by Keras.
        fresh_opt = opt if isinstance(opt, str) else type(opt).from_config(opt.get_config())
        candidate.compile(loss=loss, optimizer=fresh_opt, metrics=['accuracy'])
        modelhistory = candidate.fit(x_train, y_train, epochs = 7, validation_data=(x_test, y_test))
        curves = modelhistory.history
        model_hist.append([np.max(curves['accuracy']), np.max(curves['val_accuracy'])])
# Later cells index the result with `.T`, which needs an array, not a list.
model_hist = np.array(model_hist)

In [None]:
# Positions of the best and worst training accuracy (column 0) and validation
# accuracy (column 1) across runs, followed by the values at the hard-coded
# positions 10, 12, 10 and 11. Requires model_hist to be a NumPy array, since
# lists have no `.T`.
np.argmax(model_hist.T[0]), np.argmin(model_hist.T[0]), np.argmax(model_hist.T[1]), np.argmin(model_hist.T[1]), model_hist.T[0][10], model_hist.T[0][12], model_hist.T[1][10],model_hist.T[1][11]

(10,
 12,
 10,
 11,
 0.7502110600471497,
 0.4459052085876465,
 0.7659189701080322,
 0.43451520800590515)

In [None]:
# Rows 10 to 12, the number of runs, and the mean of column 0 (training
# accuracy) and column 1 (validation accuracy) over all runs.
model_hist[10], model_hist[11], model_hist[12], len(model_hist), sum(model_hist.T[0])/len(model_hist.T[0]), sum(model_hist.T[1])/len(model_hist.T[1])

(array([0.75021106, 0.76591897]),
 array([0.44855869, 0.43451521]),
 array([0.44590521, 0.43451521]),
 45,
 0.5338640332221984,
 0.534394606616762)

In [None]:
# Show which (optimizer, loss) pair produced rows 10, 11 and 12 of model_hist.
# Those row numbers are 0-based (they come from np.argmax and direct
# indexing), so the combinations are enumerated from 0 in sweep order. The
# previous counter was incremented before the comparison and therefore
# reported the pair one position too early.
combos = [(opt, loss) for opt in optimizers for loss in binary_loss]
for idx, (opt, loss) in enumerate(combos):
    if idx in (10, 11, 12):
        print(opt, loss)

rmsprop loss_fn
adam binary_crossentropy
adam categorical_crossentropy


In [None]:
# Display the per-run [best training accuracy, best validation accuracy] rows.
model_hist

array([[0.74345678, 0.75759768],
       [0.66156071, 0.66968161],
       [0.7408033 , 0.75904489],
       [0.64648414, 0.70694643],
       [0.7477988 , 0.76157743],
       [0.74635148, 0.76555717],
       [0.70546377, 0.71056437],
       [0.74671328, 0.76085383],
       [0.64383066, 0.67329955],
       [0.74852252, 0.76447177],
       [0.75021106, 0.76591897],
       [0.44855869, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.44590521, 0.43451521],
       [0.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Test accuracy of a random forest for every combination of class_weight,
# max_features, criterion and warm_start (in that nesting order).
# NOTE: the four option lists are defined in a later cell of this notebook;
# that cell has to be run first.
# scikit-learn estimators need 2-D input: reshape(len, -1) leaves 2-D data
# unchanged and flattens one-hot encoded (3-D) features per sample.
rf_train = np.asarray(x_train).reshape(len(x_train), -1)
rf_test = np.asarray(x_test).reshape(len(x_test), -1)
random_acc = []
for weight in class_weight:
    # Named `max_feat` so the loop does not rebind the builtin `max`.
    for max_feat in max_features:
        for cri in criterion:
            for s in warm_start:
                forest = RandomForestClassifier(
                    warm_start = s,
                    class_weight = weight,
                    max_features = max_feat,
                    criterion = cri,
                )
                forest.fit(rf_train, y_train)
                random_acc.append(accuracy_score(y_test, forest.predict(rf_test)))

In [None]:
# Mean accuracy over the grid, number of runs, and the accuracies stored at
# positions 47 and 16.
sum(random_acc) / len(random_acc), len(random_acc), random_acc[47], random_acc[16]

(0.966074228171732, 48, 0.9696092619392185, 0.961287988422576)

In [None]:
# Hyper-parameter option lists for the random-forest and decision-tree grids.
# NOTE(review): 'auto' is presumably equivalent to 'sqrt' for classifiers and
# may be rejected by recent scikit-learn releases; confirm against the
# installed version.
max_features = [None, 'auto', 'sqrt', 'log2']
criterion = ['gini', 'entropy']
# Only used by the decision-tree grid.
splitter = ['best', 'random']
# Not referenced by any visible cell.
bootstrap  = [True, False]
class_weight = [None, 'balanced', 'balanced_subsample']
warm_start = [True, False]

In [None]:
# Identify the hyper-parameters behind random_acc[16] and random_acc[47].
# The combinations are generated in the same nesting order as the cell that
# filled random_acc (class_weight -> max_features -> criterion -> warm_start)
# and counted from 0, so position t matches random_acc[t]. The previous
# version nested the loops differently and counted from 1, and rebound the
# builtin `max`.
forest_grid = [
    (max_feat, cri, cla, w)
    for cla in class_weight
    for max_feat in max_features
    for cri in criterion
    for w in warm_start
]
for t, (max_feat, cri, cla, w) in enumerate(forest_grid):
    if t in (16, 47):
        print(max_feat, cri, cla, w)

auto gini balanced False
log2 entropy balanced_subsample True


In [None]:
# Fit one decision tree per combination of class_weight, max_features,
# criterion and splitter. The last class_weight entry ('balanced_subsample')
# is left out of this grid.
# scikit-learn estimators need 2-D input: reshape(len, -1) leaves 2-D data
# unchanged and flattens one-hot encoded (3-D) features per sample.
tree_train = np.asarray(x_train).reshape(len(x_train), -1)
tree_models = []
for weight in class_weight[:-1]:
    # Named `max_feat` so the loop does not rebind the builtin `max`.
    for max_feat in max_features:
        for cri in criterion:
            for split in splitter:
                tree = DecisionTreeClassifier(
                    class_weight = weight,
                    max_features = max_feat,
                    splitter = split,
                    criterion = cri,
                )
                tree_models.append(tree.fit(tree_train, y_train))

In [None]:
# Test accuracy of every fitted decision tree, in grid order.
# A comprehension keeps the loop variable local; the previous loop rebound the
# global `model`, which other cells use for the Keras network.
# scikit-learn estimators need 2-D input: reshape(len, -1) leaves 2-D data
# unchanged and flattens one-hot encoded (3-D) features per sample.
tree_test = np.asarray(x_test).reshape(len(x_test), -1)
tree_acc = [accuracy_score(y_test, tree.predict(tree_test)) for tree in tree_models]

In [None]:
# Mean test accuracy over all fitted decision trees.
sum(tree_acc) / len(tree_acc)

0.9573873914616499

In [None]:
# Positions of the most and least accurate trees within tree_acc.
np.argmax(tree_acc), np.argmin(tree_acc)

(1, 15)

In [None]:
# Display the full list of per-tree test accuracies.
tree_acc

[0.9609261939218524,
 0.9645441389290883,
 0.9623733719247467,
 0.9630969609261939,
 0.9551374819102749,
 0.9511577424023154,
 0.9558610709117221,
 0.9598408104196816,
 0.9522431259044862,
 0.9558610709117221,
 0.9645441389290883,
 0.9518813314037626,
 0.9562228654124457,
 0.9511577424023154,
 0.9551374819102749,
 0.9482633863965267,
 0.9598408104196816,
 0.9620115774240231,
 0.9645441389290883,
 0.9634587554269175,
 0.9569464544138929,
 0.9591172214182344,
 0.9573082489146165,
 0.9580318379160637,
 0.9576700434153401,
 0.9522431259044862,
 0.9598408104196816,
 0.9497105643994211,
 0.9565846599131693,
 0.9576700434153401,
 0.9544138929088278,
 0.9587554269175108]

In [None]:
class KNN:
    """Brute-force k-nearest-neighbours classifier with a selectable distance.

    Supported ``formula`` names are 'euclidean', 'manhattan', 'chebsyhey'
    (the original spelling; 'chebyshev' is accepted as an alias),
    'minkowski' (p=1), 'wminkowski' (p=1, unit weights) and 'seuclidean'
    (V=2). The distance methods can also be called directly with other
    parameters.
    """

    def __init__(self, n_neighbors = 3):
        """Store the number of nearest neighbours that vote on a prediction."""
        self.n_neighbors = n_neighbors

    def fit(self, features, labels, formula = 'euclidean'):
        """Memorise the training set and the distance to use.

        Args:
            features: Sequence of training samples (array-likes of equal shape).
            labels: One label per sample. A 1-D sequence, or a 2-D column
                whose first entry per row is the label.
            formula: Name of the distance (see the class docstring).

        Raises:
            ValueError: If ``formula`` is unknown or the number of labels does
                not match the number of samples.
        """
        # Fail at fit time rather than on the first prediction.
        self._resolve_metric(formula)

        targets = np.asarray(labels)
        if targets.ndim > 1:
            # Column-shaped labels: keep the first entry of every row.
            targets = targets.reshape(targets.shape[0], -1)[:, 0]
        if len(features) != len(targets):
            raise ValueError(
                'features and labels differ in length: %d != %d'
                % (len(features), len(targets))
            )

        self.features = features
        self.labels = labels
        self._targets = targets
        self.formula = formula
        self.distances = {}  # unused; kept so the attribute still exists
        self.distance = []   # [distance, label] pairs of the latest query

    def predict(self, x_test):
        """Return the majority label among the nearest training samples.

        Args:
            x_test: A single sample shaped like one training sample.

        Returns:
            The most common label among the ``n_neighbors`` closest samples.

        Raises:
            ValueError: If the configured formula is unknown or no neighbour
                is available to vote.
        """
        metric = self._resolve_metric(self.formula)
        # Rebuilt for every query. Appending to the list kept from previous
        # calls mixed distances of earlier queries into the vote and made
        # every call slower than the last.
        self.distance = [
            [metric(sample, x_test), label]
            for sample, label in zip(self.features, self._targets)
        ]
        # Sort on the distance only, so labels never need to be comparable.
        nearest = sorted(self.distance, key=lambda pair: pair[0])[:self.n_neighbors]
        if not nearest:
            raise ValueError('no neighbours available: empty training set or n_neighbors == 0')
        votes = [label for _, label in nearest]
        return Counter(votes).most_common(1)[0][0]

    def _resolve_metric(self, formula):
        """Map a formula name to a two-argument distance callable."""
        metrics = {
            'euclidean': self.euclidean,
            'manhattan': self.manhattan,
            'chebsyhey': self.chebsyhey,
            'chebyshev': self.chebsyhey,
            'minkowski': lambda a, b: self.minkowski(a, b, 1),
            'wminkowski': lambda a, b: self.wminkowski(a, b, 1, np.ones(np.shape(a))),
            'seuclidean': lambda a, b: self.seuclidean(a, b, 2),
        }
        try:
            return metrics[formula]
        except KeyError:
            raise ValueError(
                'unknown formula %r; expected one of %s' % (formula, sorted(metrics))
            )

    @staticmethod
    def _difference(feature, y):
        """Element-wise ``feature - y`` in floating point.

        Casting first avoids wrap-around when the inputs are small integer
        arrays such as int8.
        """
        return np.asarray(feature, dtype=float) - np.asarray(y, dtype=float)

    def euclidean(self, feature, y):
        """sqrt(sum((x - y)^2))"""
        return np.linalg.norm(self._difference(feature, y))

    def manhattan(self, feature, y):
        """sum(|x - y|)"""
        return np.sum(np.abs(self._difference(feature, y)))

    def chebsyhey(self, feature, y):
        """max(|x - y|)"""
        return np.max(np.abs(self._difference(feature, y)))

    def minkowski(self, feature, y, p):
        """sum(|x - y|^p)^(1/p)"""
        return np.sum(np.abs(self._difference(feature, y)) ** p) ** (1 / p)

    def wminkowski(self, feature, y, p, w):
        """sum(|w * (x - y)|^p)^(1/p), with ``w`` applied element-wise."""
        weighted = np.asarray(w, dtype=float) * self._difference(feature, y)
        return np.sum(np.abs(weighted) ** p) ** (1 / p)

    def seuclidean(self, feature, y, V):
        """sqrt(sum((x - y)^2 / V)); ``V`` is a scalar or per-component array."""
        return np.sqrt(np.sum(self._difference(feature, y) ** 2 / V))


In [None]:
# Create the classifier with its default n_neighbors (3) and hand it the
# training data; fit() defaults to the 'euclidean' formula.
knn = KNN()
knn.fit(x_train, y_train)

In [None]:
# Classify the test samples one at a time, collecting predictions in order.
# Because results are appended as they are produced, an interrupted run
# leaves the predictions made so far in ypred.
ypred = []
for x in x_test:
  ypred.append(knn.predict(x))

KeyboardInterrupt: ignored

In [None]:
# Number of predictions collected so far.
len(ypred)

1182

In [None]:
# Score the predictions against the matching prefix of y_test. Using
# len(ypred) instead of a hard-coded 1182 keeps this correct whether the
# prediction loop finished or was interrupted at a different point.
accuracy_score(y_test[:len(ypred)], ypred)

0.4323181049069374