# 0. Libraries

In [301]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts
from scipy.spatial.distance import cdist
import seaborn as sns


from sklearn import datasets
from sklearn.datasets import make_regression, make_classification
from sklearn.base import BaseEstimator
from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, mean_squared_error

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

# 0.1 Batch Generator

In [302]:
def batch_generator(X, y, shuffle_=True, batch_size=1):
    """
    Batch generator
    X          - features matrix
    y_batch    - answers vector
    shuffle    - is it necessary to randomly shuffle the selection
    batch_size - batch ( 1 for SGD, > 1 for mini-batch GD)
    Generates subsample for an interation of descent (X_batch, y_batch)
    """
    n = X.shape[0]
    if shuffle_:
        X, y = shuffle(X, y)
    for i in range(0, n, batch_size):
        try:
            X_batch, y_batch = X[i:i + batch_size], y[i:i + batch_size]
        except IndexError:
            X_batch, y_batch = X[i::], y[i::]
        yield (X_batch, y_batch)

# 1. k-Nearest neighbors classifier

In [303]:
class MyKNeighborsClassifier(BaseEstimator):
    """kNN classifier as in sklearn.
        n_neighbors -> int,
        metric -> {‘canberra’, 
        ‘chebyshev’, 
        ‘cityblock’, 
        ‘correlation’, 
        ‘cosine’, 
        ‘dice’,
        ‘euclidean’,
        ‘hamming’,
        ‘jaccard’,
        ‘jensenshannon’, 
        ‘kulczynski1’,
        ‘mahalanobis’, 
        ‘matching’, 
        ‘minkowski’,
        ‘rogerstanimoto’, 
        ‘russellrao’,
        ‘seuclidean’, 
        ‘sokalmichener’, 
        ‘sokalsneath’, 
        ‘sqeuclidean’,
        ‘yule’}
    """
    def __init__(self, n_neighbors, algorithm='brute', metric = 'euclidean'):
        self.n_neighbors = n_neighbors
        self.metric = metric
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X):
        distance_matrix = cdist(X, self.X_train, metric = self.metric)
        ind_matrix = np.argsort(distance_matrix, axis = 1)[:, :self.n_neighbors]
        result = [np.argmax(np.bincount(self.y_train[ind_matrix][k])) for k in range(len(ind_matrix))]
        return np.array(result)
    

# 1.1 Test 

In [304]:
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1, stratify=iris.target)

In [305]:
clf = KNeighborsClassifier(n_neighbors=2, algorithm='brute')
my_clf = MyKNeighborsClassifier(n_neighbors=2, algorithm='brute', metric = 'cosine')

In [306]:
clf.fit(X_train, y_train)
my_clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_mypred = my_clf.predict(X_test)

In [307]:
print(f"My f1-score = {f1_score(y_test, y_mypred, average='macro')}")
print(f"Scikit-learn f1-score = {f1_score(y_test, y_pred, average='macro')}")

My f1-score = 1.0
Scikit-learn f1-score = 0.9326599326599326


# 2. Linear Regression

In [308]:
class MyLinearRegression():
    def __init__(self, alpha=0.001, n_iters=1000):
        self.weights = None
        self.alpha = alpha
        self.n_iters = n_iters
        
    def gradient_step(self, weights, weights_grad):
        return weights - self._alpha * weights_grad
             
            
    def grad_func(self, X_batch, y_batch, B):
        return 2 * X_batch.T.dot(np.dot(X_batch, self.weights) - y_batch) / B
    
    def fit(self, X, y):
        B = 1
        n, m = X.shape
        self.weights = np.random.normal(size=(m+1,))
        ones_X = np.concatenate((np.ones((n, 1)), X), axis=1)
        for i in range(self.n_iters):
            for X_batch, y_batch in batch_generator(ones_X, y, batch_size=5):
                grad = self.grad_func(X_batch, y_batch, B) 
                self.weights -= self.alpha * grad
                
    def predict(self, X):
        n, m = X.shape
        ones_X = np.concatenate((np.ones((n, 1)), X), axis=1)
        return np.dot(ones_X, self.weights)

# 2.1 Test

In [309]:
X, y, coef = make_regression(n_samples = 10_000, n_features=5, n_informative = 5, noise = 0.4, coef=True, bias = 10)

In [310]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [311]:
reg = LinearRegression()
reg.fit(X_train, y_train)
reg_pred = reg.predict(X_test)
print(f"w = {reg.coef_}\nb = {reg.intercept_}")
print(f"MSE = {mean_squared_error(reg_pred, y_test):.6}")

w = [82.81040567 63.14581477 41.10830795 91.81265587 69.7716958 ]
b = 10.003370547754722
MSE = 0.164167


In [312]:
myreg = MyLinearRegression()
myreg.fit(X_train, y_train)
reg_pred = myreg.predict(X_test)
print(f"w = {myreg.weights}\n")
print(f"MSE = {mean_squared_error(reg_pred, y_test):.6}")

w = [ 9.97575072 82.82760533 63.14873993 41.11977718 91.81230787 69.75720842]

MSE = 0.164221


# 3 Logisctic Regression

In [313]:
class MyLogisticRegression():
    def __init__(self, alpha=0.001, n_iters=1000):
        self.weights = None
        self.alpha = alpha
        self.n_iters = n_iters
        
    def gradient_step(self, weights, weights_grad):
        return weights - self._alpha * weights_grad
          
            
    def grad_func(self, X_batch, y_batch, B):
        return 2 * X_batch.T.dot(self.sigmoid(np.dot(X_batch, self.weights)) - y_batch)/ B
    
    
    def fit(self, X, y):
        B = 2
        n, m = X.shape
        self.weights = np.random.normal(size=(m+1,))
        ones_X = np.concatenate((np.ones((n, 1)), X), axis=1)
        for i in range(self.n_iters):
            for X_batch, y_batch in batch_generator(ones_X, y, batch_size=B):
                grad = self.grad_func(X_batch, y_batch, B) 
                self.weights -= self.alpha * grad
      
    def sigmoid(self, X):  
        return 1. / (1. + np.exp(-X))
    
    
    def predict_proba(self, X):
        return self.sigmoid(np.dot(X, self.weights))
    

    def predict(self, X, threshold=0.5):
        n, m = X.shape
        ones_X = np.concatenate((np.ones((n, 1)), X), axis=1)
        y_pred = self.predict_proba(ones_X) > threshold
        return y_pred

# 3.1 Тест

In [314]:
X, y = make_classification(n_samples = 10_000, n_features=3, n_redundant=0)

In [315]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [316]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"w = {clf.coef_}\nb = {clf.intercept_}")
print(f"f1-score = {f1_score(y_pred, y_test):.6}")

w = [[-2.93248428e-03 -1.32504511e-02  3.19619315e+00]]
b = [-0.09591955]
f1-score = 0.914767


In [317]:
myclf = MyLogisticRegression()
myclf.fit(X_train, y_train)
y_pred = myclf.predict(X_test)
print(f"w = {myclf.weights}\n")
print(f"f1-score = {f1_score(y_pred, y_test):.6}")

w = [-0.09653199 -0.00454937 -0.01407133  3.21411689]

f1-score = 0.914767
