In [73]:
# task 5
import numpy as np

# Максимизация правдоподобия

class LogisticRegression:
  def __init__(self, bias=False, iterations = 1000, learning_rate = 0.01):
    self.bias, self.iterations, self.learning_rate = bias, iterations, learning_rate
    self.sigmoid = lambda z : 1 / (1 + np.exp(-z))
    self.gradient = lambda x, y, coef_ : x.T @ (self.sigmoid(x @ coef_) - y)

  def get_params(self):
    return {
        'learning_rate': self.learning_rate,
        'iterations': self.iterations,
        'bias': self.bias
    }

  def set_params(self, **params):
      for key, value in params.items():
          setattr(self, key, value)
      return self

  def log_loss(self, x, y):
    y_pred = self.sigmoid( x @ self.coef_)
    return -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)).mean()

  def fit(self, x, y):
    if self.bias: 
      np.insert(x, 0, 1, axis=1)
    self.coef_ = np.zeros_like(x[0])

    for _ in range(self.iterations):
      self.coef_ -= self.learning_rate * self.gradient(x, y, self.coef_)
    return self

  def predict(self, x):
    if self.bias: 
      np.insert(x, 0, 1, axis=1)
    return np.where(self.sigmoid(x @ self.coef_) > 0.5, 1, 0)

  def clone(self):
    return LogisticRegression(self.bias, self.iterations, self.learning_rate)

  def statistics(self, x, y):
    out = self.predict(x)
    outputs = {'False': 1, 'True': 0}
    tp, tn, fp, fn = 0, 0, 0, 0
    for out_i, y_i in zip(out, y):
      if out_i == outputs['True']:
        if y_i == outputs['True']:
          tp += 1
        else:
          fp += 1
      else:
        if y_i == outputs['True']:
          fn += 1
        else:
          tn += 1
    return {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}

  def score(self, x, y):
    stats = self.statistics(x, y)
    return (stats['TP'] + stats['TN']) / (stats['TP'] + stats['TN'] + stats['FP'] + stats['FN'])

  def accuracy(self, x, y):
    stats = self.statistics(x, y)
    return (stats['TP'] + stats['TN']) / (stats['TP'] + stats['TN'] + stats['FP'] + stats['FN'])

  def precision(self, x, y):
    stats = self.statistics(x, y)
    return stats['TP'] / (stats['TP'] + stats['FP'])

  def recall(self, x, y):
    stats = self.statistics(x, y)
    return stats['TP'] / (stats['TP'] + stats['FN'])

  def f1_score(self, x, y):
    precision = self.precision(x, y)
    recall = self.recall(x, y)
    return 2 * (precision * recall) / (precision + recall)

  def confusion_matrix(self, x, y):
    stats = self.statistics(x, y)
    return np.array([[stats['TN'], stats['FP']], [stats['FN'], stats['TP']]])


In [74]:
class LogisticRegressionNewton(LogisticRegression):
  def __init__(self, bias=False, iterations = 10000):
    super().__init__(bias, iterations)
    self.hessian = lambda x, coef_: x.T @ np.diag(self.sigmoid(x @ coef_) * (1 - self.sigmoid(x @ coef_))) @ x

  def fit(self,x,y):
    if self.bias: 
      np.insert(x, 0, 1, axis=1)
    self.coef_ = np.zeros_like(x[0])
    for _ in range(self.iterations):
      self.coef_ -= np.linalg.pinv(self.hessian(x, self.coef_)) @ self.gradient(x, y, self.coef_)
    return self

  def get_params(self, deep=True):
    return {
        'iterations': self.iterations,
        'bias': self.bias
    }

  def clone(self):
    return LogisticRegressionNewton(self.bias, self.iterations)


In [75]:
class LogisticRegressionAdaDelta(LogisticRegression):
  def __init__(self, bias=False, iterations = 10000, epsilon = 1e-8):
    super().__init__(bias, iterations)
    self.epsilon = epsilon

  def fit(self, x, y):
    if self.bias: 
      np.insert(x, 0, 1, axis=1)
    self.coef_ = np.zeros_like(x[0])
    G = np.zeros_like(x[0])

    for _ in range(self.iterations):
      grad  = self.gradient(x, y, self.coef_)
      G += grad ** 2
      adaptive_lr = self.learning_rate / (np.sqrt(G) + self.epsilon)
      self.coef_ -= adaptive_lr * grad
    return self

  def get_params(self, deep=True):
    return {
        'iterations': self.iterations,
        'bias': self.bias,
        'epsilon': self.epsilon
    }

  def clone(self):
    return LogisticRegressionAdaDelta(self.bias, self.iterations, self.epsilon)

In [76]:
#task 1-4

In [77]:
import pandas as pd

In [78]:
data = pd.read_csv("./train.csv").dropna()

In [79]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [80]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,78.682469
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,76.347843
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292


In [81]:
data["Cabin"] = data["Cabin"].apply(lambda x: x.split()[0][0])
x = pd.get_dummies(data.drop(columns = ["Name" ,"Ticket", "Survived"]), columns=["Cabin","Sex","Embarked"]).astype(float).values
y = data["Survived"].values

In [82]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    n_samples = len(X)
    n_test = int(n_samples * test_size)
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test


In [83]:
def cross_val_score(model, x, y, cv = 5):
  estimator = model.clone()
  scores = list()
  for _ in range(cv):
    estimator = model.clone()
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    estimator.fit(x_train, y_train)
    scores.append(estimator.score(x_test, y_test))
  return scores




In [84]:

def grid_search_custom(model, param_grid, X, y, cv=5):
    from itertools import product
    param_combinations = list(product(*param_grid.values()))
    param_names = list(param_grid.keys())
    dct = dict()

    for params in param_combinations:
        param_dict = dict(zip(param_names, params))

        model.set_params(**param_dict)

        scores = cross_val_score(model, X, y, cv=cv)
        mean_score = np.mean(scores)
        dct[params] = (model, mean_score)
    return dct




In [85]:
model = LogisticRegression()
param_grid = {
    'lerning_rate': [0.1, 0.01, 0.001],
    'iterations': [100, 200, 500, 1000],
}
grid_search_custom(model, param_grid, x, y)

  self.sigmoid = lambda z : 1 / (1 + np.exp(-z))


{(0.1, 100): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.6055555555555555)),
 (0.1, 200): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.5944444444444444)),
 (0.1, 500): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.6722222222222223)),
 (0.1, 1000): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.5611111111111111)),
 (0.01, 100): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.4833333333333334)),
 (0.01, 200): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.6277777777777779)),
 (0.01, 500): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.5111111111111112)),
 (0.01, 1000): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.5333333333333333)),
 (0.001, 100): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.6555555555555556)),
 (0.001, 200): (<__main__.LogisticRegression at 0x10ff21a90>,
  np.float64(0.6055555555555555)),
 (0.001, 500): (<__main__.LogisticRegres

In [86]:
model = LogisticRegressionNewton()
param_grid = {
    'lerning_rate': [ 0.01, 0.001],
    'iterations': [1 ,3 ,10],
}
grid_search_custom(model, param_grid, x, y)


{(0.01, 1): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.7722222222222221)),
 (0.01, 3): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.7055555555555556)),
 (0.01, 10): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.7555555555555555)),
 (0.001, 1): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.7055555555555555)),
 (0.001, 3): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.711111111111111)),
 (0.001, 10): (<__main__.LogisticRegressionNewton at 0x10ff21be0>,
  np.float64(0.8055555555555556))}

In [87]:
#task 6

In [88]:
model = LogisticRegression()
model.fit(x, y)
param_grid = {
    'lerning_rate': [0.1, 0.01, 0.001],
    'iterations': [100, 200, 500, 1000],
}
scores = grid_search_custom(model, param_grid, x, y)
best_model, score = max(scores.values(), key = lambda x: x[1])
print(score)
print("GD accuracy: %s, precision: %s, recall: %s, F1-Score: %s" %
 (best_model.accuracy(x,y), best_model.precision(x,y), best_model.recall(x,y), best_model.f1_score(x,y)))


  self.sigmoid = lambda z : 1 / (1 + np.exp(-z))


0.6722222222222223
GD accuracy: 0.6065573770491803, precision: 0.4375, recall: 0.7, F1-Score: 0.5384615384615384


In [89]:
model = LogisticRegressionNewton()
param_grid = {
    'iterations': [10, 30, 50, 100]
}
scores = grid_search_custom(model, param_grid, x, y)
best_model, score = max(scores.values(), key = lambda x: x[1])
print(score)

0.7555555555555555


In [90]:
model = LogisticRegressionNewton()
model.fit(x, y)
param_grid = {
    'iterations': [10, 30, 50, 100]
}
scores = grid_search_custom(model, param_grid, x, y)
best_model, score = max(scores.values(), key = lambda x: x[1])
print(score)
print("GD accuracy: %s, precision: %s, recall: %s, F1-Score: %s" %
 (best_model.accuracy(x,y), best_model.precision(x,y), best_model.recall(x,y), best_model.f1_score(x,y)))


0.7833333333333334
GD accuracy: 0.8306010928961749, precision: 0.7543859649122807, recall: 0.7166666666666667, F1-Score: 0.735042735042735
