In [1]:
import pandas
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_random_state
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
def print_metrics(Method, X, y, folds = 5, average = 'macro'):
  kf = StratifiedKFold(n_splits = folds, random_state = 123, shuffle = True)
  precision = np.zeros(folds)   
  recall = np.zeros(folds)  
  testAc = np.zeros(folds)
  trainAc = np.zeros(folds)
  for i, (trainI, valI) in enumerate(kf.split(X, y)):
    XT, yT = X.loc[trainI].to_numpy(), y.loc[trainI].to_numpy()
    XV, yV = X.loc[valI].to_numpy(), y.loc[valI].to_numpy()
    Method.fit(XT, yT)
    yP = Method.predict(XV)
    yTP = Method.predict(XT)
    precision[i] = precision_score(yV, yP, average = average)
    recall[i] = recall_score(yV, yP, average = average)
    trainAc[i] = accuracy_score(yT, yTP)
    testAc[i] = accuracy_score(yV, yP)    
  print("precision:", precision.mean(), "\nrecall:", recall.mean(), "\n\ntrain_accuracy:", trainAc.mean(), "\ntest_accuracy:", testAc.mean())

In [3]:
class LogRegression():
    def __init__(self, lr=0.01, num_iter=1000):
        self.lr = lr
        self.num_iter = num_iter

    def getIntercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        X = self.getIntercept(X)
        self.coef_ = np.zeros(X.shape[1])  # weights

        for _ in range(self.num_iter):
            z = np.dot(X, self.coef_)
            h = self.sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.coef_ -= self.lr * gradient

    def predict_prob(self, X):
        X = self.getIntercept(X)
        return self.sigmoid(np.dot(X, self.coef_))

    def predict(self, X):
        return self.predict_prob(X).round()

In [4]:
class KNN():
  def __init__(self, neighbors = 5):
    self.nei = neighbors
    
  def fit(self, X, y):
    self.X = X
    self.y = y.reshape((y.shape[0], 1))

  def Distances(self, p):
    t = self.X - p
    return np.sqrt((t ** 2).sum(1))

  def predict(self, X):
    n = X.shape[0]
    pred = np.zeros((n, 1))
    for i in range(n):
      d = self.Distances(X[i])
      sorted = self.y[np.argsort(d)].flatten()
      if sorted[:self.nei].sum() > self.nei / 2:
        pred[i] = 1.0
    return pred

In [5]:
class Node():
  def __init__(self, predClass):
    self.predClass = predClass
    self.index = 0
    self.threshold = 0
    self.left = None
    self.right = None

class DecisionTree():
  def __init__(self, maxDepth = 1, rf = False):
    self.maxDepth = maxDepth
    self.rf = rf

  def fit(self, X, y, maxFeatures = None):
    self.classes = len(set(y))
    if not self.rf:
      Features = X.shape[1]
    else:
      ind = np.random.choice(X.shape[0], X.shape[0])
      X, y = X[tuple([ind])], y[tuple([ind])]
      if maxFeatures is None:
        Features = np.sqrt(X.shape[1]).astype(int)
      else:
        Features = maxFeatures
    self.features = np.sort(np.random.choice(X.shape[1], Features, replace = False))
    self.tree = self.GrowTree(X, y)

  def predict(self, X):
    list = []
    for inputs in X:
      node = self.tree
      while node.left:
        if inputs[node.featureIndex] < node.threshold:
          node = node.left
        else:
          node = node.right
      list.append(node.predClass)
    return list

  def Split(self, X, y):
    m = y.size
    if m <= 1:
      return None, None
    parent = [np.sum(y == c) for c in range(self.classes)]
    bestGini = 1.0 - sum((n / m) ** 2 for n in parent)
    bestIdx, bestThr = None, None
    for idx in self.features:
      thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
      left = [0] * self.classes
      right = parent.copy()
      for i in range(1, m):
        c = classes[i - 1]
        left[c] += 1
        right[c] -= 1
        giniLeft = 1.0 - sum((left[x] / i) ** 2 for x in range(self.classes))
        giniRight = 1.0 - sum((right[x] / (m - i)) ** 2 for x in range(self.classes))
        gini = (i * giniLeft + (m - i) * giniRight) / m
        if thresholds[i] == thresholds[i - 1]:
          continue
        if gini < bestGini:
          bestGini = gini
          bestIdx = idx
          bestThr = (thresholds[i] + thresholds[i - 1]) / 2
    return bestIdx, bestThr

  def GrowTree(self, X, y, depth = 0):
    samplesPerClass = [np.sum(y == i) for i in range(self.classes)]
    predClass = np.argmax(samplesPerClass)
    node = Node(predClass = predClass)
    if depth < self.maxDepth:
      idx, thr = self.Split(X, y)
      if idx is not None:
        indicesLeft = X[:, idx] < thr
        XLeft, yLeft = X[indicesLeft], y[indicesLeft]
        XRight, yRight = X[~indicesLeft], y[~indicesLeft]
        node.featureIndex = idx
        node.threshold = thr
        node.left = self.GrowTree(XLeft, yLeft, depth + 1)
        node.right = self.GrowTree(XRight, yRight, depth + 1)
    return node

In [6]:
class SVM:
  def __init__(self, C = 1, maxIter = 100, eps = 0.01, randomState = None, verbose = 0):
    self.C = C
    self.maxIter = maxIter
    self.eps = eps
    self.randomState = randomState
    self.verbose = verbose

  def PartialGradient(self, X, y, i):
    g = np.dot(X[i], self.coef.T) + 1
    return g

  def Violation(self, g, y, i):
    smallest = np.inf
    for k in range(g.shape[0]):
      if k == y[i] and self.dualCoef[k, i] >= self.C:
        continue
      elif k != y[i] and self.dualCoef[k, i] >= 0:
        continue
      smallest = min(smallest, g[k])
    return g.max() - smallest

  def Solver(self, g, y, norms, i):
    Ci = np.zeros(g.shape[0])
    Ci[y[i]] = self.C
    beta_hat = norms[i] * (Ci - self.dualCoef[:, i]) + g / norms[i]
    z = self.C * norms[i]
    beta = projection_simplex(beta_hat, z)
    return Ci - self.dualCoef[:, i] - beta / norms[i]

  def fit(self, X, y):
    samples, features = X.shape
    classes = 4
    self.dualCoef = np.zeros((classes, samples), dtype = np.float64)
    self.coef = np.zeros((classes, features))
    norms = np.sqrt(np.sum(X ** 2, axis = 1))
    rs = check_random_state(self.randomState)
    ind = np.arange(samples)
    rs.shuffle(ind)
    violationInit = None
    for it in range(self.maxIter):
      violationSum = 0
      for idx in range(samples):
        i = ind[idx]
        if norms[i] == 0:
          continue
        g = self.PartialGradient(X, y, i)
        v = self.Violation(g, y, i)
        violationSum += v
        if v < 1e-12:
          continue
        delta = self.Solver(g, y, norms, i)
        self.coef += (delta * X[i][:, np.newaxis]).T
        self.dualCoef[:, i] += delta
      if it == 0:
        violationInit = violationSum
      vratio = violationSum / violationInit
      if vratio < self.eps:
        break
    return self

  def predict(self, X):
    decision = np.dot(X, self.coef.T)
    pred = decision.argmax(axis = 1)
    return pred

In [7]:
#для данных по lol
lol = pd.read_csv("lol_clean.csv")
lol.head()

Unnamed: 0.1,Unnamed: 0,blueWins,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience
0,0,0,1,9,6,11,0,0,17210,17039
1,1,0,0,5,5,5,0,0,14712,16265
2,2,0,0,7,11,4,1,1,16113,16221
3,3,0,0,4,5,5,1,0,15157,17954
4,4,0,0,6,6,6,0,0,16400,18543


In [8]:
lol = lol.drop('Unnamed: 0', axis = 1)
lol.head()

Unnamed: 0,blueWins,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience
0,0,1,9,6,11,0,0,17210,17039
1,0,0,5,5,5,0,0,14712,16265
2,0,0,7,11,4,1,1,16113,16221
3,0,0,4,5,5,1,0,15157,17954
4,0,0,6,6,6,0,0,16400,18543


In [9]:
X, y = lol.drop(["blueWins"], axis=1), lol["blueWins"]

In [10]:
%%time
print_metrics(LogRegression(), X, y)

precision: 0.6071437491170971 
recall: 0.511796369567983 

train_accuracy: 0.5128301928596787 
test_accuracy: 0.5127029160047148
Wall time: 2.46 s


In [11]:
%%time
print_metrics(LogisticRegression(), X, y)

precision: 0.7067735429738413 
recall: 0.706562360738132 

train_accuracy: 0.7072072768963358 
test_accuracy: 0.7065501973043613
Wall time: 149 ms


In [12]:
%%time
print_metrics(KNeighborsClassifier(n_neighbors = 4), X, y)

precision: 0.6427686119372595 
recall: 0.6333197794006036 

train_accuracy: 0.7550359629997526 
test_accuracy: 0.6335672116025213
Wall time: 1.39 s


In [13]:
%%time
print_metrics(KNN(neighbors = 4), X, y)

precision: 0.6428597908492639 
recall: 0.6334211992788997 

train_accuracy: 0.7550106593560278 
test_accuracy: 0.63366842617742
Wall time: 37.2 s


In [14]:
%%time
print_metrics(DecisionTree(maxDepth = 5), X, y)

precision: 0.7178974395107575 
recall: 0.7174751458578219 

train_accuracy: 0.7261615982963472 
test_accuracy: 0.7174823451032644
Wall time: 10.8 s


In [15]:
%%time
print_metrics(DecisionTreeClassifier(max_depth = 5), X, y)

precision: 0.7180096730132519 
recall: 0.717575746181546 

train_accuracy: 0.7261615982963472 
test_accuracy: 0.7175835596781632
Wall time: 99 ms


In [16]:
%%time
print_metrics(SVM(), X, y)

precision: 0.2504808076666838 
recall: 0.5 

train_accuracy: 0.5009616345148488 
test_accuracy: 0.5009616153333676
Wall time: 1min 1s


In [17]:
%%time
print_metrics(SVC(decision_function_shape = 'ovr'), X, y)

precision: 0.5000392498484223 
recall: 0.4997248395244404 

train_accuracy: 1.0 
test_accuracy: 0.5006578691128991
Wall time: 50.4 s


In [34]:
#для данных по гостиницам
hotel = pd.read_csv("hotel_clean.csv")
hotel.head()

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,adults,babies,country,market_segment,is_repeated_guest,previous_cancellations,booking_changes,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,0,1,0,342,2,0,135,3,0,0,3,0,2,0,0
1,1,1,0,737,2,0,135,3,0,0,4,0,2,0,0
2,2,1,0,7,1,0,59,3,0,0,0,0,2,0,0
3,3,1,0,13,1,0,59,2,0,0,0,0,2,0,0
4,4,1,0,14,2,0,59,6,0,0,0,0,2,0,1


In [19]:
hotel = hotel.drop('Unnamed: 0', axis = 1)
hotel.head()

Unnamed: 0,hotel,is_canceled,lead_time,adults,babies,country,market_segment,is_repeated_guest,previous_cancellations,booking_changes,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,1,0,342,2,0,135,3,0,0,3,0,2,0,0
1,1,0,737,2,0,135,3,0,0,4,0,2,0,0
2,1,0,7,1,0,59,3,0,0,0,0,2,0,0
3,1,0,13,1,0,59,2,0,0,0,0,2,0,0
4,1,0,14,2,0,59,6,0,0,0,0,2,0,1


In [20]:
X, y = hotel.drop(["is_canceled"], axis=1), hotel["is_canceled"] 

In [21]:
%%time
print_metrics(LogRegression(), X, y)

precision: 0.640674028505252 
recall: 0.5891531724414036 

train_accuracy: 0.5013140716297066 
test_accuracy: 0.5013121224020322
Wall time: 5.89 s


In [22]:
%%time
print_metrics(LogisticRegression(), X, y)

precision: 0.7567563502297128 
recall: 0.7321113879955996 

train_accuracy: 0.7674135019113861 
test_accuracy: 0.7675144372664086
Wall time: 4.18 s


In [23]:
%%time
print_metrics(KNeighborsClassifier(n_neighbors = 4), X, y)

precision: 0.8108862173494711 
recall: 0.7835592151240418 

train_accuracy: 0.8700169026717237 
test_accuracy: 0.8143176859603626
Wall time: 39.7 s


In [35]:
%%time
print_metrics(KNN(neighbors = 4), X, y)

precision: 0.8146185759911052 
recall: 0.7860905332540848 

train_accuracy: 0.872113168395899 
test_accuracy: 0.8171098917798728
Wall time: 14h 59min 44s


In [29]:
%%time
print_metrics(DecisionTree(maxDepth = 5), X, y)

precision: 0.7944830632848223 
recall: 0.7768389637770171 

train_accuracy: 0.8036933815142433 
test_accuracy: 0.8034684073471239
Wall time: 3min 30s


In [26]:
%%time
print_metrics(DecisionTreeClassifier(max_depth = 5), X, y)

precision: 0.7944993331431343 
recall: 0.7768616087045535 

train_accuracy: 0.8036933815142433 
test_accuracy: 0.8034852274976642
Wall time: 637 ms


In [27]:
%%time
print_metrics(SVM(), X, y)

precision: 0.314313468367737 
recall: 0.5 

train_accuracy: 0.6286269364853627 
test_accuracy: 0.628626936735474
Wall time: 11min 18s


In [36]:
%%time
print_metrics(SVC(decision_function_shape = 'ovr'), X, y)

precision: 0.8215742913998507 
recall: 0.7867830078141742 

train_accuracy: 0.846217894342443 
test_accuracy: 0.8203983104307321
Wall time: 1h 52min 17s
