# LazyFCA

Домашнее задание по теории решеток

Датасет tic-tac-toe

Андрей Упшинский, М05-895в

# Описание

Для оценки качества используется KFold (разбиение уже сделано в источнике датасета)

Сначала преобразуем признаки в бинарные, используя OneHotEncoding

In [64]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder


def preprocess(data):
    """Turn a raw data frame into one-hot features and a boolean target.

    Columns ``V1`` .. ``V9`` are one-hot encoded into boolean indicator
    columns; the target is True where column ``V10`` equals ``'positive'``.

    Note: the encoder is fitted on ``data`` itself, so the layout of the
    resulting columns depends on the categories present in this frame.

    Returns:
        A pair ``(X, y)``: ``X`` is the dense encoded feature array and
        ``y`` is the element-wise result of ``data.V10 == 'positive'``.
    """
    feature_columns = ['V{}'.format(i) for i in range(1, 10)]

    one_hot = OneHotEncoder(dtype='bool')
    encoded = one_hot.fit_transform(data[feature_columns])
    X = encoded.toarray()  # densify the encoder output

    y = (data.V10 == 'positive')

    return X, y


def read_sample(path):
    """Read the CSV file at ``path`` and return ``preprocess`` applied to it."""
    return preprocess(pd.read_csv(path))

Попробуем стандартные методы классификации из sklearn (`LogisticRegression`, `RandomForestClassifier`) и `XGBClassifier`

In [90]:
def test_model(model, folds):
    """Fit and score ``model`` on every fold.

    Args:
        model: object exposing ``fit(X, y)`` and ``predict(X)``.
        folds: iterable of ``(X_train, X_test, y_train, y_test)`` tuples.

    Returns:
        Three lists ``(accuracy, precision, recall)`` holding one score per
        fold, in fold order.
    """
    history = {'accuracy': [], 'precision': [], 'recall': []}

    for X_train, X_test, y_train, y_test in folds:
        # fit() and predict() are called separately on purpose: the return
        # value of fit() is never used.
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        history['accuracy'].append(accuracy_score(y_test, predicted))
        history['precision'].append(precision_score(y_test, predicted))
        history['recall'].append(recall_score(y_test, predicted))

    return history['accuracy'], history['precision'], history['recall']

In [91]:
# Load the ten train/test splits stored as data/train<k>.csv and
# data/test<k>.csv; each fold is kept as (X_train, X_test, y_train, y_test).
folds = []

for fold_no in range(1, 11):
    train_X, train_y = read_sample('data/train{}.csv'.format(fold_no))
    test_X, test_y = read_sample('data/test{}.csv'.format(fold_no))
    folds.append((train_X, test_X, train_y, test_y))

In [109]:
from sklearn.linear_model import LogisticRegression


# Baseline: logistic regression scored on every fold; the per-fold scores
# are averaged before printing.
baseline = LogisticRegression(solver='lbfgs')
accuracy, precision, recall = test_model(baseline, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

precision: 0.975
recall: 1.000
accuracy: 0.983


In [110]:
from sklearn.ensemble import RandomForestClassifier


# Baseline: random forest with a fixed seed, scored on every fold; the
# per-fold scores are averaged before printing.
baseline = RandomForestClassifier(n_estimators=100, random_state=501)
accuracy, precision, recall = test_model(baseline, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

precision: 0.987
recall: 0.997
accuracy: 0.990


In [111]:
from xgboost import XGBClassifier

# Baseline: gradient boosting (XGBClassifier) scored on every fold; the
# per-fold scores are averaged before printing.
baseline = XGBClassifier(max_depth=5)
accuracy, precision, recall = test_model(baseline, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

precision: 0.984
recall: 0.995
accuracy: 0.986


Выглядит крайне многообещающе: лучшая accuracy — $0.990$ у `RandomForestClassifier`.

Попробуем хотя бы приблизиться. Будем извлекать признаки на основе генераторов, а затем предсказывать по ним ответ с помощью внутреннего классификатора (ниже пробуем `XGBClassifier`, `RandomForestClassifier` и `LogisticRegression`).

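Признаки (здесь $g'$ — описание классифицируемого объекта, $g_i^+$ и $g_i^-$ — описания положительных и отрицательных примеров контекста, а $(\cdot)^+$ и $(\cdot)^-$ — множества положительных и отрицательных примеров, содержащих данное пересечение):
средние по $i$ значения $|g'\cap g_i^+|$, $|g'\cap g_i^-|$,
        $|(g'\cap g_i^+)^+|$, $|(g'\cap g_i^+)^-|$, $|(g'\cap g_i^-)^-|$, $|(g'\cap g_i^-)^+|$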

In [179]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def context_split(X, y):
    """Split the rows of ``X`` into positive and negative contexts.

    Args:
        X: array of object descriptions, one row per object.
        y: labels, one per row of ``X``; truthy means positive.

    Returns:
        A pair ``(positive, negative)`` with the rows of ``X`` whose label is
        truthy and falsy respectively.
    """
    # Coerce to a boolean mask first: indexing with integer 0/1 labels would
    # select rows 0 and 1 by position, and ``~`` would be a bitwise NOT on
    # integers (and is not defined for plain lists at all).
    mask = np.asarray(y, dtype=bool)
    return X[mask], X[~mask]


def avg_intersect(sample, context):
    """Return the mean size of the intersection of ``sample`` with a context row.

    The element-wise ``&`` of ``sample`` with ``context`` is summed over all
    entries and divided by the number of rows in ``context``.
    """
    overlap = context & sample
    shared_total = np.sum(overlap)
    return shared_total / len(context)


def avg_clojure(sample, context, context_clojure):
    """Return the mean support of the intersections of ``sample`` with ``context``.

    For every row of ``context & sample`` this counts the rows of
    ``context_clojure`` that contain all attributes set in that intersection,
    and returns the mean of those counts.
    """
    support_sizes = []
    for common in context & sample:
        required = np.sum(common)
        # Number of the intersection's attributes present in each candidate row.
        matched = np.sum(context_clojure & common, axis=1)
        support_sizes.append((matched == required).sum())
    return np.array(support_sizes).mean()


class Classifier:
    """Classifier that feeds context-based features to an inner classifier.

    On ``fit`` the training data is split in two parts: one part is stored as
    the positive/negative contexts, the other part is converted to features
    against those contexts and used to train the wrapped ``classifier``.

    Args:
        classifier: inner model exposing ``fit(X, y)`` and ``predict(X)``.
        classifier_train_part: passed as ``test_size`` to
            ``train_test_split``; the share of the training data used to fit
            the inner classifier. The remainder forms the contexts.
        random_state: passed to ``train_test_split``.
    """

    def __init__(self, classifier, classifier_train_part=0.5, random_state=None):
        self.random_state = random_state
        self.classifier = classifier
        self.classifier_train_part = classifier_train_part

    def get_features(self, X):
        """Build the feature matrix for the rows of ``X``.

        Each row yields six values: the average intersection size with the
        positive and with the negative context, followed by four average
        supports from ``avg_clojure`` (positive intersections in the negative
        and positive contexts, then negative intersections in the positive
        and negative contexts).

        Returns:
            A float array of shape ``(len(X), 6)``; an empty ``X`` yields
            shape ``(0, 6)``.
        """
        features = [
            (avg_intersect(x, self.positive),
             avg_intersect(x, self.negative),
             avg_clojure(x, self.positive, self.negative),
             avg_clojure(x, self.positive, self.positive),
             avg_clojure(x, self.negative, self.positive),
             avg_clojure(x, self.negative, self.negative))
            for x in X
        ]

        # The reshape keeps the result two-dimensional even when X is empty,
        # so the inner classifier always receives a (n_samples, 6) matrix.
        return np.array(features, dtype=float).reshape(-1, 6)

    def fit(self, X, y):
        """Store the contexts and train the inner classifier.

        Returns:
            ``self``, so that calls can be chained.
        """
        # The first part of the split becomes the contexts; the second part,
        # whose size is classifier_train_part, trains the inner classifier.
        X_true, X_train, y_true, y_train = train_test_split(
            X, y,
            test_size=self.classifier_train_part,
            random_state=self.random_state)

        self.positive, self.negative = context_split(X_true, y_true)

        features = self.get_features(X_train)
        self.classifier.fit(features, y_train)
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the inner classifier on context features."""
        features = self.get_features(X)
        return self.classifier.predict(features)

In [180]:
# Context-feature classifier with XGBClassifier inside; 20% of each training
# fold trains the inner model. Per-fold scores are averaged before printing.
lazy_model = Classifier(
    random_state=501,
    classifier=XGBClassifier(n_estimators=100, max_depth=5),
    classifier_train_part=0.2,
)
accuracy, precision, recall = test_model(lazy_model, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

precision: 0.907
recall: 0.974
accuracy: 0.917


In [None]:
# Context-feature classifier with a random forest inside; 20% of each training
# fold trains the inner model. Per-fold scores are averaged before printing.
lazy_model = Classifier(
    random_state=501,
    classifier=RandomForestClassifier(n_estimators=100),
    classifier_train_part=0.2,
)
accuracy, precision, recall = test_model(lazy_model, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

In [175]:
# Context-feature classifier with logistic regression inside; 20% of each
# training fold trains the inner model. Per-fold scores are averaged before
# printing.
lazy_model = Classifier(
    random_state=501,
    classifier=LogisticRegression(solver='lbfgs', max_iter=300),
    classifier_train_part=0.2,
)
accuracy, precision, recall = test_model(lazy_model, folds)
for label, per_fold in (("precision:", precision),
                        ("recall:", recall),
                        ("accuracy:", accuracy)):
    print(label, format(np.mean(per_fold), '.3f'))

precision: 0.821
recall: 0.918
accuracy: 0.816


Как видно, с помощью внутреннего классификатора (`XGBClassifier`), обученного на признаках на основе генераторов, удалось получить accuracy около $0.917$ (precision $0.907$, recall $0.974$).

Для улучшения имеет смысл попытаться сделать ансамбль внутренних классификаторов на нескольких разбиениях обучающей выборки, так как сейчас классификатор учится на достаточно малом числе примеров из-за необходимости оставлять обширную выборку для получения признаков