In [461]:
# importing some necessary libraries
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from importlib import reload
from sklearn.model_selection import train_test_split
import random

For my FCA analysis, I use tic-tac-toe dataset. It consists of 9 variables and 1 target value. I already have train and test dataframes

In [462]:
df_train = pd.read_csv('train1.csv'.format(10))
df_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive


In [463]:
df_train["V10"]

0      positive
1      positive
2      positive
3      positive
4      positive
         ...   
860    negative
861    negative
862    negative
863    negative
864    negative
Name: V10, Length: 865, dtype: object

In [464]:
df_test = pd.read_csv('test1.csv'.format(10))
df_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,o,x,o,positive
1,x,x,x,x,o,b,o,b,o,positive
2,x,x,x,o,o,x,o,x,o,positive
3,x,x,x,o,o,b,x,o,b,positive
4,x,x,x,b,o,b,o,o,x,positive


In [465]:
df_train.shape

(865, 10)

First, we need to scale out data. I want to replace x in columns with 1 and o with 0.

In [466]:
df_train = df_train.applymap(lambda el: 1 if el in ["x", "positive"] else 0)
df_test = df_test.applymap(lambda el: 1 if el in ["x", "positive"] else 0)

In [467]:
df_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,1,1,1,1,0,0,1,0,0,1
1,1,1,1,1,0,0,0,0,1,1
2,1,1,1,1,0,0,0,0,0,1
3,1,1,1,1,0,0,0,0,0,1
4,1,1,1,1,0,0,0,0,0,1


In [468]:
df_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,1,1,1,1,0,0,0,1,0,1
1,1,1,1,1,0,0,0,0,0,1
2,1,1,1,0,0,1,0,1,0,1
3,1,1,1,0,0,0,1,0,0,1
4,1,1,1,0,0,0,0,0,1,1


## FCA

In [469]:
def lazy_fca(df_train, df_test):
     
    df_train = df_train.to_numpy()
    df_test = df_test.to_numpy()
    
# our metrics
    tp = 0
    fp = 0    
    tn = 0
    fn = 0
    
    tp_list = []
    fp_list = []
    tn_list = []
    fn_list = []

    
# spliting data into two parts: C+ - context with "+" examples, C- - context with "-" examples:

    plus_context_train = df_train[df_train[:, -1] == 1][:, :-1] 
    minus_context_train = df_train[df_train[:, -1] == 0][:, :-1]
    plus_context_test = df_test[df_test[:, -1] == 1][:, :-1] 
    minus_context_test = df_test[df_test[:, -1] == 0][:, :-1]

# intersections of train and test data  
    for j in plus_context_test:
        if sum(map(lambda i: (j * i).sum() / plus_context_train.shape[0], plus_context_train)) > sum(map(lambda i: (j * i).sum() / minus_context_train.shape[0], minus_context_train)):
            tp += 1
        else:
            fp += 1
            
    tp_list.append(tp)
    fp_list.append(fp)
    
    
    for j in minus_context_test:
        if sum(map(lambda i: (j * i).sum() / minus_context_train.shape[0], minus_context_train)) > sum(map(lambda i: (j * i).sum() / plus_context_train.shape[0], plus_context_train)):
            tn += 1
        else:
            fn += 1
# appending results to our metrics

    tn_list.append(tn)
    fn_list.append(fn)
    
    TP = sum(tp_list)
    FP = sum(fp_list)
    TN = sum(tn_list)
    FN = sum(fn_list)

# metrics calculations, according to their formulas

    print (
        "True positive:", TP ,"\n" 
        "True Negative:", TN,"\n"
        "False Positive:", FP,"\n"
        "False Negative:", FN,"\n"
        "True Positive Rate:", TP/(TP + FN),"\n"
        "True Negative Rate:", TN/(TN + FP),"\n"
        "Negative Predictive Value:", TN/(TN + FN),"\n"
        "False Positive Rate:",FP/(FP + TN),"\n"
        "False Discovery Rate:",FP/(TP + FP),"\n"
        "Accuracy:" ,(TP+TN)/(TP + FP + TN + FN),"\n"
        "Precision:" ,TP/(TP + FP),"\n"
        "Recall:" ,TP/(TP + FN))

In [470]:
lazy_fca(df_train, df_test)

True positive: 47 
True Negative: 15 
False Positive: 14 
False Negative: 17 
True Positive Rate: 0.734375 
True Negative Rate: 0.5172413793103449 
Negative Predictive Value: 0.46875 
False Positive Rate: 0.4827586206896552 
False Discovery Rate: 0.22950819672131148 
Accuracy: 0.6666666666666666 
Precision: 0.7704918032786885 
Recall: 0.734375


## Comparing with KNN algorithm (state of the art algorithm)

In [473]:
from sklearn.neighbors import KNeighborsClassifier
def knn_alg(df_train, df_test):
    
    clf = KNeighborsClassifier(n_neighbors=5)
    clf = clf.fit(df_train.iloc[:,:-1], df_train.iloc[:,-1])
     
    df_train = df_train.to_numpy()
    df_test = df_test.to_numpy()
    
# our metrics
    tp = 0
    fp = 0    
    tn = 0
    fn = 0
    
    tp_list = []
    fp_list = []
    tn_list = []
    fn_list = []
    


    
# spliting data into two parts: C+ - context with "+" examples, C- - context with "-" examples:

    plus_context_train = df_train[df_train[:, -1] == 1][:, :-1] 
    minus_context_train = df_train[df_train[:, -1] == 0][:, :-1]
    plus_context_test = df_test[df_test[:, -1] == 1][:, :-1] 
    minus_context_test = df_test[df_test[:, -1] == 0][:, :-1]
    
    TP= int(clf.predict(plus_context_test).sum())
    FN=(1 - clf.predict(plus_context_test)).sum()
    TN=(1 - clf.predict(minus_context_test)).sum()
    FP= int(clf.predict(minus_context_test).sum())
    
    print (
        "True positive:", TP ,"\n" 
        "True Negative:", TN,"\n"
        "False Positive:", FP,"\n"
        "False Negative:", FN,"\n"
        "True Positive Rate:", TP/(TP + FN),"\n"
        "True Negative Rate:", TN/(TN + FP),"\n"
        "Negative Predictive Value:", TN/(TN + FN),"\n"
        "False Positive Rate:",FP/(FP + TN),"\n"
        "False Discovery Rate:",FP/(TP + FP),"\n"
        "Accuracy:" ,(TP+TN)/(TP + FP + TN + FN),"\n"
        "Precision:" ,TP/(TP + FP),"\n"
        "Recall:" ,TP/(TP + FN))

In [474]:
knn_alg(df_train, df_test)

True positive: 60 
True Negative: 26 
False Positive: 6 
False Negative: 1 
True Positive Rate: 0.9836065573770492 
True Negative Rate: 0.8125 
Negative Predictive Value: 0.9629629629629629 
False Positive Rate: 0.1875 
False Discovery Rate: 0.09090909090909091 
Accuracy: 0.9247311827956989 
Precision: 0.9090909090909091 
Recall: 0.9836065573770492


As we see k-nearest neighbors algorithm showed much better results