In [3]:
# library 😚

import sklearn.datasets as datasets
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from sklearn.externals.six import StringIO  
# from IPython.display import Image  
from IPython.display import Image, display
from sklearn.tree import export_graphviz
# import pydotplus

from scipy.io.arff import loadarff

import numpy as np
from sklearn import metrics

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import collections


In [4]:
# config 😚
path_to_datasets = '/home/farzad/Desktop/semiWithTree/originDataset/'
dataset_name = 'bupa'


dataset_path  = path_to_datasets + dataset_name
base_classifier = DecisionTreeClassifier
random_state = 0
min_samples_leaf=3

In [1337]:
def divide_xy(test_data):
    # assert : class = last atr 😚
    x_test = test_data.values[:, 0:-1]
    y_test = (test_data.values[:, -1]).astype('int')
    
    return x_test,y_test

def read_data(dataset_path) :
    
    train_raw_data = loadarff(dataset_path+'/train.arff')
    test_raw_data = loadarff(dataset_path+'/test.arff')
    
    train_data = pd.DataFrame(train_raw_data[0])
    test_data = pd.DataFrame(test_raw_data[0])
    
    train_data['Class'] = train_data['Class'].astype(int)
    test_data['Class'] = test_data['Class'].astype(int)
    
    return train_data,test_data


def get_rate_p(train_y) : 
    
    counter=collections.Counter(train_y)
    tuple_list_pn = counter.most_common()
    
    return tuple_list_pn[0][1]/(tuple_list_pn[0][1]+tuple_list_pn[1][1]) , tuple_list_pn

def split_trainset(train_data) :
    
    labeled , unlabeled = [],[]
    
    size_dataset = len(train_data)
    train_x,train_y = divide_xy(train_data)
    
    rate_p , tuple_list_pn = get_rate_p(train_y)
    
    size_labeled_data = round(0.1 * size_dataset)
    size_unlabeled_data = size_dataset - size_labeled_data
    
    size_labeled_p_data = round(rate_p*size_labeled_data)
    size_labeled_n_data = size_labeled_data - size_labeled_p_data
    
    labeled_index = []
    unlabeled_index = []
    selected_pl = 0
    selected_nl = 0
    
    for i,cls in enumerate(train_y):
        # if data point class's == 0 😚
        if cls == tuple_list_pn[0][0] :
            if selected_pl < size_labeled_p_data :
                labeled_index.append(i)
                selected_pl+=1
            else :
                unlabeled_index.append(i)
        else :
            if selected_nl < size_labeled_n_data :
                labeled_index.append(i)
                selected_nl+=1
            else :
                unlabeled_index.append(i)
                
    for i in labeled_index:
        labeled.append(train_data.values[i])
    
    for i in unlabeled_index:
        unlabeled.append(train_data.values[i])
    
#     print(size_dataset , size_labeled_data , size_unlabeled_data)
#     print(rate_p , tuple_list_pn)
#     print(size_labeled_p_data , size_labeled_n_data)
#     print(selected_pl/(selected_pl+selected_nl),selected_pl, selected_nl)
    
    
    return pd.DataFrame(labeled,columns=train_data.columns),pd.DataFrame(unlabeled,columns=train_data.columns),rate_p,tuple_list_pn

def evaluate_classifier(base_classifier, labeled_data, test_data):
    
    labeled_x,labeled_y = divide_xy(labeled_data)
    test_x,test_y = divide_xy(test_data)

    # dtree = DecisionTreeClassifier(criterion="entropy", max_depth=3)
    dtree=base_classifier(random_state = random_state, min_samples_leaf=min_samples_leaf)
    dtree.fit(labeled_x,labeled_y)
    
    dot_data = StringIO()
    export_graphviz(dtree, out_file=dot_data,filled=True, rounded=True,special_characters=True)
#     graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#     img = Image(graph.create_png())
    img=None
    y_pred = dtree.predict(test_x)
    
    accuracy = metrics.accuracy_score(test_y, y_pred)
    
    return accuracy , img


In [1341]:
from nonconformist.base import ClassifierAdapter
from nonconformist.cp import TcpClassifier
from nonconformist.nc import ClassifierNc, MarginErrFunc

def confidency(name , DTclassifier , labeled_data , unlabeled_data , i , confidence) :

    lbl = None
    test_x,test_y = divide_xy(unlabeled_data)
    train_x,train_y = divide_xy(labeled_data)
    is_confident = False
    if name == 'Probability' :
        lbl = DTclassifier.predict([test_x[i]])
        i_confidence = DTclassifier.predict_proba([test_x[i]])
        if max(i_confidence[0]) > confidence :
            is_confident = True
            
            
    elif name == 'tcp' :
        model = ClassifierAdapter(base_classifier(random_state = random_state, min_samples_leaf=min_samples_leaf))
        nc = ClassifierNc(model, MarginErrFunc())
        tcp = TcpClassifier(nc)
        tcp.fit(train_x,train_y)
        
        prediction_conf = tcp.predict_conf(test_x[[i], :])
        lbl = [prediction_conf[0][0]]
        
        
        ss=0
        me=0
        for i in range(100) :
            prediction = tcp.predict(test_x[[i], :])
            me += abs(prediction[0][0]-prediction[0][1]) * max(prediction[0][0],prediction[0][1])

            prediction = tcp.predict_conf(test_x[[i], :])
            ss += prediction[0][1]*prediction[0][2]
        print(ss-me , end=" , ")
        
        if ss-me > confidence :
            is_confident = True
        
    return is_confident , lbl


def selection_metric(labeled_data,unlabeled_data ,rate_p,tuple_list_pn , confidence,selection_rate , confidence_method_name) :
    
    labeled_x,labeled_y = divide_xy(labeled_data)
    unlabeled_x,unlabeled_y = divide_xy(unlabeled_data)

    # dtree = DecisionTreeClassifier(criterion="entropy", max_depth=3)
    DTclassifier = base_classifier(random_state = random_state, min_samples_leaf=min_samples_leaf)
    DTclassifier.fit(labeled_x,labeled_y)
    
    removed_selected_data = unlabeled_data.copy()
    total_selected_labeling = pd.DataFrame(columns=labeled_data.columns)
    selected_labeling = pd.DataFrame(columns=labeled_data.columns)
    selected_index = []
    selected_y = []
    
    
    for i  in range(len(removed_selected_data)) :
        is_confident , lbl = confidency(confidence_method_name , DTclassifier , labeled_data ,
                                        removed_selected_data , i , confidence)
        if is_confident: 
            selected_index.append(i)
            selected_y.append(lbl[0])
            # set class
            removed_selected_data.at[i, 'Class'] = lbl[0]
            
    
    selected_index_p = []
    selected_index_n = []
    
    
    size_selected  = round(selection_rate * len(labeled_data))
    print(' PISH FARZ  size_selected : ', size_selected)
    
    #should be constant rate 
    new_rate_p,new_tuple_list_pn = get_rate_p(np.array(selected_y))
        
    len_new_selected_p = new_tuple_list_pn[0][1]
    len_new_selected_n = new_tuple_list_pn[1][1]
    
    len_lebeled_p = tuple_list_pn[0][1]
    len_lebeled_n = tuple_list_pn[1][1]

    size_select_p = 0
    size_select_n = 0
    
    
    print('rate_p:',rate_p , '  new_rate_p:',new_rate_p)
    print('tuple_list_pn:',tuple_list_pn , '  new_tuple_list_pn:',new_tuple_list_pn)
    
    if new_rate_p > rate_p :
        size_select_n = round(min(len_new_selected_n , size_selected * (1-rate_p)))
        size_select_p = round(size_select_n * (rate_p/(1-rate_p)))
        size_selected = size_select_n + size_select_p
            
    else :
        size_select_p = round(min(len_new_selected_p , size_selected * rate_p))
        size_select_n = round(size_select_p * ((1-rate_p)/rate_p))
        size_selected = size_select_n + size_select_p

            
        
    print('size_select_p : ' , size_select_p , '   size_select_n : ' , size_select_n, '   size_selected : ' , size_selected)
    
    
    p = new_tuple_list_pn[0][0]
    
    i=0
    while(size_select_p > 0):
        if selected_y[i] == p :
            selected_index_p.append(i)      
            size_select_p-=1
        i+=1
                
    i=0
    while(size_select_n > 0):
        if selected_y[i] != p :
            selected_index_n.append(i)  
            size_select_n-=1
        i+=1
        
    print('selected_index_p : ',len(selected_index_p))
    print('selected_index_n : ',len(selected_index_n))
    
    
    for i in range(len(selected_index_p)):
        selected_labeling=selected_labeling.append(removed_selected_data.iloc[selected_index_p[i]] ,ignore_index=True)
    print('selected_labeling_p: ', len(selected_labeling))
        
    for i in range(len(selected_index_n)):
        selected_labeling=selected_labeling.append(removed_selected_data.iloc[selected_index_n[i]] ,ignore_index=True)
    print('selected_labeling_n: ', len(selected_labeling))

    removed_selected_data.drop(removed_selected_data.index[selected_index])
    
    total_selected_labeling = pd.concat([labeled_data ,selected_labeling],ignore_index=True)
    
    return total_selected_labeling,removed_selected_data


def self_labeling(labeled_data , unlabeled_data , iteration , rate_p,tuple_list_pn , confidence,selection_rate,confidence_method_name):

    
    labeled_unlabel_data = labeled_data.copy()
    removed_selected_data = unlabeled_data.copy()
    
    while iteration:
        
        selected_labeling,removed_selected_data = selection_metric(labeled_unlabel_data,removed_selected_data,
                                                                   rate_p,tuple_list_pn ,
                                                                   confidence,selection_rate,
                                                                   confidence_method_name)
        labeled_unlabel_data = pd.concat([labeled_unlabel_data , selected_labeling])
        
        print('iteration:' , iteration , ' , selected_labeling:' , len(selected_labeling)
             , ' , labeled_data:' , len(labeled_data))
        iteration-=1
        
    return labeled_unlabel_data
        
        

In [1342]:

train_data,test_data = read_data(dataset_path)
train_x,train_y = divide_xy(train_data)
test_x , test_y = divide_xy(test_data)

labeled_data,unlabeled_data , rate_p,tuple_list_pn = split_trainset(train_data)






a1 , img1 = evaluate_classifier(base_classifier , labeled_data  , test_data)

total_labeled_data = self_labeling(labeled_data , unlabeled_data , 1, 
                                   rate_p,tuple_list_pn ,
                                   confidence=10,selection_rate = 1,confidence_method_name='tcp')

a2 , img2 = evaluate_classifier(base_classifier , total_labeled_data  , test_data)


# display(img1)
# display(img2)
# print(a1 , a2)


16.759595445975734 , 19.15962016099298 , 18.47870384329148 , 15.130972764232087 , 16.19457130354224 , 16.99218130938598 , 19.612514411276763 , 16.068450490767347 , 17.305954024164073 , 15.487665992969845 , 17.829226109148912 , 17.987574681212084 , 17.35996995863728 , 14.880577951999573 , 18.315887312238097 , 19.41950562947762 , 19.305547432857843 , 17.795967628185835 , 18.351986928117668 , 16.800088269654108 , 13.94363499397836 , 16.60642637099109 , 19.426793685509605 , 14.701116807399629 , 19.426496817756625 , 15.677736861094377 , 17.82007020212993 , 17.276754836631902 , 16.181567482720546 , 17.115172510348604 , 21.070422468895732 , 17.429223328473334 , 16.919093442304977 , 18.026155240474452 , 16.813121741094246 , 17.074049287780362 , 17.1459477803467 , 17.80994865288608 , 21.584498386955794 , 15.005894494649436 , 17.083072004950914 , 18.774798031992848 , 14.753496211935115 , 14.611500895847833 , 15.949660890085482 , 15.91258671502785 , 17.370570637653934 , 18.540862694129338 , 17.56

In [None]:
from nonconformist.cp import TcpClassifier
from nonconformist.nc import NcFactory


model = base_classifier(random_state = random_state, min_samples_leaf=min_samples_leaf)	# Create the underlying model
nc = NcFactory.create_nc(model)	# Create a default nonconformity function
tcp = TcpClassifier(nc)			# Create a transductive conformal classifier

tcp.fit(train_x,train_y)
prediction = tcp.predict(test_x[[1], :], significance=0.9)
prediction = tcp.predict_conf(test_x[[1], :])

prediction

# test_y[1]

In [1332]:
from nonconformist.base import ClassifierAdapter
from nonconformist.cp import TcpClassifier
from nonconformist.nc import ClassifierNc, MarginErrFunc

model = ClassifierAdapter(base_classifier(random_state = random_state, min_samples_leaf=min_samples_leaf))
nc = ClassifierNc(model, MarginErrFunc())
tcp = TcpClassifier(nc)
tcp.fit(train_x,train_y)

sss = 2

print()
ss=0
me=0
for i in range(100) :
    prediction = tcp.predict(test_x[[sss], :])
    me += abs(prediction[0][0]-prediction[0][1]) * max(prediction[0][0],prediction[0][1])
    
    prediction = tcp.predict_conf(test_x[[sss], :])
    ss += prediction[0][1]*prediction[0][2]
print(ss,me ,ss-me, test_y[sss])
# 


60.273926880379086 57.8819599395923 2.391966940786787 1
