In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import KFold  
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings("ignore")  
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV


# Load the raw data set. The file has no header row, so pandas numbers the
# columns 0, 1, 2, ...
df = pd.read_csv('wdbc.csv', header = None)


# Binary target taken from column 0: 'B' is encoded as 0 and every other value
# as 1. Built as a new integer Series in a single vectorized step, so no
# element-wise writes into a slice of df are needed.
output = (df.iloc[:, 0] != 'B').astype(int)



# Every column after the label column is a predictor.
features = df.iloc[:,1:]


# NOTE(review): with its default arguments sklearn's normalize rescales each
# row (sample) to unit L2 norm rather than rescaling each column (feature) --
# confirm that per-sample scaling is what is wanted here.
features_normal = normalize(features)
features_normal_df = pd.DataFrame(features_normal)


# Predictors and target side by side; predictors are named X1..Xk and the
# target is named y. The names are generated from the actual number of
# predictor columns instead of being typed out by hand.
norm_whole = pd.concat([features_normal_df,output],axis=1)
n_features = features_normal_df.shape[1]
norm_whole.columns = ['X' + str(k) for k in range(1, n_features + 1)] + ['y']


# Group the rows by class (all y == 0 rows first) and renumber them from 0.
norm_whole = norm_whole.sort_values(by=['y']).reset_index(drop=True)


# Per-class tables, selected by the value of y rather than by fixed row
# offsets so the split stays correct whatever the class counts are.
# Both tables end up indexed from 0.
is_class_1 = norm_whole['y'] == 1
X_y_0 = norm_whole[~is_class_1]
X_y_1 = norm_whole[is_class_1].reset_index(drop=True)


def _new_results_frame(column_label):
    """Return an empty one-column DataFrame with rows labelled 1..30.

    One row is reserved per Monte Carlo iteration; the cells start out empty
    (NaN) and are filled in positionally by the simulation loop.
    """
    return pd.DataFrame(index=np.arange(1,31), columns=[column_label])


# Penalty chosen by cross-validation in each iteration.
C_selected = _new_results_frame('Selected Penalty')


# Metrics measured on the training data.
Precision_Train = _new_results_frame('Precision Train')
Recall_Train = _new_results_frame('Recall Train')
F1_Train = _new_results_frame('F1 Train')
Accuracy_Train = _new_results_frame('Accuracy Train')
AUC_Train = _new_results_frame('AUC Train')
Train_Error = _new_results_frame('Train Error for best selected C')


# Metrics measured on the held-out test data.
Precision_Test = _new_results_frame('Precision Test')
Recall_Test = _new_results_frame('Recall Test')
F1_Test = _new_results_frame('F1 Test')
Accuracy_Test = _new_results_frame('Accuracy Test')
AUC_Test = _new_results_frame('AUC Test')
Test_Error = _new_results_frame('Test Error for best selected C')

# Test_Error_Estimated (as defined previously) is not meaningful here.
# The explanation is a bit lengthy: min_C is selected by cross-validation over the labeled training data only and, once selected,
# is used to label the unlabeled training data, so the estimated test error is only an estimate over the labeled training data

# to get the test error estimate CV (5-fold/K-Fold) over final obtained X_train and y_train using selected min_C must be done
# this for each monte-carlo iteration becomes computationally intensive, and thus hasn't been performed

# additionally the most ideal way of selecting C would be cross-validating C over both labeled and unlabeled train
# the question in HW is segmented, also this is computationally intensive
# thus hasn't been done

# In this method we'd obtain the final X_train and y_train from the labeled and unlabeled training data for each C;
# then, for that X_train and y_train and the C for which they were obtained, we'd cross-validate again internally to obtain
# the test error estimate.
# Thus there would be two cross-validation loops: the first to pick C and the second to pick the C corresponding to the lowest
# test error estimate.
# Note all this would be inside a monte-carlo loop
# Also the while loop, other than the two for loops (for CV), for generating X_train and y_train, can't be neglected
# This will enormously increase the computation time
# Additionally there may be other methods/ways also to CV over penalty in the current case
# We've picked the least computationally intensive segmented approach as deliberated in the HW

# CONCLUSION: Test Error Estimation, may be performed, differently than performed in Supervised learning, either in a sense
# absolutely true to selection of C or in an approximate, segmented sense, both requiring much longer computational times
# than the computational time already required for the Monte-Carlo simulation, in general, and thus the minimal approach has been 
# utilized here



# ---------------------------------------------------------------------------
# Monte Carlo simulation of semi-supervised (self-training) L1-penalized SVM.
#
# Each iteration:
#   1. draws a fresh per-class 80/20 train/test split and keeps half of the
#      training rows of each class as "labeled", the other half as "unlabeled";
#   2. selects the penalty C by 5-fold cross-validation on the labeled rows;
#   3. self-trains: repeatedly fits on the current training set and moves the
#      unlabeled row with the largest |decision_function| into the training
#      set together with its predicted label, until no unlabeled rows remain;
#   4. fits the final classifier and records error, precision, recall,
#      accuracy, F1 and AUC for both the training and the test data.
#
# NOTE: no random_state is passed to train_test_split, shuffle or KFold, so
# every run of this cell produces different splits and different numbers.
# ---------------------------------------------------------------------------


def _error_percent(y_true, y_pred):
    """Return the percentage of positions at which y_pred differs from y_true."""
    return float(np.mean(np.asarray(y_true) != np.asarray(y_pred)) * 100)


def _metrics_from_confusion(cm):
    """Return (precision %, recall %, accuracy %, F1) from a 2x2 confusion matrix.

    cm is laid out as [[TN, FP], [FN, TP]], i.e. class 1 is the positive class.
    Precision, recall and accuracy are percentages; F1 is computed from the
    fractional precision and recall and is therefore on a 0-1 scale.
    """
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]
    precision = (tp / (tp + fp)) * 100
    recall = (tp / (tp + fn)) * 100
    accuracy = ((tn + tp) / (tn + tp + fp + fn)) * 100
    precision_frac = precision / 100
    recall_frac = recall / 100
    f1 = (2 * precision_frac * recall_frac) / (precision_frac + recall_frac)
    return precision, recall, accuracy, f1


def _roc_points(y_true, prob_class0, cutoffs):
    """Return a DataFrame of (FPR, TPR) pairs, one row per cut-off.

    A sample is called class 1 when its probability of class 0 is less than
    or equal to the cut-off. FPR is the share of true-0 samples called 1 and
    TPR is the share of true-1 samples called 1.
    """
    y_true = np.asarray(y_true)
    prob_class0 = np.asarray(prob_class0)
    # Rows: cut-offs, columns: samples.
    called_1 = prob_class0[np.newaxis, :] <= cutoffs[:, np.newaxis]
    is_0 = y_true == 0
    is_1 = y_true == 1
    fpr = (called_1 & is_0).sum(axis=1) / is_0.sum()
    tpr = (called_1 & is_1).sum(axis=1) / is_1.sum()
    return pd.DataFrame({'FPR': fpr, 'TPR': tpr}, index=cutoffs)


# Penalty values searched by cross-validation; the same grid is used in every
# Monte Carlo iteration, so it is built once.
C = np.array([10**(-1),10**(0),10**(1),10**(2),10**(3),10**(4),10**(5),10**(6),10**(7)])

# Probability cut-offs (0.00, 0.01, ..., 1.00) used to trace the ROC curves.
thresh = np.arange(0,1.01,0.01)


for mc in range(30):

    print(mc)  # to keep tally of MC iterations

    # ---- 1. random splits --------------------------------------------------
    # Splitting each class separately keeps the class proportions the same in
    # the train and test parts.
    X_y_train_0,X_y_test_0 = train_test_split(X_y_0,test_size = 0.2,shuffle=True)
    X_y_train_1,X_y_test_1 = train_test_split(X_y_1,test_size = 0.2,shuffle=True)

    # Half of each class's training rows are kept labeled (_l); the other half
    # is treated as unlabeled (_u).
    X_y_train_0_l, X_y_train_0_u = train_test_split(X_y_train_0,test_size = 0.5, shuffle=True)
    X_y_train_1_l, X_y_train_1_u = train_test_split(X_y_train_1,test_size = 0.5, shuffle=True)

    X_y_train = pd.concat([X_y_train_0_l,X_y_train_1_l],axis=0)
    X_y_train_u = pd.concat([X_y_train_0_u,X_y_train_1_u],axis=0)
    X_y_test = pd.concat([X_y_test_0,X_y_test_1],axis=0)

    # Mix the two classes and renumber the rows from 0 so that positional and
    # label-based indexing agree from here on.
    X_y_train = shuffle(X_y_train).reset_index(drop=True)
    X_y_train_u = shuffle(X_y_train_u).reset_index(drop=True)
    X_y_test = shuffle(X_y_test).reset_index(drop=True)

    X_train = X_y_train.drop(columns=['y'])
    X_train_u = X_y_train_u.drop(columns=['y'])
    X_test = X_y_test.drop(columns=['y'])
    y_train = X_y_train['y'].astype(int)
    # True labels of the "unlabeled" rows; never shown to the classifier.
    y_train_u = X_y_train_u['y'].astype(int)
    y_test =  X_y_test['y'].astype(int)

    # ---- 2. choose C by 5-fold CV on the labeled training rows --------------
    kf = KFold(n_splits=5,shuffle=True)

    error_df = pd.DataFrame(index = C, columns = ['Estimated Test Errors'])

    for i, penalty in enumerate(C):

        fold_errors = []
        for train,test in kf.split(X_train):
            clf = LinearSVC(penalty='l1',dual=False,C=penalty).fit(
                X_train.iloc[train,:], y_train.iloc[train])
            y_test_cv_pred = clf.predict(X_train.iloc[test,:])
            fold_errors.append(_error_percent(y_train.iloc[test], y_test_cv_pred))

        # Mean misclassification rate (in %) over the folds for this C.
        error_df.iloc[i,0] = np.mean(fold_errors)

    # Penalty with the lowest cross-validated error (first one on ties).
    cv_errors = error_df['Estimated Test Errors'].to_numpy(dtype=float)
    min_C = C[np.argmin(cv_errors)]

    C_selected.iloc[mc,0] = min_C

    # ---- 3. self-training on the unlabeled rows -----------------------------
    while X_train_u.shape[0] > 0:

        clf_u = LinearSVC(penalty='l1',dual=False,C=min_C).fit(X_train,y_train)

        # Pick the unlabeled row with the largest absolute decision-function
        # value under the current classifier.
        X_train_u_distance = np.abs(clf_u.decision_function(X_train_u))
        unlab_samp_selec_indx = int(np.argmax(X_train_u_distance))
        unlab_samp_selec_df = X_train_u.iloc[[unlab_samp_selec_indx]]
        y_unlab_samp_selec = clf_u.predict(unlab_samp_selec_df)[0]

        # Move it into the training set with its *predicted* label ...
        X_train = pd.concat([X_train,unlab_samp_selec_df],axis=0,ignore_index=True)
        y_train.loc[len(y_train)] = y_unlab_samp_selec

        # ... and take it out of the unlabeled pool.
        X_train_u = X_train_u.drop(index=X_train_u.index[unlab_samp_selec_indx])
        X_train_u = X_train_u.reset_index(drop=True)

    # ---- 4. final classifier and its error rates ----------------------------
    # From here on y_train also holds the labels predicted during
    # self-training, so every "train" metric is measured against those labels.
    clf_final = LinearSVC(penalty='l1',dual=False,C=min_C).fit(X_train,y_train)
    y_test_pred = clf_final.predict(X_test)
    y_train_pred = clf_final.predict(X_train)

    test_error = _error_percent(y_test, y_test_pred)
    Test_Error.iloc[mc,0] = test_error

    train_error = _error_percent(y_train, y_train_pred)
    Train_Error.iloc[mc,0] = train_error

    # ---- 5. confusion-matrix based metrics -----------------------------------
    # labels=[0, 1] guarantees a 2x2 matrix even if a class is never predicted.
    confusion_matrix_train = confusion_matrix(y_train,y_train_pred,labels=[0, 1])
    precision_train, recall_train, accuracy_train, f1_train = _metrics_from_confusion(confusion_matrix_train)

    Precision_Train.iloc[mc,0] = precision_train
    Recall_Train.iloc[mc,0] = recall_train
    Accuracy_Train.iloc[mc,0] = accuracy_train
    F1_Train.iloc[mc,0] = f1_train

    confusion_matrix_test = confusion_matrix(y_test,y_test_pred,labels=[0, 1])
    precision_test, recall_test, accuracy_test, f1_test = _metrics_from_confusion(confusion_matrix_test)

    Precision_Test.iloc[mc,0] = precision_test
    Recall_Test.iloc[mc,0] = recall_test
    Accuracy_Test.iloc[mc,0] = accuracy_test
    F1_Test.iloc[mc,0] = f1_test

    # ---- 6. ROC / AUC ---------------------------------------------------------
    # The LinearSVC is wrapped in CalibratedClassifierCV (sigmoid method) so
    # that predict_proba is available for the threshold sweep.
    clf_prob = LinearSVC(penalty='l1',dual=False,C=min_C)
    calibrated_svc = CalibratedClassifierCV(clf_prob,method='sigmoid')
    calibrated_svc.fit(X_train,y_train)

    # Column 0 of predict_proba is the probability of class 0.
    y_train_pred_prob_0 = calibrated_svc.predict_proba(X_train)[:,0]
    ROC_df = _roc_points(y_train, y_train_pred_prob_0, thresh)
    AUC_train = metrics.auc(ROC_df['FPR'],ROC_df['TPR'])

    AUC_Train.iloc[mc,0] = AUC_train

    y_test_pred_prob_0 = calibrated_svc.predict_proba(X_test)[:,0]
    ROC_df_test = _roc_points(y_test, y_test_pred_prob_0, thresh)
    AUC_test = metrics.auc(ROC_df_test['FPR'],ROC_df_test['TPR'])

    AUC_Test.iloc[mc,0] = AUC_test
    
    
    
# Dump every per-iteration results table, each preceded by its heading:
# first the selected penalties, then the training metrics, then the test
# metrics.
for heading, frame in (
    ('The best C, penalty, selected for each Monte Carlo iterations is :\n', C_selected),
    ('The Precision for Train for the Monte Carlo iterations (for best C) is : \n', Precision_Train),
    ('The Recall for Train for the Monte Carlo iterations (for best C) is : \n', Recall_Train),
    ('The Accuracy for Train for the Monte Carlo iterations (for best C) is : \n', Accuracy_Train),
    ('The F1 for Train for the Monte Carlo iterations (for best C) is : \n', F1_Train),
    ('The training error for the Monte Carlo iterations (for best C) is : \n', Train_Error),
    ('The AUC for Train for the Monte Carlo iterations (for best C) is : \n', AUC_Train),
    ('The Precision for Test for the Monte Carlo iterations (for best C) is : \n', Precision_Test),
    ('The Recall for Test for the Monte Carlo iterations (for best C) is : \n', Recall_Test),
    ('The Accuracy for Test for the Monte Carlo iterations (for best C) is : \n', Accuracy_Test),
    ('The F1 for Test for the Monte Carlo iterations (for best C) is : \n', F1_Test),
    ('The test error for the Monte Carlo iterations (for best C) is : \n', Test_Error),
    ('The AUC for Test for the Monte Carlo iterations (for best C) is : \n', AUC_Test),
):
    print(heading, frame)

# Test error estimation not done due to computational limitations and run time constraints. See upper comment explanations
# for detailed description

    




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
The best C, penalty, selected for each Monte Carlo iterations is :
    Selected Penalty
1            100000
2             1e+06
3             1e+07
4             1e+07
5             10000
6             10000
7             1e+07
8            100000
9             1e+06
10           100000
11            1e+06
12           100000
13             1000
14            1e+06
15            10000
16            1e+06
17            10000
18           100000
19            10000
20            1e+07
21            1e+06
22             1000
23            10000
24            10000
25            10000
26            10000
27           100000
28            1e+06
29           100000
30            1e+06
The Precision for Train for the Monte Carlo iterations (for best C) is : 
    Precision Train
1              100
2              100
3              100
4              100
5          98.8571
6          99.4083
7              100
8    

In [2]:
# Average every training metric over the Monte Carlo iterations. Each result
# is a one-element Series (one entry per column of the results frame).
(ovrl_precision_train,
 ovrl_recall_train,
 ovrl_accuracy_train,
 ovrl_F1_train,
 ovrl_auc_train,
 ovrl_train_error) = (
    frame.mean(axis=0)
    for frame in (Precision_Train, Recall_Train, Accuracy_Train,
                  F1_Train, AUC_Train, Train_Error)
)

# Same averages for the test metrics.
(ovrl_precision_test,
 ovrl_recall_test,
 ovrl_accuracy_test,
 ovrl_F1_test,
 ovrl_auc_test,
 ovrl_test_error) = (
    frame.mean(axis=0)
    for frame in (Precision_Test, Recall_Test, Accuracy_Test,
                  F1_Test, AUC_Test, Test_Error)
)

# The penalty is summarized by its mode rather than its mean; mode() returns
# a DataFrame whose first row holds the (first) most frequent value.
ovrl_C_selected = C_selected.mode(axis=0)


print('FOR TRAIN \n')

print('\nThe overall train precision is : \n',ovrl_precision_train.iloc[0],'%')
print('\nThe overall train recall is : \n',ovrl_recall_train.iloc[0],'%')
print('\nThe overall train accuracy is : \n',ovrl_accuracy_train.iloc[0],'%')
print('\nThe overall train F1 is : \n',ovrl_F1_train.iloc[0])
print('\nThe overall train AUC is : \n',ovrl_auc_train.iloc[0])
print('\nThe overall train error is : \n',ovrl_train_error.iloc[0],'%')

print('\nFOR TEST \n')

print('\nThe overall test precision is : \n',ovrl_precision_test.iloc[0],'%')
print('\nThe overall test recall is : \n',ovrl_recall_test.iloc[0],'%')
print('\nThe overall test accuracy is : \n',ovrl_accuracy_test.iloc[0],'%')
print('\nThe overall test F1 is : \n',ovrl_F1_test.iloc[0])
print('\nThe overall test AUC is : \n',ovrl_auc_test.iloc[0])
print('\nThe overall test error is : \n',ovrl_test_error.iloc[0],'%')



print('\nThe overall value of penalty selected is : \n',ovrl_C_selected.iloc[0,0])


print('\nNOTE : All this has been averaged over 30 Monte-Carlo Trials, with train and test selected randomly \n')
print('\nNOTE : For Penalty, instead of average, mode has been used as measure of central tendency over Monte Carlo iterations \n')
print('\nNOTE : All the above values are, obviously, averaged over the best selected C in each case (in each Monte Carlo iteration) \n')
print('\nNOTE : All values except C (penalty), F1 Score, AUC are in %')


FOR TRAIN 


The overall train precision is : 
 99.70514050987488 %

The overall train recall is : 
 99.3680658684955 %

The overall train accuracy is : 
 99.65491923641707 %

The overall train F1 is : 
 0.9953562318215182

The overall train AUC is : 
 0.9997362293288456

The overall train error is : 
 0.3450807635829662 %

FOR TEST 


The overall test precision is : 
 94.28707751196056 %

The overall test recall is : 
 93.10077519379844 %

The overall test accuracy is : 
 95.24637681159422 %

The overall test F1 is : 
 0.9361251191855166

The overall test AUC is : 
 0.9879252799310939

The overall test error is : 
 4.753623188405797 %

The overall value of penalty selected is : 
 10000.0

NOTE : All this has been averaged over 30 Monte-Carlo Trials, with train and test selected randomly 


NOTE : For Penalty, instead of average, mode has been used as measure of central tendency over Monte Carlo iterations 


NOTE : All the above values are, obviously, averaged over the best selected C

Test error estimation was not performed; see the explanation in the code comments above.

All other results for semi-supervised learning are shown above in the relevant dataframes and are subsequently averaged.

Conclusions for all methods will be given once all of them have been run.