# The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2018 Semester 1
-----
## Project 1: What is labelled data worth to Naive Bayes?
-----
###### Student Name(s): Daniel Masters (583334)
###### Python version: 3
-----
## N.B. 

- I implemented the functions with three datasets in mind: *breast-cancer*, *mushroom*, *hypothyroid*. You can test each of them by modifying the filename variable at the start of the *preprocess* functions. 

- You can test the hold-out functionality by modifying the *hold_out_percent* variable in the preprocess_supervised function.

- I have also attached the 2 individual iPython notebooks where you can run the programs as iPython scripts, if you wish.

In [None]:
# This function should open a data file in csv, and transform it into a usable format 
def preprocess_supervised():
    import pandas as pd
    import math
    from collections import defaultdict

    #Convert CSV to dataframe; slice into two dataframes based on class

    # Select from breast-cancer, mushroom, hypothyroid. Enter hold-out percentage.
    filename = 'breast-cancer-dos.csv'
    df = pd.read_csv(filename, header=None)
    hold_out_percent = 0

    num_rows = len(df)
    df_len = len(df.columns) - 1
    classes_list = df[df_len].unique()
    num_class = len(classes_list)
    hold_outs_index = math.ceil(len(df) - hold_out_percent*num_rows)
    hold_outs = df.iloc[hold_outs_index:len(df),:]
    df = df.iloc[0:hold_outs_index,:]
    num_instances = len(df)
    class1 = df[df[df_len] == str(classes_list[0])]
    class2 = df[df[df_len] == str(classes_list[1])]
    del class1[df_len]
    del class2[df_len]
    return

In [14]:
# This function should build a supervised NB model
def train_supervised():
    
    #Generate attribute instance frequencies for both classes
    class1_freq = [dict() for x in class1]
    class2_freq = [dict() for x in class2]

    for i in class1:
        class1_freq[i] = defaultdict(int)
        for key in class1[i]:
            class1_freq[i][key] += 1

    for i in class2:
        class2_freq[i] = defaultdict(int)
        for key in class2[i]:
            class2_freq[i][key] += 1

    #Generate posterior probabilities and apply log-transformation - i.e. log(P(Xi|Cj))
    class1_posterior = [dict() for x in class1_freq]
    class2_posterior = [dict() for x in class2_freq]

    for i in class1:
        total_class1 = sum(class1_freq[i].values(), 0.0)
        class1_posterior[i][i] = {k: math.log10(v/total_class1) for k, v in class1_freq[i].items()}

    for i in class2:
        total_class2 = sum(class2_freq[i].values(), 0.0)
        class2_posterior[i][i]  = {k: math.log10(v/total_class2) for k, v in class2_freq[i].items()}

    #Generate class probabilities and apply log-transformation - i.e. log(P(Cj))
    class1_prob = math.log10(total_class1/num_instances)
    class2_prob = math.log10(total_class2/num_instances)
    return

In [15]:
# This function should predict the class for a set of instances, based on a trained model 
def predict_supervised():
    class1_predicted = defaultdict(int)
    class2_predicted = defaultdict(int)
    correct = 0

    for i in range(num_rows-hold_outs_index, num_instances):
        for j in range(len(class1.columns)):
            try:
                class2_predicted[i] += class2_posterior[j][j][df[j][i]]
                class1_predicted[i] += class1_posterior[j][j][df[j][i]]

            #Allow for any attributes that are present in only one class dictionary
            except KeyError:
                class2_predicted[i] += 0
                class1_predicted[i] += 0
        class2_predicted[i] += class2_prob
        class1_predicted[i] += class1_prob
    return

In [16]:
# This function should evaluate a set of predictions, in a supervised context 
def evaluate_supervised():
    for i in range(num_rows-hold_outs_index, num_instances):
        if(class1_predicted[i] >= class2_predicted[i]):
            if(df[df_len][i]==str(classes_list[0])):
                correct += 1
        elif(class1_predicted[i] < class2_predicted[i]):
            if(df[df_len][i]==str(classes_list[1])):
                correct += 1
            
    print("Accuracy: " + str(round(correct*100/num_instances, 2)) + "%")
    return

In [13]:
# This function should open a data file in csv, and transform it into a usable format 
def preprocess_unsupervised():     
    import pandas as pd
    import math
    import random
    from collections import defaultdict
    
    # Convert CSV to dataframe
    
    # Select from breast-cancer, mushroom, hypothyroid
    filename = 'breast-cancer-dos.csv'
    df = pd.read_csv(filename, header=None)

    class1 = class2 = 0
    df_len = len(df.columns) - 1
    classes_list = df[df_len].unique()
    num_classes = len(classes_list)
    num_instances = len(df)
    attributes_len = df_len - num_classes
    labelled_data = df[df_len].copy(deep=True)
    
    return

In [17]:
# This function should build an unsupervised NB model 
def train_unsupervised():
    for i in range(num_instances): 
        df.at[i, df_len-1] = random.uniform(0, 1)
        df.at[i, df_len] = 1 - df[df_len-1][i]
        class1 += df.at[i, df_len-1] 
        class2 += df.at[i, df_len]

    # Generate attribute instance frequencies for both classes
    reccurence_freq = [dict() for x in range(attributes_len)]
    no_reccurence_freq = [dict() for x in range(attributes_len)]

    for i in range(attributes_len):
        reccurence_freq[i] = defaultdict(int)
        for j in range(num_instances):
            reccurence_freq[i][df[i][j]] += df[df_len-1][j]

    for i in range(attributes_len):
        no_reccurence_freq[i] = defaultdict(int)
        for j in range(num_instances):
            no_reccurence_freq[i][df[i][j]] += df[df_len][j]

    for i in range(attributes_len):
        for key in reccurence_freq[i]:
            reccurence_freq[i][key] = reccurence_freq[i][key]/class1

    for i in range(attributes_len):
        for key in no_reccurence_freq[i]:
            no_reccurence_freq[i][key] = no_reccurence_freq[i][key]/class2
    
    return

In [18]:
# This function should predict the class distribution for a set of instances, based on a trained model
def predict_unsupervised():

# N.B. I realise this is not the most efficient algorithm -- I was having trouble getting it to work since I'm not very familiar
# with Python, so I designed the algorithm to replicate each step of the process as a "visualised" sanity check of sorts.

    prediction = [defaultdict(int) for x in range(num_instances)]
    prediction2 = [defaultdict(int) for x in range(num_instances)]
    num_iterations = 4
    true_positive = true_negative = false_negative = false_positive = 0

    for iteration in range(num_iterations):
        if iteration == 0:
            for i in range(num_instances):
                for j in range(attributes_len):
                    prediction[i][df[j][i]] = reccurence_freq[j][df[j][i]]
                    prediction[i]['temp'] = 1

            for i in range(num_instances):
                for j in range(attributes_len):
                    prediction2[i][df[j][i]] = no_reccurence_freq[j][df[j][i]]
                    prediction2[i]['temp'] = 1

        for i in range(num_instances):
            for j in range(attributes_len):
                prediction[i]['temp'] *= prediction[i][df[j][i]]
            prediction[i]['pred'] = 1
            prediction[i]['pred'] *= class1/num_instances
            prediction[i]['pred'] *= prediction[i]['temp']

        for i in range(num_instances):
            for j in range(attributes_len):
                prediction2[i]['temp'] *= prediction2[i][df[j][i]]
            prediction2[i]['pred'] = 1
            prediction2[i]['pred'] *= class2/num_instances
            prediction2[i]['pred'] *= prediction2[i]['temp']

        for i in range(num_instances):
            prediction[i]['newclass'] = prediction[i]['pred']/(prediction[i]['pred']+prediction2[i]['pred'])
            prediction2[i]['newclass'] = prediction2[i]['pred']/(prediction[i]['pred']+prediction2[i]['pred'])
    return

In [19]:
# This function should evaluate a set of predictions, in an unsupervised manner
def evaluate_unsupervised():
    for i in range(num_instances):        
        if(prediction[i]['newclass'] >= prediction2[i]['newclass']):
            if(labelled_data[i]==str(classes_list[0])):
                true_positive += 1
            else:
                false_positive += 1
        elif(prediction[i]['newclass'] < prediction2[i]['newclass']):
            if(labelled_data[i]==str(classes_list[1])):
                true_negative += 1
            else:
                false_negative += 1

    print("Accuracy: " + str(round((true_positive+true_negative)*100/num_instances, 2)) + "%\n")

    print("CONFUSION MATRIX\nTrue Positive:  " + str(true_positive) + "     True Negative: " + str(true_negative) 
          + "\nFalse Negative: " + str(false_negative) + "     False Positive: " + str(false_positive))
    
    return

### Question 1

Each of the datasets have numerous similar or duplicated instances, which means the posterior probabilities are high for many of the attributes and means the data is less noisy. In turn, this causes highly skewed predictions, resulting in the mentioned variation in accuracy ("pretty good" vs "utter fail"). Of particular note, I observed relatively high accuracy from the hypothyroid dataset, which has very similar instances throughout. 

Further evidence supporting this hypothesis: when accuracy has been below 50%, the confusion matrix has consistently shown a skew towards false positives, rather than a relatively equal spread between false positives and false negatives.

### Question 3

I observed a reduction in accuracy for each dataset when implementing a hold-out evaluation strategy. This is expected behaviour, since evaluating the model using the training data is essentially just checking how well it can recall data, rather than its ability to predict unknown data. Each dataset exhibited relatively similar reductions, depending on the hold-out percentage:

|          | Hypothyroid | Breast Cancer | Mushroom |
|----------|-------------|---------------|----------|
| Original | 93.17%      | 73.43%        | 84.91%   |
| 10%      | 86.83%      | 67.55%        | 75.30%   |
| 20%      | 73.29%      | 55.02%        | 64.18%   |
| 50%      | 0.06%       | 0.00%         | 0.00%    |

Surprisingly, a 50% hold-out resulted in a steep decline in accuracy. Otherwise, the differences are as expected, and largely similar across the dataset:

|     | Hypothyroid | Breast Cancer | Mushroom |
|-----|-------------|---------------|----------|
| 10% | -6.34%       | -5.88%         | -9.61%    |
| 20% | -19.88%      | -18.41%        | -20.73%   |
| 50% | -93.11%      | -73.43%        | -84.91%   |
