<a href="https://colab.research.google.com/github/dinisrferreira/Pseudodiagnosticity-in-a-continuous-learning-environment-Reis-2020-/blob/main/Data_generator_Reis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from collections import Counter

In [None]:
# Mount Google Drive under /content/drive; the CSV export at the end of the
# notebook writes into a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class ConditionDataset:
    """Generate synthetic four-cell samples (``a, b`` / ``c, d``) with a diagnosis label.

    Every sample is a dict with the keys:

    * ``'input'``: four-element list that is zero everywhere except at the
      position of the initially chosen cell, which holds that cell's value;
    * ``'full_values'``: ``[a, b, c, d]``;
    * ``'label'``: ``-1``, ``1`` or ``0`` (see ``_calculate_diagnosis``);
    * ``'initial_choice'``: index (0-3) of the initially chosen cell;
    * ``'environment'``: ``'wicked'``, ``'kind'`` or ``'no-decision'``.

    Args:
        environment: ``"wicked"`` or ``"kind"``; selects how the decidable
            samples are generated.
        num_samples: total number of samples (decidable + no-decision).
        prob_no_decision: fraction of ``num_samples`` generated as tied
            "no-decision" samples (truncated to an integer count).
        random_state: ``None`` to continue from the current state of the
            global ``random`` generator, a state object as returned by
            ``random.getstate()`` to restore it, or any seed accepted by
            ``random.seed`` (for example an ``int``).

    Raises:
        ValueError: if ``environment`` is unknown or ``prob_no_decision`` is
            outside ``[0, 1]``.

    Note:
        All draws use the module-level ``random`` generator, so constructing
        a dataset advances (or, with ``random_state``, resets) global state.
    """

    # Cell names in the order used by 'input' and 'full_values'.
    _CELLS = ('a', 'b', 'c', 'd')
    _CELL_INDEX = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    # The other cell of the same row (rows are a/b and c/d).
    _ROW_PARTNER = {'a': 'b', 'b': 'a', 'c': 'd', 'd': 'c'}
    # The cell at the same position in the other row.
    _OTHER_ROW_TWIN = {'a': 'c', 'b': 'd', 'c': 'a', 'd': 'b'}
    _ENVIRONMENTS = ('wicked', 'kind')
    # Inclusive randint bounds for "high" and "low" cell values.
    _HIGH_RANGE = (56, 90)
    _LOW_RANGE = (10, 43)

    def __init__(self, environment="wicked", num_samples=1000, prob_no_decision=0.02, random_state=None):
        # Validate before touching the global RNG so a bad call has no side effects.
        self._validate_environment(environment)
        if not 0 <= prob_no_decision <= 1:
            raise ValueError(
                f"prob_no_decision must be within [0, 1], got {prob_no_decision!r}"
            )

        self.num_samples = num_samples
        self.prob_no_decision = prob_no_decision
        self.num_no_decision_samples = int(self.num_samples * self.prob_no_decision)
        self.data = []
        self.random_state = random_state

        # Restore a saved generator state, or seed the generator, if requested.
        if self.random_state is not None:
            if isinstance(self.random_state, (tuple, list)):
                random.setstate(self.random_state)
            else:
                random.seed(self.random_state)

        self._generate_data(environment)

        # Remember the generator state reached once generation has finished.
        self.random_state = random.getstate()

    @classmethod
    def _validate_environment(cls, environment):
        """Raise ``ValueError`` unless ``environment`` is a supported name."""
        if environment not in cls._ENVIRONMENTS:
            raise ValueError(
                f"environment must be one of {cls._ENVIRONMENTS}, got {environment!r}"
            )

    def _build_record(self, values, initial_cell, label, environment):
        """Assemble the sample dict stored in ``self.data``."""
        position = self._CELL_INDEX[initial_cell]
        input_vector = [0, 0, 0, 0]
        input_vector[position] = values[initial_cell]
        return {
            'input': input_vector,
            'full_values': [values[cell] for cell in self._CELLS],
            'label': label,
            'initial_choice': position,
            'environment': environment,
        }

    def _generate_data(self, environment):
        """Append the decidable samples, then the no-decision samples, to ``self.data``."""
        self._validate_environment(environment)

        for _ in range(self.num_samples - self.num_no_decision_samples):
            initial_cell = random.choice(self._CELLS)
            if environment == "wicked":
                values = self._generate_wicked_values(initial_cell)
            else:
                values = self._generate_kind_values(initial_cell)
                values = self._adjust_kind_values(initial_cell, values)
            label = self._calculate_diagnosis(values)
            self.data.append(self._build_record(values, initial_cell, label, environment))

        # No-decision cases: a == d and b == c, which makes both products in
        # _calculate_diagnosis equal, so the label is fixed to 0.
        for _ in range(self.num_no_decision_samples):
            if random.choice([True, False]):
                a = random.randint(52, 94)
                b = random.randint(10, 43)
            else:
                a = random.randint(10, 43)
                b = random.randint(52, 94)
            values = {'a': a, 'b': b, 'c': b, 'd': a}

            initial_cell = random.choice(self._CELLS)
            self.data.append(self._build_record(values, initial_cell, 0, 'no-decision'))

    def _generate_wicked_values(self, initial_cell):
        """Draw a table whose rows each sum to 100 and lean the same way.

        The initial cell is high (56-90) or low (10-43) with equal chance; its
        row partner is the complement to 100. The cell at the same position in
        the other row is drawn high when the initial cell is above 50 and low
        otherwise, and its own row partner is again the complement to 100.
        """
        # Both candidates are drawn before the coin flip (one is discarded);
        # this keeps the sequence of random draws stable for a given state.
        high_value = random.randint(*self._HIGH_RANGE)
        low_value = random.randint(*self._LOW_RANGE)
        revealed = high_value if random.choice([True, False]) else low_value

        twin = self._OTHER_ROW_TWIN[initial_cell]
        values = {
            initial_cell: revealed,
            self._ROW_PARTNER[initial_cell]: 100 - revealed,
        }
        if revealed > 50:
            values[twin] = random.randint(*self._HIGH_RANGE)
        else:
            values[twin] = random.randint(*self._LOW_RANGE)
        values[self._ROW_PARTNER[twin]] = 100 - values[twin]
        return values

    def _draw_low_or_high(self):
        """Return a low (10-43) or high (56-90) value, each with equal chance."""
        if random.choice([True, False]):
            return random.randint(*self._LOW_RANGE)
        return random.randint(*self._HIGH_RANGE)

    def _generate_kind_values(self, initial_cell):
        """Draw four low-or-high values with the initial row's two cells unequal.

        The other row's values drawn here are provisional:
        ``_adjust_kind_values`` overwrites them afterwards.
        """
        # Both candidates are drawn before the coin flip (one is discarded);
        # this keeps the sequence of random draws stable for a given state.
        high_value = random.randint(*self._HIGH_RANGE)
        low_value = random.randint(*self._LOW_RANGE)
        values = {initial_cell: high_value if random.choice([True, False]) else low_value}

        # Redraw the row partner until it differs from the initial cell.
        row_partner = self._ROW_PARTNER[initial_cell]
        while True:
            values[row_partner] = self._draw_low_or_high()
            if values[row_partner] != values[initial_cell]:
                break

        other_row = ('c', 'd') if initial_cell in ('a', 'b') else ('a', 'b')
        for cell in other_row:
            values[cell] = self._draw_low_or_high()
        return values

    @staticmethod
    def _randint_excluding(low, high, excluded):
        """Draw ``random.randint(low, high)`` repeatedly until it is not ``excluded``."""
        while True:
            candidate = random.randint(low, high)
            if candidate != excluded:
                return candidate

    def _adjust_kind_values(self, initial_cell, values):
        """Redraw the row not containing ``initial_cell`` to match the initial row's ordering.

        If the first cell of the initial row is larger than the second, the
        other row is redrawn with its first cell strictly larger than its
        second (and vice versa). Redrawn values never equal 50. ``values`` is
        modified in place and returned; it is left untouched when the initial
        row is tied or ``initial_cell`` is not one of the four cells.
        """
        if initial_cell in ('a', 'b'):
            first, second, other_first, other_second = 'a', 'b', 'c', 'd'
        elif initial_cell in ('c', 'd'):
            first, second, other_first, other_second = 'c', 'd', 'a', 'b'
        else:
            return values

        if values[first] > values[second]:
            larger, smaller = other_first, other_second
        elif values[first] < values[second]:
            larger, smaller = other_second, other_first
        else:
            return values

        # The larger cell starts at 11 so that at least one value (10) is left
        # below it for the smaller cell.
        values[larger] = self._randint_excluding(11, 90, 50)
        values[smaller] = self._randint_excluding(10, values[larger] - 1, 50)
        return values

    def _calculate_diagnosis(self, values):
        """Compare the two products of row proportions and return the label.

        Returns:
            ``-1`` when ``a/(a+b) * c/(c+d)`` is larger, ``1`` when
            ``b/(a+b) * d/(c+d)`` is larger, ``0`` when they are equal.
        """
        prob_disease_A = (values['a'] / (values['a'] + values['b'])) * (values['c'] / (values['c'] + values['d']))
        prob_disease_B = (values['b'] / (values['a'] + values['b'])) * (values['d'] / (values['c'] + values['d']))

        if prob_disease_A > prob_disease_B:
            return -1
        elif prob_disease_A < prob_disease_B:
            return 1
        else:
            return 0

    def get_data(self):
        """Return the list of generated sample dicts (the live list, not a copy)."""
        return self.data

    def get_initial_random_state(self):
        """Return the stored ``random`` generator state.

        NOTE: despite the name, this is the state captured *after* data
        generation finished (``__init__`` overwrites the attribute at the end).
        """
        return self.random_state


In [None]:
# Build one "wicked" dataset followed by two separately generated "kind"
# datasets, 1000 samples each (constructed in this order).
wicked_dataset, kind_dataset, universal_kind_dataset = (
    ConditionDataset(environment=env, num_samples=1000)
    for env in ("wicked", "kind", "kind")
)

# Keep direct references to each dataset's list of sample dicts.
wicked_data, kind_data, universal_kind_data = (
    dataset.get_data()
    for dataset in (wicked_dataset, kind_dataset, universal_kind_dataset)
)

In [None]:
# Show the first 5 "wicked" samples as a quick sanity check.
for sample in wicked_data[:5]:
    print(sample)
print("\n \n")
# Show the first 10 "kind" samples (the slice takes 10, not 5).
for sample in kind_data[:10]:
    print(sample)


{'input': [0, 14, 0, 0], 'full_values': [86, 14, 89, 11], 'label': -1, 'initial_choice': 1, 'environment': 'wicked'}
{'input': [0, 70, 0, 0], 'full_values': [30, 70, 20, 80], 'label': 1, 'initial_choice': 1, 'environment': 'wicked'}
{'input': [0, 0, 87, 0], 'full_values': [79, 21, 87, 13], 'label': -1, 'initial_choice': 2, 'environment': 'wicked'}
{'input': [0, 0, 72, 0], 'full_values': [69, 31, 72, 28], 'label': -1, 'initial_choice': 2, 'environment': 'wicked'}
{'input': [89, 0, 0, 0], 'full_values': [89, 11, 63, 37], 'label': -1, 'initial_choice': 0, 'environment': 'wicked'}

 

{'input': [17, 0, 0, 0], 'full_values': [17, 29, 48, 82], 'label': 1, 'initial_choice': 0, 'environment': 'kind'}
{'input': [0, 0, 0, 11], 'full_values': [81, 79, 89, 11], 'label': -1, 'initial_choice': 3, 'environment': 'kind'}
{'input': [0, 0, 0, 25], 'full_values': [18, 16, 76, 25], 'label': -1, 'initial_choice': 3, 'environment': 'kind'}
{'input': [0, 0, 0, 90], 'full_values': [28, 30, 37, 90], 'label': 1

In [None]:
def count_diagnoses(data):
    """Tally the examples in ``data`` by their 'label' value.

    Returns a ``Counter`` mapping each label to its number of occurrences.
    """
    labels = (example['label'] for example in data)
    return Counter(labels)

def count_wicked_examples(data):
    """Count the examples whose 'environment' is 'wicked'.

    Returns a ``Counter`` with the single key 'wicked', or an empty
    ``Counter`` when no such example exists.
    """
    environments = (example['environment'] for example in data)
    return Counter(env for env in environments if env == 'wicked')

def count_kind_examples(data):
    """Count the examples whose 'environment' is 'kind'.

    Returns a ``Counter`` with the single key 'kind', or an empty
    ``Counter`` when no such example exists.
    """
    environments = (example['environment'] for example in data)
    return Counter(env for env in environments if env == 'kind')

def distribution_of_cell_values(data):
    """Collect the values of each cell across all examples.

    Reads positions 0-3 of every example's 'full_values' and returns a dict
    with the keys 'a', 'b', 'c', 'd', each holding the values in input order.
    """
    cell_names = ('a', 'b', 'c', 'd')
    collected = {name: [] for name in cell_names}
    for example in data:
        full = example['full_values']
        for position, name in enumerate(cell_names):
            collected[name].append(full[position])
    return collected

def count_initial_choices(data):
    """Tally the examples in ``data`` by their 'initial_choice' value.

    Returns a ``Counter`` mapping each initial-choice index to its count.
    """
    choices = (example['initial_choice'] for example in data)
    return Counter(choices)

def plot_cell_value_distribution(cell_values):
    """Show one histogram (20 bins, with KDE curve) per entry of ``cell_values``.

    ``cell_values`` maps a cell name to a list of values, as returned by
    ``distribution_of_cell_values``. Each figure is displayed with
    ``plt.show()`` before the next one is drawn; nothing is returned.
    """
    for cell, values in cell_values.items():
        sns.histplot(values, bins=20, kde=True)
        # Title and axis labels apply to the current pyplot figure.
        plt.title(f'Distribution of Values for Cell {cell.upper()}')
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.show()

In [None]:
# Label distribution of each dataset: wicked first, then kind.
print("Diagnosis Counts:")
print(count_diagnoses(wicked_dataset.get_data()))

print(count_diagnoses(kind_dataset.get_data()))

print("\n")

# Number of records tagged 'wicked' in each dataset.
print("Wicked Examples:")
print("WICKED DATASET: ",count_wicked_examples(wicked_dataset.get_data()))
print("KIND DATASET: ", count_wicked_examples(kind_dataset.get_data()))

print("\n")

# Number of records tagged 'kind' in each dataset.
print("Kind Examples:")
print("WICKED DATASET: ",count_kind_examples(wicked_dataset.get_data()))
print("KIND DATASET: ", count_kind_examples(kind_dataset.get_data()))


# Per-cell value lists for the kind dataset (plotting is currently disabled).
cell_values = distribution_of_cell_values(kind_data)
#plot_cell_value_distribution(cell_values)

# Label and initial-choice tallies for the kind dataset; computed here but
# not printed in this cell.
diagnosis_counts = count_diagnoses(kind_data)
initial_choice_counts = count_initial_choices(kind_data)

Diagnosis Counts:
Counter({-1: 505, 1: 475, 0: 20})
Counter({1: 491, -1: 489, 0: 20})


Wicked Examples:
WICKED DATASET:  Counter({'wicked': 980})
KIND DATASET:  Counter()


Kind Examples:
WICKED DATASET:  Counter()
KIND DATASET:  Counter({'kind': 980})


A conversão em percentagens acontece depois de o dataset ser criado, porque é vantajoso trabalhar com inteiros durante a criação das condições.

In [None]:
def convert_percentages(data):
    """Divide every 'input' and 'full_values' entry by 100, in place.

    An item is skipped when its first 'input' value is already a float that
    equals itself rounded to two decimals, which makes repeated calls on
    already-converted data a no-op. The same ``data`` object is returned.
    """
    for item in data:
        first_input = item['input'][0]
        already_converted = (
            isinstance(first_input, float) and round(first_input, 2) == first_input
        )
        if already_converted:
            continue

        # Replace both lists with their scaled counterparts.
        for key in ('input', 'full_values'):
            item[key] = [value / 100.0 for value in item[key]]
    return data

In [None]:
# Conversion to percentages: scale the integer cell values to fractions
# (value / 100). convert_percentages modifies the lists in place and returns
# the same object, so the reassignment keeps the existing names.
wicked_data = convert_percentages(wicked_data)
kind_data = convert_percentages(kind_data)
universal_kind_data = convert_percentages(universal_kind_data)

In [None]:
# Step 2: Tidying the data into a table format.
# Each sample dict becomes one row; the dict keys become the columns.
wicked_df = pd.DataFrame(wicked_data)
kind_df = pd.DataFrame(kind_data)
universal_kind_df = pd.DataFrame(universal_kind_data)

In [None]:
def _stratified_train_test_split(df, test_size=0.2, random_state=42):
    """Split ``df`` into train/test frames, stratified on "label == 0".

    A helper column 'stratify_label' (1 where 'label' is 0, else 0) is added
    to ``df`` itself, as the original per-dataset code did, and is dropped
    from the two returned frames.

    Args:
        df: DataFrame with a 'label' column.
        test_size: fraction of rows assigned to the test frame.
        random_state: seed passed to ``StratifiedShuffleSplit``.

    Returns:
        ``(train_set, test_set)`` DataFrames.
    """
    df['stratify_label'] = df['label'].apply(lambda x: 1 if x == 0 else 0)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(splitter.split(df, df['stratify_label']))
    # The splitter yields positional indices, so select with .iloc; .loc only
    # gives the same rows when the frame has a default 0..n-1 index.
    train_set = df.iloc[train_index].drop(columns=['stratify_label'])
    test_set = df.iloc[test_index].drop(columns=['stratify_label'])
    return train_set, test_set


# Stratified 80/20 split of the wicked, kind and universal kind datasets
wicked_train_set, wicked_test_set = _stratified_train_test_split(wicked_df)
kind_train_set, kind_test_set = _stratified_train_test_split(kind_df)
universal_kind_train_set, universal_kind_test_set = _stratified_train_test_split(universal_kind_df)

# Report the shape of every train and test set
print(f'Wicked train set: {wicked_train_set.shape}')
print(f'Wicked test set: {wicked_test_set.shape}')
print(f'Kind train set: {kind_train_set.shape}')
print(f'Kind test set: {kind_test_set.shape}')
print(f'Universal Kind train set: {universal_kind_train_set.shape}')
print(f'Universal Kind test set: {universal_kind_test_set.shape}')

Wicked train set: (800, 5)
Wicked test set: (200, 5)
Kind train set: (800, 5)
Kind test set: (200, 5)
Universal Kind train set: (800, 5)
Universal Kind test set: (200, 5)


In [None]:
# Share of no-decision rows in a dataframe
def check_no_decision_portion(dataframe):
    """Return the fraction of rows whose 'environment' is 'no-decision'.

    Raises ``ZeroDivisionError`` when ``dataframe`` has no rows.
    """
    is_no_decision = dataframe['environment'] == 'no-decision'
    return int(is_no_decision.sum()) / len(dataframe)


In [None]:
# Fraction of 'no-decision' rows in each train/test split; the values are
# stored but not printed in this cell.
wicked_train_no_decision_portion = check_no_decision_portion(wicked_train_set)
wicked_test_no_decision_portion = check_no_decision_portion(wicked_test_set)
kind_train_no_decision_portion = check_no_decision_portion(kind_train_set)
kind_test_no_decision_portion = check_no_decision_portion(kind_test_set)
universal_kind_train_no_decision_portion = check_no_decision_portion(universal_kind_train_set)
universal_kind_test_no_decision_portion = check_no_decision_portion(universal_kind_test_set)

In [None]:
# Preview the first rows of the wicked training split.
wicked_train_set.head()

Unnamed: 0,input,full_values,label,initial_choice,environment
374,"[0.0, 0.0, 0.35, 0.0]","[0.27, 0.73, 0.35, 0.65]",1,2,wicked
197,"[0.72, 0.0, 0.0, 0.0]","[0.72, 0.28, 0.87, 0.13]",-1,0,wicked
81,"[0.1, 0.0, 0.0, 0.0]","[0.1, 0.9, 0.19, 0.81]",1,0,wicked
473,"[0.77, 0.0, 0.0, 0.0]","[0.77, 0.23, 0.88, 0.12]",-1,0,wicked
795,"[0.0, 0.0, 0.89, 0.0]","[0.89, 0.11, 0.89, 0.11]",-1,2,wicked


In [None]:
# One timestamp shared by every file written in this run
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Destination folder on the mounted Google Drive
folder_path = '/content/drive/My Drive/ProjetoGulbenkian/'

# Write each split to '<folder><name>_<timestamp>.csv' without the index column
splits_to_save = (
    ('wicked_train_set', wicked_train_set),
    ('wicked_test_set', wicked_test_set),
    ('kind_train_set', kind_train_set),
    ('kind_test_set', kind_test_set),
    ('universal_kind_train_set', universal_kind_train_set),
    ('universal_kind_test_set', universal_kind_test_set),
)
for split_name, split_frame in splits_to_save:
    split_frame.to_csv(f'{folder_path}{split_name}_{timestamp}.csv', index=False)