# Encoding Schemes

### Data Generation

In [1]:
import random
from scipy.stats import norm

In [2]:
# list of pet types, cost at pet store
def generate_data_pet_prices(num_samples: int) -> list[tuple[str,int]]:
    '''
    '''
    pets = ["dog", "cat", "fish", "hamster"]
    price_scales = [100, 50, 5, 10]
    data = []
    for _ in range(num_samples):
        pet_idx = random.randint(0,len(pets)-1)
        pet = pets[pet_idx]
        price = random.randint(1,10) * price_scales[pet_idx]
        data.append((pet, price))
    return data

In [3]:
# test grades
def generate_data_test_scores(num_samples: int) -> list[int]:
    '''
    '''
    mean = 0.76
    std_dev = 0.13
    
    scores = []
    for _ in range(num_samples):
        # normally distributed scores
        score = mean + norm.ppf(random.random()) * std_dev
        
        # can't be over 100, under 0
        if score > 1.:
            score = 1.
        if score < 0.:
            score = 0.
        
        # round 
        score = float(int(score * 100)) / 100.
        
        scores.append(score)
    
    return scores

## One hot

Data we want to encode:

In [4]:
def one_hot_encode(data: tuple) -> tuple[tuple[int]]:
    '''
    @param data Assumes you will input a tuple of only the features you want encoded.
    @return A tuple of encodings.
    '''
    # get all possible values
    vals = set()
    for sample in data:
        if sample not in vals:
            vals.add(sample)
    vals = list(vals)
    
    encoded_data = []
    # generate encodings
    for sample in data:
        encoding = []
        for val in vals:
            encoding.append(1 if sample == val else 0)
        encoded_data.append(encoding)
    return encoded_data

In [5]:
random.seed(42196)

data = tuple(generate_data_pet_prices(10))
encoding = one_hot_encode(tuple([sample[0] for sample in data]))
encoded_data = [tuple(list(encoding[i]) + [data[i][1]]) for i in range(len(data))]

print (data)
print ("")
print (encoded_data)

(('cat', 300), ('hamster', 10), ('fish', 50), ('cat', 350), ('cat', 450), ('cat', 100), ('cat', 400), ('dog', 400), ('fish', 45), ('dog', 800))

[(0, 0, 1, 0, 300), (0, 1, 0, 0, 10), (0, 0, 0, 1, 50), (0, 0, 1, 0, 350), (0, 0, 1, 0, 450), (0, 0, 1, 0, 100), (0, 0, 1, 0, 400), (1, 0, 0, 0, 400), (0, 0, 0, 1, 45), (1, 0, 0, 0, 800)]


## Binarization

In [6]:
def binarize(data: list, criteria: callable) -> list[int]:
    '''
    '''
    encoded_data = []
    for sample in data:
        encoded_data.append(1 if criteria(sample) else 0)
    return encoded_data

In [7]:
# passing is 60%, return pass/fail
random.seed(42196)
data = generate_data_test_scores(100)
encoded_data = binarize(data, lambda x: x >= 0.6)

print (data)
print ("")
print (encoded_data)

[0.66, 0.99, 0.45, 0.67, 1.0, 0.85, 0.62, 0.73, 0.77, 0.85, 0.63, 0.57, 0.9, 0.89, 0.9, 0.91, 0.89, 0.76, 0.59, 0.77, 0.75, 0.93, 0.81, 0.58, 0.75, 0.72, 0.65, 0.78, 0.64, 0.59, 0.63, 0.67, 0.66, 0.79, 0.84, 0.67, 0.87, 0.64, 0.6, 0.76, 0.73, 1.0, 0.85, 0.86, 0.85, 0.9, 0.59, 0.85, 0.6, 1.0, 0.64, 0.74, 0.89, 0.66, 0.66, 0.63, 0.84, 0.78, 0.67, 0.63, 0.7, 0.67, 0.92, 0.65, 1.0, 0.8, 1.0, 0.72, 0.84, 0.62, 0.74, 0.57, 0.67, 0.79, 0.67, 0.65, 0.74, 0.85, 0.92, 0.65, 0.9, 0.84, 0.87, 0.73, 0.67, 0.73, 0.8, 0.78, 0.85, 0.83, 0.41, 0.6, 0.83, 0.48, 0.8, 0.76, 0.99, 0.49, 0.71, 0.73]

[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1]


## Discretization

In [8]:
def discretize(data: list, criteria: callable) -> list:
    '''
    '''
    discretized_data = []
    for sample in data:
        discretized_data.append(criteria(sample))
    return discretized_data

In [9]:
# break grades into A, B, C, D, F
def grades(grade: int) -> str:
    if grade >= 0.9:
        return "A"
    elif grade >= 0.8:
        return "B"
    elif grade >= 0.7:
        return "C"
    elif grade >= 0.6:
        return "D"
    else:
        return "F"
    
random.seed(42196)
data = generate_data_test_scores(10)
discretized_data = discretize(data, grades)

print (data)
print ("")
print (discretized_data)

[0.66, 0.99, 0.45, 0.67, 1.0, 0.85, 0.62, 0.73, 0.77, 0.85]

['D', 'A', 'F', 'D', 'A', 'B', 'D', 'C', 'C', 'B']


In [10]:
# because there's an ordinal relationship (arguably... larger bucket for F kind of throws things off)
# we can put in a more usable format
def ordinal_grades(grade: int) -> int:
    val = int((grade - 0.5) * 10)
    return val if val >= 0 and val <= 4 else (0 if val < 0 else 4)

random.seed(42196)
data = generate_data_test_scores(10)
discretized_data = discretize(data, ordinal_grades)

print (data)
print ("")
print (discretized_data)

[0.66, 0.99, 0.45, 0.67, 1.0, 0.85, 0.62, 0.73, 0.77, 0.85]

[1, 4, 0, 1, 4, 3, 1, 2, 2, 3]
