In [4]:
import random
from typing import Optional

In [5]:
def print_data(data):
    """Print each sample of ``data`` on its own line, in iteration order."""
    for row in data:
        print(row)

## Missing Values

In [6]:
def generate_missing_data(
    num_features: int, num_samples: int, prob_missing: float = 0.1
) -> list[list[Optional[int]]]:
    """Generate a random table of samples in which some entries are missing.

    Each entry is independently replaced by ``None`` with probability
    ``prob_missing``; otherwise it is a random digit in 0..9 multiplied by
    ``2 ** feat``, where ``feat`` is the zero-based feature (column) index.

    Values are drawn from the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand for reproducible output.

    Args:
        num_features: number of entries per sample (columns).
        num_samples: number of samples (rows).
        prob_missing: probability that any single entry is ``None``.

    Returns:
        A list of ``num_samples`` lists, each of length ``num_features``,
        whose entries are ints or ``None``.
    """
    # The missing-value draw happens before the value draw for every entry,
    # and the value draw is skipped when the entry is missing.
    return [
        [
            None
            if random.random() < prob_missing
            else random.randint(0, 9) * 2 ** feat
            for feat in range(num_features)
        ]
        for _ in range(num_samples)
    ]

Omission

In [10]:
def omit_missing(data):
    """Return a new list with only the samples that contain no ``None``.

    The input list is left untouched; the kept samples are the same objects
    as in ``data`` (they are not copied).
    """
    return [row for row in data if None not in row]

In [11]:
# Seed the generator so the demo prints the same table on every run.
random.seed(42196)
data = generate_missing_data(num_features=4, num_samples=10)
print_data(data)
print("")

# Keep only the samples that have a value for every feature.
data = omit_missing(data)
print_data(data)

[6, None, 36, 16]
[3, 4, 8, 8]
[5, 16, 28, 56]
[9, 2, 24, 72]
[6, None, 16, 0]
[8, 6, 24, 48]
[2, 2, 4, None]
[7, 8, 12, 72]
[9, 2, 20, None]
[3, 16, 16, 8]

[3, 4, 8, 8]
[5, 16, 28, 56]
[9, 2, 24, 72]
[8, 6, 24, 48]
[7, 8, 12, 72]
[3, 16, 16, 8]


Interpolate

In [16]:
# replace missing data with feature median
def interpolate_missing(data):
    """Fill ``None`` entries with the median of the observed values of that feature.

    The number of features is taken from the first sample. For a feature with
    an even number of observed values the upper of the two middle values is
    used (no averaging), so every filled-in value is one that actually occurs
    in the data.

    ``data`` is modified in place and the same list object is returned.

    Edge cases:
        * Empty ``data`` is returned unchanged.
        * A feature with no observed values at all has no median; its
          entries are left as ``None``.

    Args:
        data: list of samples, each a list of values or ``None``.

    Returns:
        ``data`` itself, with missing entries replaced where possible.
    """
    # Nothing to fill in; this also avoids indexing data[0] on an empty list.
    if not data:
        return data

    # Find the (upper) median of each feature, ignoring missing entries.
    medians = []
    for feat in range(len(data[0])):
        observed = sorted(row[feat] for row in data if row[feat] is not None)
        # A column with no observed values keeps None as its "median".
        medians.append(observed[len(observed) // 2] if observed else None)

    # Replace missing values with the per-feature medians.
    for row in data:
        for feat, value in enumerate(row):
            if value is None:
                row[feat] = medians[feat]

    return data

In [23]:
# Re-seed so this demo starts from the same table as the omission demo.
random.seed(42196)
data = generate_missing_data(num_features=4, num_samples=10)
print_data(data)
print("")

# Fill the gaps with per-feature medians and show the completed table.
data = interpolate_missing(data)
print_data(data)

[6, None, 36, 16]
[3, 4, 8, 8]
[5, 16, 28, 56]
[9, 2, 24, 72]
[6, None, 16, 0]
[8, 6, 24, 48]
[2, 2, 4, None]
[7, 8, 12, 72]
[9, 2, 20, None]
[3, 16, 16, 8]

[6, 6, 36, 16]
[3, 4, 8, 8]
[5, 16, 28, 56]
[9, 2, 24, 72]
[6, 6, 16, 0]
[8, 6, 24, 48]
[2, 2, 4, 48]
[7, 8, 12, 72]
[9, 2, 20, 48]
[3, 16, 16, 8]


## Outliers

In [18]:
from scipy.stats import norm

In [19]:
# using mile time as an example, mean for 22-26 male: 8:45, std. dev. estimate: 1:00
def generate_normal_data(
    num_samples: int, mean: float = 525, std_dev: float = 60
) -> list[int]:
    """Draw normally distributed integer samples using the ``random`` module.

    Each sample is produced by inverse-transform sampling: a uniform draw
    from ``random.random()`` is mapped through the standard normal quantile
    function (``norm.ppf``), scaled by ``std_dev``, shifted by ``mean`` and
    truncated toward zero with ``int()``.

    The defaults model mile times in seconds: 525 s (8:45) mean and
    60 s (1:00) standard deviation.

    Args:
        num_samples: number of values to generate.
        mean: mean of the distribution (default 525).
        std_dev: standard deviation of the distribution (default 60).

    Returns:
        A list of ``num_samples`` ints.
    """
    samples = []
    for _ in range(num_samples):
        quantile = random.random()
        # random.random() is drawn from [0.0, 1.0); an exact 0.0 would map to
        # -inf under norm.ppf and make int() raise, so redraw in that case.
        while quantile == 0.0:
            quantile = random.random()
        samples.append(int(mean + std_dev * norm.ppf(quantile)))
    return samples

In [24]:
def print_mile_times(data):
    """Print all samples on one line as ``minutes:seconds``.

    Each sample is split into whole multiples of 60 and a zero-padded
    two-digit remainder; every formatted time is followed by a single space.
    """
    formatted = [f"{sample//60}:{sample%60:02} " for sample in data]
    print("".join(formatted))

In [27]:
def remove_outliers(
    data: list[int], threshold: float = 3
) -> tuple[list[int], list[int]]:
    """Split ``data`` into inliers and outliers using a z-score cut-off.

    The z-score of each sample is computed against the population mean and
    population standard deviation (dividing by ``len(data)``) of ``data``
    itself. A sample is an outlier when its z-score is strictly greater than
    ``threshold`` in absolute value.

    Edge cases:
        * Empty ``data`` returns two empty lists.
        * If the standard deviation is zero (all samples equal), no z-score
          is defined, so every sample is kept and there are no outliers.

    Args:
        data: the samples to filter; not modified.
        threshold: number of standard deviations beyond which a sample is
            treated as an outlier (default 3).

    Returns:
        ``(kept, outliers)``, each preserving the original order.
    """
    # Guard against division by zero when computing the mean.
    if not data:
        return [], []

    # Compute population mean and standard deviation.
    mean = sum(data) / len(data)
    std_dev = (sum((x - mean) ** 2 for x in data) / len(data)) ** 0.5

    # Guard against division by zero when computing z-scores.
    if std_dev == 0:
        return list(data), []

    kept = []
    outliers = []
    for sample in data:
        z_score = (sample - mean) / std_dev
        if abs(z_score) > threshold:
            outliers.append(sample)
        else:
            kept.append(sample)

    return kept, outliers

In [28]:
# Seed the generator so the demo prints the same mile times on every run.
random.seed(42196)

data = generate_normal_data(num_samples=100)
print_mile_times(data)
print("")

# Separate the samples into retained values and outliers, then show both.
data, outliers = remove_outliers(data)
print("Data:")
print_mile_times(data)
print("Outliers:")
print_mile_times(outliers)

8:00 10:32 6:26 8:05 10:57 9:29 7:40 8:34 8:50 9:26 7:47 7:21 9:51 9:48 9:51 9:57 9:46 8:48 7:28 8:51 8:41 10:05 9:11 7:25 8:44 8:27 7:55 8:56 7:51 7:26 7:45 8:06 8:02 9:02 9:23 8:07 9:39 7:53 7:33 8:45 8:34 11:01 9:29 9:33 9:28 9:49 7:28 9:28 7:33 10:38 7:50 8:39 9:47 8:00 8:02 7:49 9:22 8:57 8:05 7:47 8:20 8:07 10:01 7:58 10:53 9:04 11:59 8:27 9:25 7:40 8:40 7:19 8:05 9:02 8:05 7:54 8:36 9:29 10:02 7:56 9:52 9:22 9:40 8:34 8:04 8:34 9:06 8:57 9:30 9:20 6:04 7:32 9:21 6:37 9:07 8:46 10:34 6:43 8:22 8:33 

Data:
8:00 10:32 6:26 8:05 10:57 9:29 7:40 8:34 8:50 9:26 7:47 7:21 9:51 9:48 9:51 9:57 9:46 8:48 7:28 8:51 8:41 10:05 9:11 7:25 8:44 8:27 7:55 8:56 7:51 7:26 7:45 8:06 8:02 9:02 9:23 8:07 9:39 7:53 7:33 8:45 8:34 11:01 9:29 9:33 9:28 9:49 7:28 9:28 7:33 10:38 7:50 8:39 9:47 8:00 8:02 7:49 9:22 8:57 8:05 7:47 8:20 8:07 10:01 7:58 10:53 9:04 8:27 9:25 7:40 8:40 7:19 8:05 9:02 8:05 7:54 8:36 9:29 10:02 7:56 9:52 9:22 9:40 8:34 8:04 8:34 9:06 8:57 9:30 9:20 6:04 7:32 9:21 6:37 9:07 8:46

## Feature Enrichment

In [51]:
# TODO: come up with a feature-enrichment example here,
# e.g. deriving date of birth and job type from an ID number?

## Transformations

In [33]:
# Imagine answers to a 4-question multiple choice exam (options A, B, C, D),
# collected through a badly implemented Google Form, so the same answer shows
# up in several spellings ('A', 'a', 'a.').
def generate_bad_data() -> list[str]:
    """Return six raw exam responses with inconsistent formatting.

    Each string holds one respondent's four answers, but the letter case and
    the separator (space, newline, period, comma) vary between responses.
    """
    responses = [
        "a b a c",
        "A A C D",
        "a. b. c. d.",
        "a\nb\nd\nc",
        "A a b. A",
        "a,c,c,d",
    ]
    return responses

In [49]:
def transform_data(data: list[str]) -> list[list[str]]:
    """Normalise raw responses into lists of single lowercase characters.

    Every response is lowercased, all spaces, newlines, periods and commas
    are removed, and the remaining characters are returned as a list. Any
    other character is kept as is.
    """
    cleaned = []
    for answer in data:
        normalized = answer.lower()
        # Removing each separator in turn leaves only the answer characters.
        for separator in (" ", "\n", ".", ","):
            normalized = normalized.replace(separator, "")
        cleaned.append(list(normalized))
    return cleaned

In [50]:
# Show the raw responses, then the normalised per-question answers.
data = generate_bad_data()
print(data)
print("")
print(transform_data(data))

['a b a c', 'A A C D', 'a. b. c. d.', 'a\nb\nd\nc', 'A a b. A', 'a,c,c,d']

[['a', 'b', 'a', 'c'], ['a', 'a', 'c', 'd'], ['a', 'b', 'c', 'd'], ['a', 'b', 'd', 'c'], ['a', 'a', 'b', 'a'], ['a', 'c', 'c', 'd']]
