In [1]:
##################################
#       DATA NORMALIZATION       #
# This means rescaling of input  #
# data to a particular range,    #
# for this example: 0 to 1       #
##################################

from csv import reader

def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``. Empty records (blank
        lines) are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, "\r\n" embedded in a quoted field
    # is silently rewritten to "\n" by universal-newline translation.
    with open(filename, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]

# Convert string column to float
def str_column_to_float(dataset, column):
    """Replace the string at index ``column`` of every row with its float value.

    Surrounding whitespace is stripped before conversion. The rows are
    modified in place and nothing is returned.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())


def dataset_minmax(dataset):
    """Return ``[min, max]`` for every column of ``dataset``.

    The number of columns is taken from the first row. The result is a list
    with one two-element list per column, in column order.
    """
    column_count = len(dataset[0])
    # Lazily materialise one column at a time, then reduce it to its bounds.
    columns = ([record[index] for record in dataset]
               for index in range(column_count))
    return [[min(values), max(values)] for values in columns]


###################################################################
####        Scaled Value = (value - min) / (max - min)         ####
###################################################################

def normalize_dataset(dataset, minmax):
    """Min-max rescale every value of ``dataset`` in place.

    Each value becomes ``(value - min) / (max - min)`` using the bounds of
    its column.

    Args:
        dataset: List of rows of numeric values; modified in place.
        minmax: One ``[min, max]`` pair per column, e.g. the result of
            ``dataset_minmax``.

    Returns:
        None.
    """
    for row in dataset:
        for i, value in enumerate(row):
            col_min = minmax[i][0]
            span = minmax[i][1] - col_min
            if span == 0:
                # Constant column: every value equals the minimum, so there
                # is no range to scale over. Map to 0.0 rather than raising
                # ZeroDivisionError.
                row[i] = 0.0
            else:
                row[i] = (value - col_min) / span


In [3]:
# Load the raw CSV; the fields are still strings at this point.
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)

print('Loaded data file {0} with {1} rows and {2} columns'
      .format(filename, len(dataset), len(dataset[0])))

# Convert every column from string to float, in place
# (the column count is taken from the first row).
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print("First Row of actual Dataset: ", dataset[0])

# Per-column [min, max] pairs, computed before any rescaling.
minmax = dataset_minmax(dataset)
print("The minmax array: ", minmax)

# Rescale in place: from here on `dataset` holds the normalized values,
# not the values read from the file.
normalize_dataset(dataset, minmax)
print("First Row of  Dataset: ", dataset[0])

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns
First Row of actual Dataset:  [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
The minmax array:  [[0.0, 17.0], [0.0, 199.0], [0.0, 122.0], [0.0, 99.0], [0.0, 846.0], [0.0, 67.1], [0.078, 2.42], [21.0, 81.0], [0.0, 1.0]]
First Row of  Dataset:  [0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]


In [14]:
######################################
#       DATA STANDARDIZATION         #
# This means centering the data      #
# on the value 0 with a standard     #
# deviation of 1. Together, these    #
# values describe a standard Normal  #
# (Gaussian) distribution            #
######################################

from math import sqrt

# Calculating column means
def column_means(dataset):
    """Return the arithmetic mean of every column of ``dataset``.

    The number of columns is taken from the first row; the divisor is the
    number of rows. The result is a list of floats in column order.
    """
    row_count = float(len(dataset))
    return [
        sum(record[index] for record in dataset) / row_count
        for index in range(len(dataset[0]))
    ]


# Calculating column standard deviations
def column_stdevs(dataset, means):
    """Return the sample standard deviation of every column of ``dataset``.

    ``means`` supplies the mean of each column. For every column the squared
    deviations from its mean are summed, divided by ``len(dataset) - 1`` and
    square-rooted. The result is a list of floats in column order.
    """
    degrees_of_freedom = float(len(dataset) - 1)
    squared_totals = []
    for index in range(len(dataset[0])):
        centre = means[index]
        squared_totals.append(
            sum(pow(record[index] - centre, 2) for record in dataset)
        )
    return [sqrt(total / degrees_of_freedom) for total in squared_totals]


###################################################################
####        Standardized Value = (value - mean) / stdev        ####
###################################################################

# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    """Standardize every value of ``dataset`` in place.

    Each value becomes ``(value - mean) / stdev`` using the statistics of
    its column.

    Args:
        dataset: List of rows of numeric values; modified in place.
        means: Mean of each column, in column order.
        stdevs: Standard deviation of each column, in column order.

    Returns:
        None.
    """
    for row in dataset:
        for i, value in enumerate(row):
            spread = stdevs[i]
            if spread == 0:
                # Constant column: there is no spread to scale by. Map to
                # 0.0 rather than raising ZeroDivisionError.
                row[i] = 0.0
            else:
                row[i] = (value - means[i]) / spread
            
            

# NOTE: when the cells run top to bottom, `dataset` is the same list that
# normalize_dataset() already rescaled in place, so the statistics below
# describe the normalized values rather than the raw CSV values.
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
print("Means: ", means)

print("Standard Deviation: ", stdevs)

# Standardize the dataset in place and show its first row as a sample.
standardize_dataset(dataset, means, stdevs)
print("Standardized Dataset with Means and Standard Deviation: ", dataset[0])

Means:  [0.2261795343137268, 0.6075102072864326, 0.5664382684426226, 0.207438973063974, 0.09432562549251379, 0.47678953986587147, 0.16817946288784516, 0.2040147569444446, 0.3489583333333333]
Standard Deviation:  [0.19821047427640448, 0.16066642309113685, 0.15865415713643255, 0.16113351078512722, 0.13622222500158185, 0.11749866349292773, 0.1414725000054547, 0.19600385901131126, 0.4769513772427971]
Standardized Dataset with Means and Standard Deviation:  [0.6395304921176485, 0.8477713205896689, 0.14954329852954532, 0.9066790623472494, -0.6924393247241297, 0.20387990726747174, 0.4681868702297959, 1.4250667195933597, 1.3650063669598067]
