# Standardizing data sets
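- Binning continuous variables into categorical ranges
- Converting categorical variables into dummy (indicator) columns
- Checking the number of unique values per column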

## Import modules and create/import data

In [50]:
import pandas
import numpy
import random

In [59]:
# Toy roster of ten students. Each key becomes one column, and the lists are
# row-aligned: position i in every list describes the same student.
student_records = {
    "student_age": [10, 13, 12, 13, 14, 12, 12, 13, 15, 16],
    "student_height": [140, 145, 144, 150, 162, 123, 146, 152, 139, 138],
    "student_name": [
        "george", "sally", "lisa", "julie", "david",
        "tyson", "harry", "gregory", "max", "sarah",
    ],
    "student_score": [0.54, 0.25, 0.76, 0.89, 0.98, 0.76, 0.65, 0.45, 0.49, 0.23],
    "student_fav_colour": [
        "Red", "Red", "Blue", "Blue", "Blue",
        "Purple", "Purple", "Red", "Yellow", "Yellow",
    ],
    "student_gender": [
        "male", "male", "male", "female", "female",
        "female", "male", "male", "female", "male",
    ],
    "student_outcome": [1, 0, 0, 1, 0, 1, 1, 1, 0, 1],
    "student_breakfast": ["y", "n", "y", "y", "n", "y", "y", "n", "y", "y"],
}

# Column order in the frame follows the key order of the dict above.
data = pandas.DataFrame(student_records)

In [60]:
# Show the first three rows of the freshly built frame.
data.head(3)

Unnamed: 0,student_age,student_height,student_name,student_score,student_fav_colour,student_gender,student_outcome,student_breakfast
0,10,140,george,0.54,Red,male,1,y
1,13,145,sally,0.25,Red,male,0,n
2,12,144,lisa,0.76,Blue,male,0,y


## Create the standardized data set

In [None]:
# Count the distinct values of every column in one call, then print one
# "<column> <count>" line per column, in column order.
unique_counts = data.nunique()
for column, count_unique in unique_counts.items():
    print(column, count_unique)

### Converting Continuous Variables to Categorical Variables via Binning

In [62]:
# Continuous columns to discretise, and the number of equal-width bins
# pandas.cut should split each column's value range into.
continuous_column_names = ["student_age", "student_height", "student_score"]
number_bins = 3

# Bin every continuous column up front; each result is named "binned_<column>".
binned_columns = [
    pandas.cut(data[name], bins=number_bins).rename("binned_" + name)
    for name in continuous_column_names
]

# Replace the raw continuous columns with their binned versions: the remaining
# columns keep their order and the binned columns are appended on the right.
data = pandas.concat(
    [data.drop(columns=continuous_column_names), *binned_columns],
    axis=1,
)

In [63]:
# Display the whole frame (a bare last expression is rendered as the cell output).
data

Unnamed: 0,student_name,student_fav_colour,student_gender,student_outcome,student_breakfast,binned_student_age,binned_student_height,binned_student_score
0,george,Red,male,1,y,"(9.994, 12.0]","(136.0, 149.0]","(0.48, 0.73]"
1,sally,Red,male,0,n,"(12.0, 14.0]","(136.0, 149.0]","(0.229, 0.48]"
2,lisa,Blue,male,0,y,"(9.994, 12.0]","(136.0, 149.0]","(0.73, 0.98]"
3,julie,Blue,female,1,y,"(12.0, 14.0]","(149.0, 162.0]","(0.73, 0.98]"
4,david,Blue,female,0,n,"(12.0, 14.0]","(149.0, 162.0]","(0.73, 0.98]"
5,tyson,Purple,female,1,y,"(9.994, 12.0]","(122.961, 136.0]","(0.73, 0.98]"
6,harry,Purple,male,1,y,"(9.994, 12.0]","(136.0, 149.0]","(0.48, 0.73]"
7,gregory,Red,male,1,n,"(12.0, 14.0]","(149.0, 162.0]","(0.229, 0.48]"
8,max,Yellow,female,0,y,"(14.0, 16.0]","(136.0, 149.0]","(0.48, 0.73]"
9,sarah,Yellow,male,1,y,"(14.0, 16.0]","(136.0, 149.0]","(0.229, 0.48]"


### Converting Categorical Columns to Dummy Dataframes

In [64]:
# Categorical columns (the original ones plus the binned ones) to replace with
# dummy / indicator columns.
category_column_names = ["student_fav_colour", "student_gender", "student_breakfast",
                        "binned_student_age", "binned_student_height", "binned_student_score"]

# Encode all listed columns in a single call. get_dummies removes the source
# columns itself and appends the indicators on the right, in the order given,
# naming each one "<column>_<level>" (the default prefix is the column name).
#   drop_first=True - omit the first level of each column; it is implied when
#                     all of that column's remaining indicators are 0.
#   dtype=int       - pandas >= 2.0 defaults to bool (True/False); pinning int
#                     keeps the indicators as 0/1 on every pandas version.
data = pandas.get_dummies(
    data,
    columns=category_column_names,
    drop_first=True,
    dtype=int,
)

In [65]:
# Display the whole frame (a bare last expression is rendered as the cell output).
data

Unnamed: 0,student_name,student_outcome,student_fav_colour_Purple,student_fav_colour_Red,student_fav_colour_Yellow,student_gender_male,student_breakfast_y,"binned_student_age_(12.0, 14.0]","binned_student_age_(14.0, 16.0]","binned_student_height_(136.0, 149.0]","binned_student_height_(149.0, 162.0]","binned_student_score_(0.48, 0.73]","binned_student_score_(0.73, 0.98]"
0,george,1,0,1,0,1,1,0,0,1,0,1,0
1,sally,0,0,1,0,1,0,1,0,1,0,0,0
2,lisa,0,0,0,0,1,1,0,0,1,0,0,1
3,julie,1,0,0,0,0,1,1,0,0,1,0,1
4,david,0,0,0,0,0,0,1,0,0,1,0,1
5,tyson,1,1,0,0,0,1,0,0,0,0,0,1
6,harry,1,1,0,0,1,1,0,0,1,0,1,0
7,gregory,1,0,1,0,1,0,1,0,0,1,0,0
8,max,0,0,0,1,0,1,0,1,1,0,1,0
9,sarah,1,0,0,1,1,1,0,1,1,0,0,0


In [66]:
# Remove the student_name column and rebind `data` to the resulting frame.
data = data.drop(columns=["student_name"])

In [67]:
# Display the whole frame (a bare last expression is rendered as the cell output).
data

Unnamed: 0,student_outcome,student_fav_colour_Purple,student_fav_colour_Red,student_fav_colour_Yellow,student_gender_male,student_breakfast_y,"binned_student_age_(12.0, 14.0]","binned_student_age_(14.0, 16.0]","binned_student_height_(136.0, 149.0]","binned_student_height_(149.0, 162.0]","binned_student_score_(0.48, 0.73]","binned_student_score_(0.73, 0.98]"
0,1,0,1,0,1,1,0,0,1,0,1,0
1,0,0,1,0,1,0,1,0,1,0,0,0
2,0,0,0,0,1,1,0,0,1,0,0,1
3,1,0,0,0,0,1,1,0,0,1,0,1
4,0,0,0,0,0,0,1,0,0,1,0,1
5,1,1,0,0,0,1,0,0,0,0,0,1
6,1,1,0,0,1,1,0,0,1,0,1,0
7,1,0,1,0,1,0,1,0,0,1,0,0
8,0,0,0,1,0,1,0,1,1,0,1,0
9,1,0,0,1,1,1,0,1,1,0,0,0


### Checking value counts after processing

In [74]:
# Count the distinct values of every column in one call, then print each
# column name and its count joined by a long underscore separator.
unique_counts = data.nunique()
for column, count_unique in unique_counts.items():
    print(column, count_unique, sep="_____________________")

student_outcome_____________________2
student_fav_colour_Purple_____________________2
student_fav_colour_Red_____________________2
student_fav_colour_Yellow_____________________2
student_gender_male_____________________2
student_breakfast_y_____________________2
binned_student_age_(12.0, 14.0]_____________________2
binned_student_age_(14.0, 16.0]_____________________2
binned_student_height_(136.0, 149.0]_____________________2
binned_student_height_(149.0, 162.0]_____________________2
binned_student_score_(0.48, 0.73]_____________________2
binned_student_score_(0.73, 0.98]_____________________2
