In [39]:
# Alyssa Cox
# Machine Learning HW3: Pre-processing of Data

import math
import pandas as pd
from HW3_explore import get_df, get_mean, get_max_and_min

def find_nans(df):
    col_nans = []
    headers = df.dtypes.index
    for row in df.itertuples():
            for i in range(len(row)):
                if math.isnan((row[i])):
                    col_name = headers[i - 1]
                    if col_name in col_nans:
                        continue
                    else:
                        col_nans.append(col_name)
    return col_nans

def fill_values(df, column, fill_val="mean"):
    '''
    Fills in NaN cell values with either the mean from that column or a pre-determined value

    Inputs:
            df: a pandas data frame
            column: column to fill in nan values
            fill_val: default is mean value of column, but can be specified by user for
            categorical or binary vars

    Returns: pandas data frame with no missing values for the column specified
    '''

    if fill_val == "mean":
        mean = get_mean(df, column)
        df[column].fillna(mean, inplace=True)
    else:
        df[column].fillna(fill_val, inplace=True)

    return df

def discretize(df, column, num_buckets):
    '''
    Turns a continuous variable into a discrete one by separating
    continuous values into intervals

    Inputs:
            df: a pandas data frame
            column: specific column to discretize (must have numerical data)
            num_buckets: the number of intervals to separate the data into
            

    Returns: pandas data frame where the value in the specified column is the maximum
    value in the interval into which the original data fell.
        ex: values range from 0 - 100, user specifies 10 buckets/intervals, [0-10],[11-20], etc.
            a row with the value 15 will fit into interval [11-20] and will thus be replaced
            with the number 20.
    '''
    index = df.columns.get_loc(column) + 1
    min = get_max_and_min(df, column)[0]
    max = get_max_and_min(df, column)[1]
    partition = round((max - min) / num_buckets)
    for i in range(num_buckets):
        for row in df.itertuples():
            if row[index] > partition * (i) and row[index] < partition * (i + 1):
                range_part = partition * (i + 1)
                df.set_value(row[0], column, range_part)
    return df

def make_dummies(df, column_list):
    '''
       Turns a categorical variable into a binary/dummy variable.

       Inputs:
               df: a pandas dataframe
               columns: the columns to make binary, as a list

       Returns: dataframe with columns added that correspond to the dummy vars and values
       '''

    dummies_df = pd.get_dummies(df, columns=column_list)
    return dummies_df


In [40]:
df = get_df("credit-data.csv")
#Here's our dataframe

In [41]:
#Time to fill in NaNs
#First find what columns have NaNs
find_nans(df)

['MonthlyIncome', 'NumberOfDependents']

In [42]:
#It makes sense to fill the NaNs in MonthlyIncome with the mean, since it's a continuous variable
new_df = fill_values(df, "MonthlyIncome")

In [43]:
#If someone didn't fill in the information for number of dependents, it probably means they don't have dependents. 
filled_df = fill_values(new_df, "NumberOfDependents", 0)
filled_df

Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,60644,2,0.802982,9120.00,13,0,6,0,2.0
1,2,0,0.957151,40,60637,0,0.121876,2600.00,4,0,0,0,1.0
2,3,0,0.658180,38,60601,1,0.085113,3042.00,2,1,0,0,0.0
3,4,0,0.233810,30,60601,0,0.036050,3300.00,5,0,0,0,0.0
4,5,0,0.907239,49,60625,1,0.024926,63588.00,7,0,1,0,0.0
5,6,0,0.213179,74,60629,0,0.375607,3500.00,3,0,1,0,1.0
6,7,0,0.305682,57,60637,0,5710.000000,6670.22,8,0,3,0,0.0
7,8,0,0.754464,39,60625,0,0.209940,3500.00,8,0,0,0,0.0
8,9,0,0.116951,27,60804,0,46.000000,6670.22,2,0,0,0,0.0
9,10,0,0.189169,57,60629,0,0.606291,23684.00,9,0,4,0,2.0


In [44]:
#Let's check to make sure we filled all the NaNs
find_nans(filled_df)

[]

In [45]:
#Let's make income into a discrete instead of continuous variable column. Let's do three buckets.
discretize(filled_df, "MonthlyIncome", 3)

Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,60644,2,0.802982,1002917.0,13,0,6,0,2.0
1,2,0,0.957151,40,60637,0,0.121876,1002917.0,4,0,0,0,1.0
2,3,0,0.658180,38,60601,1,0.085113,1002917.0,2,1,0,0,0.0
3,4,0,0.233810,30,60601,0,0.036050,1002917.0,5,0,0,0,0.0
4,5,0,0.907239,49,60625,1,0.024926,1002917.0,7,0,1,0,0.0
5,6,0,0.213179,74,60629,0,0.375607,1002917.0,3,0,1,0,1.0
6,7,0,0.305682,57,60637,0,5710.000000,1002917.0,8,0,3,0,0.0
7,8,0,0.754464,39,60625,0,0.209940,1002917.0,8,0,0,0,0.0
8,9,0,0.116951,27,60804,0,46.000000,1002917.0,2,0,0,0,0.0
9,10,0,0.189169,57,60629,0,0.606291,1002917.0,9,0,4,0,2.0


In [46]:
#Let's make Number of Dependents into a dummy variable column. 


Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,60644,2,0.802982,1002917.0,13,0,6,0,2.0
1,2,0,0.957151,40,60637,0,0.121876,1002917.0,4,0,0,0,1.0
2,3,0,0.658180,38,60601,1,0.085113,1002917.0,2,1,0,0,0.0
3,4,0,0.233810,30,60601,0,0.036050,1002917.0,5,0,0,0,0.0
4,5,0,0.907239,49,60625,1,0.024926,1002917.0,7,0,1,0,0.0
5,6,0,0.213179,74,60629,0,0.375607,1002917.0,3,0,1,0,1.0
6,7,0,0.305682,57,60637,0,5710.000000,1002917.0,8,0,3,0,0.0
7,8,0,0.754464,39,60625,0,0.209940,1002917.0,8,0,0,0,0.0
8,9,0,0.116951,27,60804,0,46.000000,1002917.0,2,0,0,0,0.0
9,10,0,0.189169,57,60629,0,0.606291,1002917.0,9,0,4,0,2.0


In [47]:
make_dummies(filled_df, ["NumberOfDependents"])

Unnamed: 0,PersonID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,...,NumberOfDependents_3.0,NumberOfDependents_4.0,NumberOfDependents_5.0,NumberOfDependents_6.0,NumberOfDependents_7.0,NumberOfDependents_8.0,NumberOfDependents_9.0,NumberOfDependents_10.0,NumberOfDependents_13.0,NumberOfDependents_20.0
0,1,1,0.766127,45,60644,2,0.802982,1002917.0,13,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0.957151,40,60637,0,0.121876,1002917.0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0.658180,38,60601,1,0.085113,1002917.0,2,1,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0.233810,30,60601,0,0.036050,1002917.0,5,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0.907239,49,60625,1,0.024926,1002917.0,7,0,...,0,0,0,0,0,0,0,0,0,0
5,6,0,0.213179,74,60629,0,0.375607,1002917.0,3,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0.305682,57,60637,0,5710.000000,1002917.0,8,0,...,0,0,0,0,0,0,0,0,0,0
7,8,0,0.754464,39,60625,0,0.209940,1002917.0,8,0,...,0,0,0,0,0,0,0,0,0,0
8,9,0,0.116951,27,60804,0,46.000000,1002917.0,2,0,...,0,0,0,0,0,0,0,0,0,0
9,10,0,0.189169,57,60629,0,0.606291,1002917.0,9,0,...,0,0,0,0,0,0,0,0,0,0
