# Method Chain

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns

# Build the working frame in one chain: read the CSV, remove unneeded columns,
# relabel the remaining ones, then collapse grade-level values into two groups.
# NOTE(review): the column labels used below ('39', ' 77516', ' White', ...)
# look like values from a data row, which suggests the file has no real header
# line and its first record is being consumed as the header -- confirm.
df = (
    pd.read_csv('../data/raw/adult.data')
    # Remove these columns from the frame.
    .drop(
        columns=[
            ' State-gov',
            ' 77516',
            ' 13',
            ' 2174',
            ' 0',
            ' 40',
            'Unnamed: 0',
            ' Never-married',
            ' Adm-clerical',
            ' Not-in-family',
        ]
    )
    # Relabel the columns. The entries for ' State-gov', ' Adm-clerical',
    # ' Never-married' and ' Not-in-family' match nothing here because those
    # columns were removed by the drop step above.
    .rename(
        columns={
            ' State-gov': 'Work Class',
            '39': 'Age',
            ' <=50K': 'Income',
            ' White': 'Race',
            ' Adm-clerical': 'Occupation',
            ' Bachelors': 'Education Level',
            ' Never-married': 'Marital-Status',
            ' Male': 'Sex',
            ' Not-in-family': 'Relationship',
            ' United-States': 'Native-Country',
        }
    )
    # The matched values keep the leading space they have in the file.
    .replace(to_replace=[' 1st-4th', ' 5th-6th', ' 7th-8th'], value='Elementary')
    .replace(to_replace=[' 9th', ' 10th', ' 11th', ' 12th'], value='High School')
)

df

Unnamed: 0,Age,Education Level,Race,Sex,Native-Country,Income
0,50,Bachelors,White,Male,United-States,<=50K
1,38,HS-grad,White,Male,United-States,<=50K
2,53,High School,Black,Male,United-States,<=50K
3,28,Bachelors,Black,Female,Cuba,<=50K
4,37,Masters,White,Female,United-States,<=50K
...,...,...,...,...,...,...
32555,27,Assoc-acdm,White,Female,United-States,<=50K
32556,40,HS-grad,White,Male,United-States,>50K
32557,58,HS-grad,White,Female,United-States,<=50K
32558,22,HS-grad,White,Male,United-States,<=50K


# Wrapping Method chain in a function
Refer to your ProcessData work to decide how this function should be updated.

In [34]:
# Location of the data file, written relative to the notebook's working
# directory; it is the argument given to load_and_process at the end of this cell.
path='../data/raw/adult.data'
def load_and_process(path):
    """Load the CSV file at ``path`` and return a cleaned DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The columns that remain after the drop step, relabelled with readable
        names, with the listed grade-level values replaced by 'Elementary'
        or 'High School'.

    Raises
    ------
    KeyError
        Raised by ``DataFrame.drop`` if any of the columns listed for removal
        is missing from the file.
    """

    # Method Chain 1 (load the data, drop unneeded columns, relabel the rest)
    # NOTE(review): the labels below ('39', ' 77516', ' White', ...) look like
    # values from a data row, which suggests the file has no real header line
    # and its first record is being consumed as the header -- confirm.

    adult_selected = (
        # Read from the caller-supplied path; the previous version ignored the
        # argument and always opened a hard-coded file.
        pd.read_csv(path)
        .drop(
            columns=[
                ' State-gov', ' 77516', ' 13', ' 2174', ' 0', ' 40',
                'Unnamed: 0', ' Never-married', ' Adm-clerical', ' Not-in-family',
            ]
        )
        # The entries for ' State-gov', ' Adm-clerical', ' Never-married' and
        # ' Not-in-family' match nothing: those columns were just dropped.
        .rename(
            columns={
                ' State-gov': 'Work Class',
                '39': 'Age',
                ' <=50K': 'Income',
                ' White': 'Race',
                ' Adm-clerical': 'Occupation',
                ' Bachelors': 'Education Level',
                ' Never-married': 'Marital-Status',
                ' Male': 'Sex',
                ' Not-in-family': 'Relationship',
                ' United-States': 'Native-Country',
            }
        )
    )

    # Method Chain 2 (collapse grade-level values into two groups)
    # The matched values keep the leading space they have in the file.

    adult_processed = (
        adult_selected
        .replace([' 1st-4th', ' 5th-6th', ' 7th-8th'], 'Elementary')
        .replace([' 9th', ' 10th', ' 11th', ' 12th'], 'High School')
    )

    # Return the fully processed frame

    return adult_processed

# Call the function with the path defined at the top of this cell; as the
# cell's last expression, the returned DataFrame is what gets displayed.
load_and_process(path)

Unnamed: 0,Age,Education Level,Race,Sex,Native-Country,Income
0,50,Bachelors,White,Male,United-States,<=50K
1,38,HS-grad,White,Male,United-States,<=50K
2,53,High School,Black,Male,United-States,<=50K
3,28,Bachelors,Black,Female,Cuba,<=50K
4,37,Masters,White,Female,United-States,<=50K
...,...,...,...,...,...,...
32555,27,Assoc-acdm,White,Female,United-States,<=50K
32556,40,HS-grad,White,Male,United-States,>50K
32557,58,HS-grad,White,Female,United-States,<=50K
32558,22,HS-grad,White,Male,United-States,<=50K


# Importing `load_and_process` from the `project_functions` module

In [35]:
from scripts import project_functions # Absolute import of the module from the `scripts` package (a relative import would start with a dot, e.g. `from . import x`)
# Rebind `df` to the frame produced by the module's own load_and_process,
# then display it as the cell's last expression.
df = project_functions.load_and_process('../data/raw/adult.data')
df

Unnamed: 0,Education Level,Race,Sex,Native-Country,Income
0,Bachelors,White,Male,United-States,<=50K
1,HS-grad,White,Male,United-States,<=50K
2,High School,Black,Male,United-States,<=50K
3,Bachelors,Black,Female,Cuba,<=50K
4,Masters,White,Female,United-States,<=50K
...,...,...,...,...,...
32555,Assoc-acdm,White,Female,United-States,<=50K
32556,HS-grad,White,Male,United-States,>50K
32557,HS-grad,White,Female,United-States,<=50K
32558,HS-grad,White,Male,United-States,<=50K
