# In [None]:
import env
import acquire as acq
import pandas as pd
import matplotlib as plt
import os

def prep_iris(df=None):
    '''
    Prepare the iris dataset for exploration and modeling.

    Drops the species_id column, renames species_name to species, and
    appends dummy (indicator) columns for species with the first
    category dropped. This function does not split the data.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw iris data containing species_id and species_name columns.
        When omitted, the data is loaded with acq.get_iris_data().

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the original columns (minus species_id,
        with species_name renamed) followed by the species dummy
        columns. The input frame is not modified.
    '''
    if df is None:
        df = acq.get_iris_data()

    # Non-inplace drop/rename so a caller-supplied frame is left untouched.
    iris_df = (
        df.drop(columns=['species_id'])
        .rename(columns={'species_name': 'species'})
    )
    dummy_df = pd.get_dummies(iris_df['species'], dummy_na=False, drop_first=True)
    return pd.concat([iris_df, dummy_df], axis=1)


def prep_titanic(df):
    '''
    Clean a raw titanic DataFrame and append encoded columns.

    Steps, in order:
      1. remove duplicate observations
      2. drop the 'deck', 'embarked', 'class', and 'age' columns
      3. fill missing embark_town values with 'Southampton'
      4. append dummy columns for sex and embark_town (first category
         of each dropped)

    Returns a new DataFrame; the original sex and embark_town columns
    are kept alongside their dummy columns.
    '''
    unwanted = ['deck', 'embarked', 'class', 'age']
    cleaned = (
        df.drop_duplicates()
        .drop(columns=unwanted)
        .assign(
            embark_town=lambda frame: frame['embark_town'].fillna(value='Southampton')
        )
    )
    encoded = pd.get_dummies(cleaned[['sex', 'embark_town']], drop_first=True)
    return pd.concat([cleaned, encoded], axis=1)



def split_data(df, stratify_target='target_col_name', *, test_size=.2,
               validate_size=.3, random_state=9751):
    '''
    Split a DataFrame into train, validate, and test DataFrames,
    stratifying both splits on the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    stratify_target : str
        Name of the column to stratify on. The default value is only a
        placeholder; pass the real target column name.
    test_size : float, keyword-only
        Fraction of df held out as the test set (default .2).
    validate_size : float, keyword-only
        Fraction of the remaining train+validate rows held out as the
        validate set (default .3). With the defaults the result is
        roughly 56% train, 24% validate, 20% test.
    random_state : int, keyword-only
        Seed passed to both splits so results are reproducible.

    Returns
    -------
    (train, validate, test) : tuple of pandas.DataFrame

    NOTE(review): train_test_split is not imported in the visible part
    of this file; presumably sklearn.model_selection.train_test_split.
    Confirm the import exists before calling this function.
    '''
    # First carve off the test set from the full data.
    train_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[stratify_target],
    )
    # Then split what remains into train and validate.
    train, validate = train_test_split(
        train_validate,
        test_size=validate_size,
        random_state=random_state,
        stratify=train_validate[stratify_target],
    )
    return train, validate, test


# Useful Prep Code
# Narrow the DataFrame down to just the object-dtype columns.
df.select_dtypes(include='object').head()
# It looks like embarked and embark_town carry the same information;
# let's see if that's accurate.

# What about those nulls?
# Using boolean masking -> which columns are missing values, and how many?
null_counts = df.isna().sum()  # computed once and reused below
null_counts[null_counts > 0]
null_counts[null_counts > 0] / len(df)  # as a fraction of all rows

# We can fill the null values in embark_town with the most common
# value (Southampton) by using fillna().
# Reassign df['embark_town'] instead of calling fillna(inplace=True) on
# df.embark_town: inplace=True makes the call RETURN None, and with pandas
# Copy-on-Write the chained in-place call only changes a temporary Series,
# leaving df itself untouched.
df['embark_town'] = df['embark_town'].fillna('Southampton')

# Examining the distributions.
# hist/title/show live in matplotlib.pyplot, not on the top-level
# matplotlib package, so bind plt to pyplot before plotting.
import matplotlib.pyplot as plt

# For every column in df that is not object-dtype, draw a histogram.
for col in df.select_dtypes(exclude='object').columns:
    plt.hist(df[col])
    plt.title(f'Distribution of {col} on the Titanic')
    plt.show()

# Check out distributions of the int64 columns only.
num_cols = df.select_dtypes(include='int64').columns
for col in num_cols:
    plt.hist(df[col])
    plt.title(col)
    plt.show()

# Print raw and normalized frequency counts for each object column.

obj_cols = df.select_dtypes(include='object').columns
for col in obj_cols:
    print(df[col].value_counts())
    print(df[col].value_counts(normalize=True, dropna=False))
    print('----------------------')

# Create bins for fare using .value_counts.
# Using sort=False orders the result by bin value rather than by frequency count.

df.fare.value_counts(bins=5, sort=False)

# Find columns with missing values and the total of missing values.

missing = df.isnull().sum()
missing[missing > 0]

# Drop duplicates...run just in case; reassign and check the shape of my data.

df = df.drop_duplicates()
df.shape

# Drop columns with too many missing values for now and reassign; check the shape of my data.

cols_to_drop = ['deck', 'embarked', 'class', 'age']
df = df.drop(columns=cols_to_drop)
df.shape

# Fill missing embark_town values with 'Southampton' and reassign the column.

df['embark_town'] = df.embark_town.fillna(value='Southampton')
# Validate that missing values in embark_town have been handled.

df.embark_town.isna().sum()

# Using drop_first leaves sex_male, embark_town_Queenstown, and embark_town_Southampton.
# drop_first takes a single bool that applies to every encoded column.

dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na=False, drop_first=True)
dummy_df.head()

# Concatenate the dummy_df dataframe above with the original df and validate.

df = pd.concat([df, dummy_df], axis=1)
df.head(1)


        
# Should I do this on the full dataset or on the train sample?
# "This" = the action, method, function, or step you are about to take on your data.

'''
Are you comparing variables, or looking at relationships, summary stats,
or visualizations involving 2+ variables?
Are you using an sklearn method?
Are you moving into the explore stage of the pipeline?
If ONE or more of these is yes, then you should be doing it on your train sample.
If ALL are no, then the entire dataset is fine.
'''
