In [59]:
# Project-local module; supplies the get_iris_data / get_titanic_data / get_telco_data loaders used below.
import acquire

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split


# Seed NumPy's global RNG; the splits below also pass random_state=123 explicitly.
np.random.seed(123)

### Acquiring Iris Data

In [60]:
# Acquire the iris data through the project-local acquire module and preview the first rows.
iris_df = acquire.get_iris_data()
iris_df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


In [61]:
# Rename species_name -> species so the stratified split can reference a `species` column.
# NOTE(review): iris_df is re-acquired in a later cell and prep_iris() performs this same
# rename, so this cell looks redundant — confirm before removing.
iris_df = iris_df.rename(columns={'species_name' : 'species'})

## Prepare Iris Data

In [62]:
#prepare function to prep iris data
def prep_iris(iris_df):
    '''
    Prepare the raw iris dataframe for exploration and modeling.

    Drops the species_id column, renames species_name to species, and appends
    one dummy column per species value (species_<value>) to the right of the
    existing columns. The input frame is left untouched; a new frame is returned.
    '''
    # Drop the id column and normalize the label column name in a single chain.
    trimmed = (
        iris_df
        .drop(columns=['species_id'])
        .rename(columns={'species_name': 'species'})
    )

    # Encode species; every level is kept (no drop_first) and NaN gets no column.
    species_dummies = pd.get_dummies(trimmed[['species']], dummy_na=False)

    return pd.concat([trimmed, species_dummies], axis=1)

In [63]:
#bring in fresh iris data to test prep function
iris_df = acquire.get_iris_data()
iris_df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


In [64]:
# Run prep_iris() on the freshly loaded data and preview the prepared frame.
# NOTE: this rebinds iris_df, so re-running the cell without reloading fails —
# prep_iris() drops species_id, which is no longer present the second time.
iris_df = prep_iris(iris_df)
iris_df.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,setosa,5.1,3.5,1.4,0.2,1,0,0
1,setosa,4.9,3.0,1.4,0.2,1,0,0
2,setosa,4.7,3.2,1.3,0.2,1,0,0
3,setosa,4.6,3.1,1.5,0.2,1,0,0
4,setosa,5.0,3.6,1.4,0.2,1,0,0


In [65]:
# NOTE(review): `train` is first assigned in the split cell that follows, so on a fresh
# kernel (Restart & Run All) this cell raises NameError; it depends on state from an earlier run.
train.head()


Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
79,versicolor,5.7,2.6,3.5,1.0,0,1,0
36,setosa,5.5,3.5,1.3,0.2,1,0,0
133,virginica,6.3,2.8,5.1,1.5,0,0,1
95,versicolor,5.7,3.0,4.2,1.2,0,1,0
18,setosa,5.7,3.8,1.7,0.3,1,0,0


### Splitting Iris Data

In [66]:
# Two-stage split, stratified on species: hold out 20% as test, then take 30% of the
# remainder as validate (roughly 56% train / 24% validate / 20% test overall).
train, test = train_test_split(iris_df, test_size = .2, random_state=123, stratify=iris_df.species)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.species)

In [67]:
#split data function
def split_data(iris_df, target='species'):
    '''
    Split a dataframe into train, validate and test subsets, stratified on one column.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Frame to split; must contain the `target` column.
    target : str, default 'species'
        Name of the column to stratify on. Exposed as a parameter so the same
        helper can split frames whose label column has a different name.

    Returns
    -------
    tuple of (train, validate, test) dataframes
        20% of the rows go to test; the remaining 80% is split 70/30 into
        train and validate. random_state is fixed at 123 for both splits.
    '''
    train, test = train_test_split(
        iris_df, test_size=.2, random_state=123, stratify=iris_df[target]
    )
    train, validate = train_test_split(
        train, test_size=.3, random_state=123, stratify=train[target]
    )
    return train, validate, test

In [68]:
# Preview the iris training subset produced by the split above.
train.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
79,versicolor,5.7,2.6,3.5,1.0,0,1,0
36,setosa,5.5,3.5,1.3,0.2,1,0,0
133,virginica,6.3,2.8,5.1,1.5,0,0,1
95,versicolor,5.7,3.0,4.2,1.2,0,1,0
18,setosa,5.7,3.8,1.7,0.3,1,0,0


### Acquiring Titanic Data

In [74]:
# Load the Titanic data through the project-local acquire module and preview the first rows.
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


## Prepare Titanic Data

In [75]:
def prep_titanic(titanic_df):
    '''
    Clean the raw Titanic dataframe and return the prepared frame.

    Steps: drop duplicate rows; drop the deck, embarked, class and age columns;
    append dummy columns for sex and embark_town with the first level of each
    dropped.

    Splitting is intentionally not done here: the function returns a single
    dataframe, so split it separately afterwards.

    Parameters
    ----------
    titanic_df : pandas.DataFrame
        Raw Titanic data containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; the caller's frame is not modified.
    '''
    titanic_df = titanic_df.drop_duplicates()

    cols_to_drop = ['deck', 'embarked', 'class', 'age']
    titanic_df = titanic_df.drop(columns=cols_to_drop)

    # drop_first takes a single bool that applies to every encoded column.
    dummy_df = pd.get_dummies(
        titanic_df[['sex', 'embark_town']], dummy_na=False, drop_first=True
    )

    return pd.concat([titanic_df, dummy_df], axis=1)

In [76]:
# Reload the raw Titanic data so prep_titanic() starts from an unmodified frame.
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [77]:
# Apply prep_titanic() to the freshly loaded data and preview the prepared frame.
titanic_df = prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [78]:
# NOTE(review): no Titanic split has been assigned to `train` at this point — the split
# cell comes later. On a top-to-bottom run `train` still holds the iris split here, so the
# Titanic rows shown in the saved output depend on out-of-order execution.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0
165,165,1,3,male,0,2,20.525,Southampton,0,1,0,1
50,50,0,3,male,4,1,39.6875,Southampton,0,1,0,1
259,259,1,2,female,0,1,26.0,Southampton,0,0,0,1
306,306,1,1,female,0,0,110.8833,Cherbourg,1,0,0,0


### Splitting Titanic Data

In [79]:
#split data function
def split_data(titanic_df, target='survived'):
    '''
    Split a dataframe into train, validate and test subsets, stratified on one column.

    Parameters
    ----------
    titanic_df : pandas.DataFrame
        Frame to split; must contain the `target` column.
    target : str, default 'survived'
        Name of the column to stratify on. Exposed as a parameter so the same
        helper can split frames whose label column has a different name.

    Returns
    -------
    tuple of (train, validate, test) dataframes
        20% of the rows go to test; the remaining 80% is split 70/30 into
        train and validate. random_state is fixed at 123 for both splits.
    '''
    train, test = train_test_split(
        titanic_df, test_size=.2, random_state=123, stratify=titanic_df[target]
    )
    train, validate = train_test_split(
        train, test_size=.3, random_state=123, stratify=train[target]
    )
    return train, validate, test

In [80]:
# Split the prepared Titanic data with the split_data() helper defined in the previous cell
# (stratified on `survived`; 20% test, then 30% of the remainder as validate).
train, validate, test = split_data(titanic_df)

In [81]:
# Preview the Titanic training subset produced by the split above.
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0
165,165,1,3,male,0,2,20.525,Southampton,0,1,0,1
50,50,0,3,male,4,1,39.6875,Southampton,0,1,0,1
259,259,1,2,female,0,1,26.0,Southampton,0,0,0,1
306,306,1,1,female,0,0,110.8833,Cherbourg,1,0,0,0


### Acquiring Telco Data

In [91]:
# Load the telco data through the project-local acquire module and preview the first rows.
telco_df = acquire.get_telco_data()
telco_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


## Prepare Telco Data

In [92]:
def prep_telco(telco_df):
    '''
    Clean the raw telco dataframe and return the prepared frame.

    Steps: drop duplicate rows; strip whitespace from total_charges and drop
    rows where it is blank; cast total_charges to float; drop customer_id and
    the three *_type_id columns; append dummy columns for the non-binary
    categorical columns.

    Parameters
    ----------
    telco_df : pandas.DataFrame
        Raw telco data containing total_charges, the four dropped id columns
        and the ten categorical columns that get encoded. total_charges may be
        text (blank values stored as whitespace) or already numeric.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; the caller's frame is not modified.
    '''
    # Drop duplicates
    telco_df = telco_df.drop_duplicates()

    # Null charges are stored as whitespace-only strings. The .str accessor only
    # exists on text columns, so skip the strip when the column is already numeric.
    charges = telco_df['total_charges']
    if not pd.api.types.is_numeric_dtype(charges):
        charges = charges.str.strip()
    has_charge = charges != ''

    # Filter rows and convert the dtype in one step via .assign, so nothing is
    # ever written into a filtered slice of the frame.
    telco_df = telco_df.loc[has_charge].assign(
        total_charges=charges.loc[has_charge].astype(float)
    )

    # Drop identifier columns
    cols_to_drop = ['customer_id', 'payment_type_id', 'internet_service_type_id', 'contract_type_id']
    telco_df = telco_df.drop(columns=cols_to_drop)

    # Get dummies for non-binary categorical variables
    cols_to_encode = [
        'multiple_lines',
        'online_security',
        'online_backup',
        'device_protection',
        'tech_support',
        'streaming_tv',
        'streaming_movies',
        'contract_type',
        'internet_service_type',
        'payment_type',
    ]
    dummy_df = pd.get_dummies(telco_df[cols_to_encode], dummy_na=False)

    # Concatenate dummy dataframe to original
    telco_df = pd.concat([telco_df, dummy_df], axis=1)

    return telco_df

In [93]:
# Reload the raw telco data so prep_telco() starts from an unmodified frame.
telco_df = acquire.get_telco_data()
telco_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [94]:
# Apply prep_telco() to the freshly loaded data and preview the prepared frame.
telco_df = prep_telco(telco_df)
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,Female,0,Yes,Yes,65,Yes,Yes,Yes,Yes,Yes,...,0,0,1,1,0,0,0,0,0,1
1,Male,0,No,No,54,No,No phone service,Yes,No,No,...,0,0,1,1,0,0,0,1,0,0
2,Female,0,No,No,56,No,No phone service,Yes,Yes,Yes,...,0,0,1,1,0,0,1,0,0,0
3,Male,0,Yes,Yes,20,No,No phone service,Yes,No,Yes,...,0,0,1,1,0,0,0,1,0,0
4,Male,0,Yes,No,72,Yes,Yes,No,Yes,Yes,...,0,0,1,1,0,0,1,0,0,0


In [95]:
# NOTE(review): the telco split happens in a later cell, so on a top-to-bottom run `train`
# still holds the Titanic split here. The saved output shows raw telco columns (e.g.
# customer_id, payment_type_id) that prep_telco() drops, so it came from a different execution order.
train.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
5642,4,3,3,1982-FEBTD,Female,0,Yes,Yes,23,Yes,...,No internet service,No internet service,No internet service,Yes,25.6,514.75,No,Two year,,Credit card (automatic)
3338,3,2,2,8982-NHAVY,Male,0,No,No,27,Yes,...,No,Yes,Yes,Yes,100.5,2673.45,No,One year,Fiber optic,Bank transfer (automatic)
4273,1,2,1,4115-NZRKS,Female,1,No,No,7,Yes,...,No,Yes,Yes,Yes,89.15,574.35,No,Month-to-month,Fiber optic,Electronic check
2382,3,1,1,9633-DENPU,Female,0,Yes,No,47,Yes,...,No,Yes,No,Yes,58.6,2723.4,No,Month-to-month,DSL,Bank transfer (automatic)
2655,4,2,3,5693-PIPCS,Male,0,No,No,41,Yes,...,Yes,Yes,Yes,Yes,99.65,4220.35,No,Two year,Fiber optic,Credit card (automatic)


### Splitting Telco Data

In [96]:
#split data function
def split_data(telco_df, target='churn'):
    '''
    Split a dataframe into train, validate and test subsets, stratified on one column.

    Parameters
    ----------
    telco_df : pandas.DataFrame
        Frame to split; must contain the `target` column.
    target : str, default 'churn'
        Name of the column to stratify on. Exposed as a parameter so the same
        helper can split frames whose label column has a different name.

    Returns
    -------
    tuple of (train, validate, test) dataframes
        20% of the rows go to test; the remaining 80% is split 70/30 into
        train and validate. random_state is fixed at 123 for both splits.
    '''
    train, test = train_test_split(
        telco_df, test_size=.2, random_state=123, stratify=telco_df[target]
    )
    train, validate = train_test_split(
        train, test_size=.3, random_state=123, stratify=train[target]
    )
    return train, validate, test

In [97]:
# Split the prepared telco data with the split_data() helper defined in the previous cell
# (stratified on `churn`; 20% test, then 30% of the remainder as validate).
train, validate, test = split_data(telco_df)

In [98]:
# Preview the telco training subset produced by the split above.
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
6096,Female,0,Yes,No,70,Yes,No,No internet service,No internet service,No internet service,...,0,0,1,0,0,1,1,0,0,0
1603,Male,0,Yes,Yes,15,Yes,No,No,Yes,Yes,...,1,0,0,1,0,0,0,0,0,1
5350,Female,1,Yes,No,52,Yes,Yes,Yes,Yes,Yes,...,1,0,0,0,1,0,0,0,1,0
2068,Male,0,No,No,39,No,No phone service,No,No,No,...,1,0,0,1,0,0,0,0,1,0
6366,Female,0,Yes,No,32,Yes,No,No internet service,No internet service,No internet service,...,0,1,0,0,0,1,0,0,0,1
