# Classification Exercises

## Data Acquisition

## 4. In a jupyter notebook, classification_exercises.ipynb, use a python module (pydata or seaborn datasets) containing datasets as a source from the iris data. Create a pandas dataframe, df_iris, from this data.


In [2]:
from pydataset import data
import pandas as pd

In [None]:
df_iris = data('iris')

* print the first 3 rows

In [None]:
print(df_iris.head(3))

* print the number of rows and columns (shape)

In [None]:
print(df_iris.shape)

* print the column names

In [None]:
print(df_iris.columns.values)

* print the data type of each column

In [None]:
print(df_iris.info())

* print the summary statistics for each of the numeric variables

In [None]:
print(df_iris.describe())

## 5. Read the data from this google sheet into a dataframe, df_google.

In [None]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_google = pd.read_csv(csv_export_url)


* print the first 3 rows

In [None]:
print(df_google.head(3))

* print the number of rows and columns (shape)

In [None]:
print(df_google.shape)

* print the column names

In [None]:
print(df_google.columns.values)

* print the data type of each column

In [None]:
print(df_google.info())

* print the summary statistics for each of the numeric variables

In [None]:
print(df_google.describe())

## 6. Download the previous exercise's file into an excel (File → Download → Microsoft Excel). Read the downloaded file into a dataframe named df_excel.

In [None]:
df_excel = pd.read_excel('train.xlsx')

* assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
df_excel_sample = df_excel.head(100)
df_excel_sample

* print the number of rows of your original dataframe

In [None]:
print(len(df_excel))

* print the first 5 column names

In [None]:
print(df_excel.columns.values[:5])

* print the column names that have a data type of object

In [None]:
print(df_excel.select_dtypes(include='object').columns)

* compute the range for each of the numeric variables.

In [None]:
df_range = (df_excel.select_dtypes(exclude='object').max()) - (df_excel.select_dtypes(exclude='object').min())
print(df_range)

# Data Preparation

In [5]:
import acquire
import pandas as pd

## Using the Iris Data

### 1. Use the function defined in acquire.py to load the iris data.

In [3]:
iris_df = acquire.get_iris_data()
iris_df

Unnamed: 0.1,Unnamed: 0,species_id,species_name,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id.1
0,0,1,setosa,1,5.1,3.5,1.4,0.2,1
1,1,1,setosa,2,4.9,3.0,1.4,0.2,1
2,2,1,setosa,3,4.7,3.2,1.3,0.2,1
3,3,1,setosa,4,4.6,3.1,1.5,0.2,1
4,4,1,setosa,5,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...,...,...,...,...
145,145,3,virginica,146,6.7,3.0,5.2,2.3,3
146,146,3,virginica,147,6.3,2.5,5.0,1.9,3
147,147,3,virginica,148,6.5,3.0,5.2,2.0,3
148,148,3,virginica,149,6.2,3.4,5.4,2.3,3


### 2. Drop the species_id and measurement_id columns.

In [5]:
dropped_col = ['species_id', 'measurement_id']
iris_df = iris_df.drop(columns=dropped_col)
iris_df

Unnamed: 0.1,Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width,species_id.1
0,0,setosa,5.1,3.5,1.4,0.2,1
1,1,setosa,4.9,3.0,1.4,0.2,1
2,2,setosa,4.7,3.2,1.3,0.2,1
3,3,setosa,4.6,3.1,1.5,0.2,1
4,4,setosa,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...,...,...
145,145,virginica,6.7,3.0,5.2,2.3,3
146,146,virginica,6.3,2.5,5.0,1.9,3
147,147,virginica,6.5,3.0,5.2,2.0,3
148,148,virginica,6.2,3.4,5.4,2.3,3


### 3. Rename the species_name column to just species.

In [7]:
iris_df = iris_df.rename(columns={'species_name':'species'})
iris_df

Unnamed: 0.1,Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_id.1
0,0,setosa,5.1,3.5,1.4,0.2,1
1,1,setosa,4.9,3.0,1.4,0.2,1
2,2,setosa,4.7,3.2,1.3,0.2,1
3,3,setosa,4.6,3.1,1.5,0.2,1
4,4,setosa,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...,...,...
145,145,virginica,6.7,3.0,5.2,2.3,3
146,146,virginica,6.3,2.5,5.0,1.9,3
147,147,virginica,6.5,3.0,5.2,2.0,3
148,148,virginica,6.2,3.4,5.4,2.3,3


### 4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [11]:
dummy_df = pd.get_dummies(iris_df.species, dummy_na=False, drop_first=True)
iris_df = pd.concat([iris_df, dummy_df], axis=1)
iris_df

Unnamed: 0.1,Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_id.1,versicolor,virginica,versicolor.1,virginica.1
0,0,setosa,5.1,3.5,1.4,0.2,1,0,0,0,0
1,1,setosa,4.9,3.0,1.4,0.2,1,0,0,0,0
2,2,setosa,4.7,3.2,1.3,0.2,1,0,0,0,0
3,3,setosa,4.6,3.1,1.5,0.2,1,0,0,0,0
4,4,setosa,5.0,3.6,1.4,0.2,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
145,145,virginica,6.7,3.0,5.2,2.3,3,0,1,0,1
146,146,virginica,6.3,2.5,5.0,1.9,3,0,1,0,1
147,147,virginica,6.5,3.0,5.2,2.0,3,0,1,0,1
148,148,virginica,6.2,3.4,5.4,2.3,3,0,1,0,1


## 5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [12]:
def prep_iris(df):
    '''This function will drop duplicate observations, drop 'species_id' and 'measurement_id',
        rename the 'species_name' column to 'species', and create dummy variables for species. '''
    
    df = df.drop_duplicates()
    df = df.drop(columns=['species_id', 'measurement_id'])
    df = df.rename(columns={'species_name':'species'})
    dummy_df = pd.get_dummies(iris_df.species, dummy_na=False, drop_first=True)
    df = pd.concat([iris_df, dummy_df], axis=1)
    return df

In [14]:
fresh_iris = acquire.get_iris_data()
prep_iris(fresh_iris)

Unnamed: 0.1,Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_id.1,versicolor,virginica,versicolor.1,virginica.1,versicolor.2,virginica.2
0,0,setosa,5.1,3.5,1.4,0.2,1,0,0,0,0,0,0
1,1,setosa,4.9,3.0,1.4,0.2,1,0,0,0,0,0,0
2,2,setosa,4.7,3.2,1.3,0.2,1,0,0,0,0,0,0
3,3,setosa,4.6,3.1,1.5,0.2,1,0,0,0,0,0,0
4,4,setosa,5.0,3.6,1.4,0.2,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,145,virginica,6.7,3.0,5.2,2.3,3,0,1,0,1,0,1
146,146,virginica,6.3,2.5,5.0,1.9,3,0,1,0,1,0,1
147,147,virginica,6.5,3.0,5.2,2.0,3,0,1,0,1,0,1
148,148,virginica,6.2,3.4,5.4,2.3,3,0,1,0,1,0,1


## Using the Titanic Dataset

### 1. Use the function defined in acquire.py to load the Titanic data.

In [17]:
titanic_df = acquire.get_titanic_data()
titanic_df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


## 2. Drop any unnecessary, unhelpful, or duplicated columns.

In [23]:
titanic_df = titanic_df.drop(columns=['deck', 'class', 'embarked', 'age'])
titanic_df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,0,0,3,male,1,0,7.2500,Southampton,0
1,1,1,1,1,female,1,0,71.2833,Cherbourg,0
2,2,2,1,3,female,0,0,7.9250,Southampton,1
3,3,3,1,1,female,1,0,53.1000,Southampton,0
4,4,4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,0,0,13.0000,Southampton,1
887,887,887,1,1,female,0,0,30.0000,Southampton,1
888,888,888,0,3,female,1,2,23.4500,Southampton,0
889,889,889,1,1,male,0,0,30.0000,Cherbourg,1


## 3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [26]:
dummy_df = pd.get_dummies(titanic_df[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
titanic_df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


## 4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [29]:
def prep_titanic(df):
    '''
    This function will drop any duplicate observations, 
    drop ['deck', 'embarked', 'class', 'age'],
    and create dummy vars from sex and embark_town. 
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['deck', 'embarked', 'class', 'age'])
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

fresh_df = acquire.get_titanic_data()
prep_titanic(fresh_df)

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


## Using Telco Data

## 1. Use the function defined in acquire.py to load the Telco data.

In [16]:
telco_df = acquire.get_telco_data()
telco_df

Unnamed: 0.1,Unnamed: 0,internet_service_type_id,contract_type_id,payment_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,0,1,2,2,0002-ORFBO,Female,0,Yes,Yes,9,...,Yes,Yes,No,Yes,65.60,593.3,No,Mailed check,One year,DSL
1,1,1,1,2,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.90,542.4,No,Mailed check,Month-to-month,DSL
2,2,2,1,1,0004-TLHLJ,Male,0,No,No,4,...,No,No,No,Yes,73.90,280.85,Yes,Electronic check,Month-to-month,Fiber optic
3,3,2,1,1,0011-IGKFF,Male,1,Yes,No,13,...,No,Yes,Yes,Yes,98.00,1237.85,Yes,Electronic check,Month-to-month,Fiber optic
4,4,2,1,2,0013-EXCHZ,Female,1,Yes,No,3,...,Yes,Yes,No,Yes,83.90,267.4,Yes,Mailed check,Month-to-month,Fiber optic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,1,2,2,9987-LUTYD,Female,0,No,No,13,...,Yes,No,No,No,55.15,742.9,No,Mailed check,One year,DSL
7039,7039,2,1,1,9992-RRAMN,Male,0,Yes,No,22,...,No,No,Yes,Yes,85.10,1873.7,Yes,Electronic check,Month-to-month,Fiber optic
7040,7040,1,1,2,9992-UJOEL,Male,0,No,No,2,...,No,No,No,Yes,50.30,92.75,No,Mailed check,Month-to-month,DSL
7041,7041,1,3,2,9993-LHIEB,Male,0,Yes,Yes,67,...,Yes,No,Yes,No,67.85,4627.65,No,Mailed check,Two year,DSL


## 2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [17]:
telco_df = telco_df.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])
telco_df

Unnamed: 0.1,Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,...,Yes,Yes,No,Yes,65.60,593.3,No,Mailed check,One year,DSL
1,1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,...,No,No,Yes,No,59.90,542.4,No,Mailed check,Month-to-month,DSL
2,2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,...,No,No,No,Yes,73.90,280.85,Yes,Electronic check,Month-to-month,Fiber optic
3,3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,...,No,Yes,Yes,Yes,98.00,1237.85,Yes,Electronic check,Month-to-month,Fiber optic
4,4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,...,Yes,Yes,No,Yes,83.90,267.4,Yes,Mailed check,Month-to-month,Fiber optic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,...,Yes,No,No,No,55.15,742.9,No,Mailed check,One year,DSL
7039,7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,...,No,No,Yes,Yes,85.10,1873.7,Yes,Electronic check,Month-to-month,Fiber optic
7040,7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,...,No,No,No,Yes,50.30,92.75,No,Mailed check,Month-to-month,DSL
7041,7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,...,Yes,No,Yes,No,67.85,4627.65,No,Mailed check,Two year,DSL


## 3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [18]:
dummy_df = pd.get_dummies(telco_df[['gender', 'partner', 'dependents', 'phone_service','multiple_lines', 'online_security', 'online_backup','device_protection', 'tech_support', 'streaming_tv', 'streaming_movies','paperless_billing', 'churn', 'payment_type','contract_type', 'internet_service_type']], dummy_na=False, drop_first=True)
telco_df = pd.concat([telco_df, dummy_df], axis=1)
telco_df

Unnamed: 0.1,Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None
0,0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,...,0,1,0,0,0,1,1,0,0,0
1,1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,...,1,0,0,0,0,1,0,0,0,0
2,2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,...,0,1,1,0,1,0,0,0,1,0
3,3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,...,1,1,1,0,1,0,0,0,1,0
4,4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,...,0,1,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,...,0,0,0,0,0,1,1,0,0,0
7039,7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,...,1,1,1,0,1,0,0,0,1,0
7040,7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,...,0,1,0,0,0,1,0,0,0,0
7041,7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,...,1,0,0,0,0,1,0,1,0,0


## 4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [19]:
def prep_telco(df):
    '''This function removes duplicates, 
       removes the columns 'internet_service_type_id', 'contract_type_id', 'payment_type_id',
       and creates dummy variables for 'gender', 'partner', 'dependents', 'phone_service',
       'multiple_lines', 'online_security', 'online_backup','device_protection', 'tech_support', 'streaming_tv',
       'streaming_movies','paperless_billing', 'churn', 'payment_type','contract_type', 'internet_service_type'.'''
    df = df.drop_duplicates()
    df = df.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])
    dummy_df = pd.get_dummies(telco_df[['gender', 'partner', 'dependents', 'phone_service','multiple_lines', 'online_security', 'online_backup','device_protection', 'tech_support', 'streaming_tv', 'streaming_movies','paperless_billing', 'churn', 'payment_type','contract_type', 'internet_service_type']], dummy_na=False, drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

fresh_df = acquire.get_telco_data()
prep_telco(fresh_df)

Unnamed: 0.1,Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None
0,0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,...,0,1,0,0,0,1,1,0,0,0
1,1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,...,1,0,0,0,0,1,0,0,0,0
2,2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,...,0,1,1,0,1,0,0,0,1,0
3,3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,...,1,1,1,0,1,0,0,0,1,0
4,4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,...,0,1,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,...,0,0,0,0,0,1,1,0,0,0
7039,7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,...,1,1,1,0,1,0,0,0,1,0
7040,7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,...,0,1,0,0,0,1,0,0,0,0
7041,7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,...,1,0,0,0,0,1,0,1,0,0


## Split Data

In [21]:
from sklearn.model_selection import train_test_split
def split_data(df):
    '''This function takes a dataframe and returns the train, validate, and test dataframes.'''
    
    train_validate, test = train_test_split(df, test_size = .2, random_state=311)
    train, validate = train_test_split(train_validate, test_size = .3, random_state=311)
    
    return train, validate, test

## 1. Test split on iris data.

In [60]:
train, validate, test = split_data(iris_df)

In [61]:
# Validate my split.

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')


train -> (84, 11)
validate -> (36, 11)
test -> (30, 11)


## 2. Test on titanic data.

In [62]:
train, validate, test = split_data(titanic_df)

In [63]:
# Validate my split.

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (498, 13)
validate -> (214, 13)
test -> (179, 13)


## 3. Test on telco data.

In [22]:
train, validate, test = split_data(telco_df)

In [23]:
# Validate my split.

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (3943, 49)
validate -> (1691, 49)
test -> (1409, 49)
