# Classification Exercises

In [1]:
import pandas as pd
from pydataset import data
import split_scale

import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

import acquire
import prepare

## Data Acquisition

## 1. Use a python module (pydata or seaborn datasets) containing datasets as a source from the iris data. Create a pandas dataframe, df_iris, from this data.

In [2]:
df_iris = data("iris")

* print the first 3 rows

In [3]:
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


* print the number of rows and columns (shape)

In [4]:
df_iris.shape

(150, 5)

* print the column names

In [5]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

* print the data type of each column

In [6]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
Sepal.Length    150 non-null float64
Sepal.Width     150 non-null float64
Petal.Length    150 non-null float64
Petal.Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


* print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [7]:
df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## 2. Read Table1_CustDetails the excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel

In [8]:
df_excel = pd.read_excel("Spreadsheets_Exercises.xlsx")

* assign the first 100 rows to a new dataframe, df_excel_sample

In [9]:
df_excel_sample = df_excel.head(100)
df_excel_sample.shape

(100, 12)

* print the number of rows of your original dataframe

In [10]:
df_excel.shape

(7049, 12)

* print the first 5 column names

In [11]:
df_excel.columns[:5]

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')

* print the column names that have a data type of object 

In [12]:
df_excel.select_dtypes(include="object").columns

Index(['customer_id', 'gender', 'partner', 'dependents', 'payment_type',
       'churn'],
      dtype='object')

* compute the range for each of the numeric variables.

In [13]:
df_excel.select_dtypes(exclude="object").apply(lambda col: col.max() - col.min())

is_senior_citizen       1.0
phone_service           2.0
internet_service        2.0
contract_type           2.0
monthly_charges       100.5
total_charges        8666.0
dtype: float64

## 3. Read the data from this google sheet into a dataframe, df_google

In [14]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'    

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_google = pd.read_csv(csv_export_url)

print the first 3 rows

In [15]:
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


print the number of rows and columns

In [16]:
df_google.shape

(891, 12)

print the column names

In [17]:
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

print the data type of each column

In [18]:
df_google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


print the summary statistics for each of the numeric variables

In [19]:
df_google.select_dtypes(exclude="object").apply(lambda col: col.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


print the unique values for each of your categorical variables

In [20]:
df_google.select_dtypes(include="object").apply(lambda col: col.nunique(dropna = True))

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [21]:
for col in list(df_google.select_dtypes(include="object")):
    print(col)
    print('-------------')
    print(df_google[col].value_counts(dropna=False))
    print('-------------')

Name
-------------
Lam, Mr. Len                                           1
Guggenheim, Mr. Benjamin                               1
Arnold-Franchi, Mr. Josef                              1
Matthews, Mr. William John                             1
Goodwin, Miss. Lillian Amy                             1
                                                      ..
Vande Walle, Mr. Nestor Cyriel                         1
Louch, Mrs. Charles Alexander (Alice Adelaide Slow)    1
Lemore, Mrs. (Amelia Milley)                           1
Hold, Mr. Stephen                                      1
Wells, Miss. Joan                                      1
Name: Name, Length: 891, dtype: int64
-------------
Sex
-------------
male      577
female    314
Name: Sex, dtype: int64
-------------
Ticket
-------------
347082      7
CA. 2343    7
1601        7
347088      6
CA 2144     6
           ..
4579        1
349245      1
19947       1
364511      1
65304       1
Name: Ticket, Length: 681, dtype: int64
---

# Data Preparation Exercises

## 1 Use the function defined in acquire.py to load the iris data.

In [22]:
import acquire

In [23]:
iris = acquire.get_iris_data()

Drop the species_id and measurement_id columns.

In [24]:
iris = iris.drop(columns=["species_id", "measurement_id"])

Rename the species_name column to just species.

In [25]:
iris = iris.rename(columns = {"species_name" : "species"})

Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

The `inverse_transform` seems to be the opposite, and lets us reverse the effects of the encoder.

In [26]:
train, test = sklearn.model_selection.train_test_split(iris, random_state=123, train_size= .8)

In [27]:
le = LabelEncoder()

In [28]:
train['species'] = le.fit_transform(train.species)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
test['species'] = le.transform(test.species)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
# encoder = sklearn.preprocessing.OneHotEncoder()

# encoder.fit(train[["species"]])

# m = encoder.transform(train[["species"]]).todense()

# train = pd.concat([train, pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)], axis = 1).drop(columns="species")

# m = encoder.transform(test[["species"]]).todense()

# test = pd.concat([test, pd.DataFrame(m, columns=encoder.categories_[0], index=test.index)], axis = 1).drop(columns="species")

Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [31]:
def drop_columns(df):
    return df.drop(columns=[
        "species_id",
        "measurement_id"
        
    ])

def rename_columns(df):
    return df.rename(columns = {"species_name" : "species"})

def encode_iris(train, test):
    encoder = sklearn.preprocessing.OneHotEncoder()

    encoder.fit(train[["species"]])

    m = encoder.transform(train[["species"]]).todense()

    train = pd.concat([train, pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)], axis = 1).drop(columns="species")

    m = encoder.transform(test[["species"]]).todense()

    test = pd.concat([test, pd.DataFrame(m, columns=encoder.categories_[0], index=test.index)], axis = 1).drop(columns="species")
    
    return train, test

def prep_iris(df):
    df = drop_columns(df)
    df = rename_columns(df)
    train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size= .8)
    train, test = encode_iris(train, test)
    return train, test

## 2 Use the function you defined in acquire.py to load the titanic data set.

In [32]:
titanic = acquire.get_titanic_data()

In [33]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


Handle the missing values in the embark_town and embarked columns.

In [34]:
titanic.embark_town = titanic.embark_town.fillna(titanic.embark_town.value_counts().head(1).index[0])

In [35]:
titanic.embarked = titanic.embarked.fillna(titanic.embarked.value_counts().head(1).index[0])

Remove the deck column.

In [36]:
titanic = titanic.drop(columns="deck")

Use a label encoder to transform the embarked column.

In [37]:
train, test = sklearn.model_selection.train_test_split(titanic, random_state=123, train_size = .8)

In [38]:
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[["embarked"]])

m = encoder.transform(train[["embarked"]]).todense()

train = pd.concat([train, pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)], axis = 1).drop(columns="embarked")

m = encoder.transform(test[["embarked"]]).todense()

test = pd.concat([test, pd.DataFrame(m, columns=encoder.categories_[0], index=test.index)], axis = 1).drop(columns="embarked")


Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

This is beneficial because age and fare are now in a more comparable format with each other. 

This is not benefecial when the data is on the same scale, or there is only one X variable

In [39]:
X_train = train[["age", "fare"]]
X_test = test[["age", "fare"]]

scaler, train_scaled, test_scaled = split_scale.min_max_scaler(X_train, X_test)

In [40]:
train["age"] = train_scaled["age"]
train["fare"] = train_scaled["fare"]
test["age"] = test_scaled["age"]
test["fare"] = test_scaled["fare"]

Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.

In [41]:
test.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,C,Q,S
172,172,1,3,female,0.003891,1,1,0.044979,Third,Southampton,0,0.0,0.0,1.0
524,524,0,3,male,,0,0,0.029206,Third,Cherbourg,1,1.0,0.0,0.0
452,452,0,1,male,0.455253,0,0,0.112112,First,Cherbourg,1,1.0,0.0,0.0
170,170,0,1,male,0.937743,0,0,0.135342,First,Southampton,1,0.0,0.0,1.0
620,620,0,3,male,0.40856,1,0,0.058396,Third,Cherbourg,0,1.0,0.0,0.0


In [42]:
imputer = sklearn.impute.SimpleImputer(strategy = "mean")

imputer.fit(train[["age"]])

train.age = imputer.transform(train[["age"]])
test.age = imputer.transform(test[["age"]])

Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.