In [1]:
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pydataset import data
from scipy import stats

import acquire

# Data Acquisition Exercises

### Exercise 1
- Use a python module containing datasets (pydataset or seaborn datasets) as the source of the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
# Load seaborn's bundled "iris" example dataset by name.
# Per the seaborn docs, load_dataset already returns a pandas DataFrame, so the
# pd.DataFrame(...) call below only re-wraps it under the name the exercise asks for.
iris = sns.load_dataset('iris')
df_iris = pd.DataFrame(iris)

- A) print the first 3 rows

In [None]:
df_iris.head(3)

- B) print the number of rows and columns (shape)

In [None]:
df_iris.shape

- C) print the column names

In [None]:
df_iris.columns

# Alternative: df_iris.columns.to_list() gives the same names as a plain Python list

- D) print the data type of each column

In [None]:
df_iris.info()
# Alternative: df_iris.dtypes  (dtypes is an attribute, not a method, so no parentheses)

- E) print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [None]:
df_iris.describe().T

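No, I would not recommend rescaling the data based on these statistics: all four measurements are recorded in the same unit and fall within similar, small ranges, so no single variable dominates the others by scale.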

### Exercise 2
- Read the Table1_CustDetails table from the Excel_Exercises.xlsx file (saved locally here as Spreadsheets_Exercises.xlsx) into a dataframe named df_excel.

In [None]:
df_excel = pd.read_excel('Spreadsheets_Exercises.xlsx', sheet_name = 'Table1_CustDetails')

In [None]:
df_excel.info()

- A) assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
df_excel_sample = df_excel.head(100)

In [None]:
df_excel_sample

- B) print the number of rows of your original dataframe

In [None]:
df_excel.shape[0]

- C) print the first 5 column names

In [None]:
df_excel.columns[:5]

- D) Print the column names that have a data type of object.

In [None]:
# Filter the per-column dtype Series down to the entries whose dtype is object;
# the index of the result holds the matching column names.
df_excel.dtypes.loc[lambda column_dtypes: column_dtypes == object]

- E) compute the range for each of the numeric variables.

In [None]:
# One row per numeric column of df_excel, with an extra 'range' column
# computed as the column's max minus its min.
numeric_stats = (
    df_excel
    .describe()
    .T
    .assign(range = lambda summary: summary['max'] - summary['min'])
)
numeric_stats

### Exercise 3

- Read the data from this google sheet into a dataframe, df_google

In [None]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

In [None]:
# Turn the sheet's browser "edit" link into a CSV export link by swapping the URL suffix,
# keeping the same gid so the same tab of the spreadsheet is exported.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

In [None]:
df_google = pd.read_csv(csv_export_url)

- A) print the first 3 rows

In [None]:
df_google.head(3)

- B) print the number of rows and columns

In [None]:
df_google.shape

- C) print the column names

In [None]:
df_google.columns

- D) print the data type of each column

In [None]:
df_google.info()

#df_google.dtypes

- E) print the summary statistics for each of the numeric variables

In [None]:
df_google.describe().T

- F) print the unique values for each of your categorical variables


In [None]:
# Number of distinct values in each object-dtype column (a count per column,
# not the values themselves).
df_google.select_dtypes(object).nunique()

In [None]:
df_google.Sex.value_counts()

In [None]:
df_google.Embarked.value_counts()

# Data Preparation Exercises

### Exercise 1: Iris Data
- A) Use the function defined in acquire.py to load the iris data.

In [None]:
import acquire

In [4]:
iris_df = acquire.get_iris_data()
iris_df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


- B) Drop the species_id and measurement_id columns.

In [None]:
# Columns the exercise asks to remove. errors = 'ignore' skips any that are absent
# (the frame displayed above has no measurement_id column) and keeps this cell safe
# to re-run even though it reassigns iris_df.
cols_to_drop = ['species_id', 'measurement_id']

iris_df = iris_df.drop(columns = cols_to_drop, errors = 'ignore')
iris_df.head()

- C) Rename the species_name column to just species.

In [None]:
iris_df = iris_df.rename(columns = {'species_name': 'species'})
iris_df.head()

- D) Create dummy variables of the species name.

In [None]:
# Build one indicator column per distinct value in the species column.
dummy_df = pd.get_dummies(iris_df['species'])
# Append the indicator columns next to the existing ones (axis = 1 concatenates column-wise).
# NOTE: iris_df is reassigned here, so re-running this cell appends the same columns again.
iris_df = pd.concat([iris_df, dummy_df], axis = 1)
iris_df.head()

- E) Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
def prep_iris(df):
    """Return a prepared copy of the untransformed iris data.

    Steps applied, in order:
      1. Drop the species_id and measurement_id columns (whichever are present).
      2. Rename species_name to species.
      3. Append one dummy (indicator) column per distinct species value.

    Parameters
    ----------
    df : pandas.DataFrame
        Untransformed iris data containing a species_name column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied; the input is not modified.
    """
    cols_to_drop = ['species_id', 'measurement_id']
    # errors = 'ignore' lets the same function handle sources that lack one of the id columns.
    df = df.drop(columns = cols_to_drop, errors = 'ignore')
    df = df.rename(columns = {'species_name': 'species'})

    dummy_df = pd.get_dummies(df['species'])
    # axis = 1 places the dummy columns to the right of the existing columns.
    df = pd.concat([df, dummy_df], axis = 1)
    return df

In [None]:
# prep_iris expects the untransformed data. iris_df has already been transformed by the
# cells above (species_id dropped, species_name renamed, dummies appended), so passing it
# would fail on the missing species_id column; load a fresh copy instead.
prep_iris(acquire.get_iris_data()).head()

### Exercise 2: Titanic Data
- A) Use the function you defined in acquire.py to load the titanic data set.

In [2]:
import acquire

In [3]:
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


- B) Handle the missing values in the embark_town and embarked columns.

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,,,,,,,,,,,,,
887,,,,,,,,,,,,,
888,,,,,,,,,,,,,
889,,,,,,,,,,,,,
