In [6]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from pydataset import data
import seaborn as sns
import numpy as np

# Data Acquisition Exercises

### Exercise 1
- Use a python module (pydataset or seaborn datasets) containing datasets as a source for the iris data. Create a pandas dataframe, df_iris, from this data.

In [None]:
iris = sns.load_dataset('iris')
df_iris = pd.DataFrame(iris)

- A) print the first 3 rows

In [None]:
df_iris.head(3)

- B) print the number of rows and columns (shape)

In [None]:
df_iris.shape

- C) print the column names

In [None]:
df_iris.columns

#df_iris.columns.to_list()

- D) print the data type of each column

In [None]:
#info() lists every column together with its non-null count and dtype
df_iris.info()
#alternative: df_iris.dtypes  (dtypes is an attribute -- writing dtypes() raises a TypeError)

- E) print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [None]:
df_iris.describe().T

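No, I would not recommend rescaling the data based on these statistics: all four measurements are on the same scale (centimeters, with values between roughly 0 and 8), so no single variable dominates the others.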

### Exercise 2
- Read the Table1_CustDetails table from the Excel_Exercises.xlsx file into a dataframe named df_excel.

In [None]:
df_excel = pd.read_excel('Spreadsheets_Exercises.xlsx', sheet_name = 'Table1_CustDetails')

In [None]:
df_excel.info()

- A) assign the first 100 rows to a new dataframe, df_excel_sample

In [None]:
df_excel_sample = df_excel.head(100)

In [None]:
df_excel_sample

- B) print the number of rows of your original dataframe

In [None]:
df_excel.shape[0]

- C) print the first 5 column names

In [None]:
df_excel.columns[:5]

- D) Print the column names that have a data type of object.

In [None]:
df_excel.dtypes[df_excel.dtypes == object]

- E) compute the range for each of the numeric variables.

In [None]:
numeric_stats = df_excel.describe().T
numeric_stats['range'] = numeric_stats['max'] - numeric_stats['min']
numeric_stats

### Exercise 3

- Read the data from this google sheet into a dataframe, df_google

In [None]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

In [None]:
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

In [None]:
df_google = pd.read_csv(csv_export_url)

- A) print the first 3 rows

In [None]:
df_google.head(3)

- B) print the number of rows and columns

In [None]:
df_google.shape

- C) print the column names

In [None]:
df_google.columns

- D) print the data type of each column

In [None]:
df_google.info()

#df_google.dtypes

- E) print the summary statistics for each of the numeric variables

In [None]:
df_google.describe().T

- F) print the unique values for each of your categorical variables


In [None]:
df_google.select_dtypes(object).nunique()

In [None]:
df_google.Sex.value_counts()

In [None]:
df_google.Embarked.value_counts()

Make a new python module, acquire.py, to hold the following data acquisition functions:

Make a function named get_titanic_data that returns the titanic data from the codeup data science database as a pandas data frame. Obtain your data from the Codeup Data Science Database.

In [None]:
from env import host, user, password
import os

In [None]:
def get_connection(db, user = user, host = host, password = password):
    """
    Build the connection URL string for the database named `db`.

    `user`, `host` and `password` default to the values imported from env.
    Returns a string of the form mysql+pymysql://<user>:<password>@<host>/<db>.
    The pieces are interpolated as-is; no URL escaping is applied to them.
    """
    credentials = f'{user}:{password}'
    location = f'{host}/{db}'
    return f'mysql+pymysql://{credentials}@{location}'

In [None]:
def new_titanic_data():
    """
    Read every row of the passengers table from titanic_db into a dataframe.

    Side effect: the dataframe is also written to titanic.csv in the current
    working directory (index included), overwriting any existing file.
    Returns the dataframe.
    """
    connection_url = get_connection('titanic_db')
    passengers = pd.read_sql('SELECT * FROM passengers', connection_url)
    #written with the index, so readers of this file should pass index_col=0
    passengers.to_csv('titanic.csv')
    return passengers

In [None]:
new_titanic_data()

Make a function named get_iris_data that returns the data from the iris_db on the codeup data science database as a pandas data frame. The returned data frame should include the actual name of the species in addition to the species_ids. Obtain your data from the Codeup Data Science Database.

In [None]:
def new_iris_data():
    """
    Read the measurements table joined to the species table (on species_id)
    from iris_db into a dataframe.

    Side effect: the dataframe is also written to iris.csv in the current
    working directory (index included), overwriting any existing file.
    Returns the dataframe.
    """
    connection_url = get_connection('iris_db')
    measurements = pd.read_sql(
        'SELECT * FROM measurements AS m JOIN species USING (species_id)',
        connection_url)
    #written with the index, so readers of this file should pass index_col=0
    measurements.to_csv('iris.csv')
    return measurements

In [None]:
new_iris_data()

Once you've got your get_titanic_data and get_iris_data functions written, now it's time to add caching to them. To do this, edit the beginning of the function to check for a local filename like titanic.csv or iris.csv. If they exist, use the .csv file. If the file doesn't exist, then produce the SQL and pandas necessary to create a dataframe, then write the dataframe to a .csv file with the appropriate name.

In [None]:
def get_titanic_data(cached=False):
    """
    Return the titanic data as a dataframe, using titanic.csv when possible.

    With the default cached=False, titanic.csv is read (first column as the
    index) if it exists in the current working directory; otherwise
    new_titanic_data() is called.

    NOTE(review): despite its name, a truthy `cached` skips the CSV and always
    calls new_titanic_data() -- confirm this is the intended meaning.
    """
    if not cached and os.path.isfile('titanic.csv'):
        return pd.read_csv('titanic.csv', index_col=0)
    return new_titanic_data()

In [None]:
get_titanic_data()

In [None]:
def get_iris_data(cached=False):
    """
    Return the iris data as a dataframe, using iris.csv when possible.

    With the default cached=False, iris.csv is read (first column as the
    index) if it exists in the current working directory; otherwise
    new_iris_data() is called.

    NOTE(review): despite its name, a truthy `cached` skips the CSV and always
    calls new_iris_data() -- confirm this is the intended meaning.
    """
    if not cached and os.path.isfile('iris.csv'):
        return pd.read_csv('iris.csv', index_col=0)
    return new_iris_data()

In [None]:
get_iris_data()

# Data Preparation Exercises

### Exercise 1: Iris Data
- A) Use the function defined in acquire.py to load the iris data.

In [None]:
import acquire

In [None]:
iris_df = acquire.get_iris_data()
iris_df.head()

- B) Drop the species_id and measurement_id columns.

In [None]:
cols_to_drop = ['species_id', 'measurement_id']

iris_df = iris_df.drop(columns = cols_to_drop)
iris_df.head()

- C) Rename the species_name column to just species.

In [None]:
iris_df = iris_df.rename(columns = {'species_name': 'species'})
iris_df.head()

- D) Create dummy variables of the species name.

In [None]:
dummy_df = pd.get_dummies(iris_df['species'])
iris_df = pd.concat([iris_df, dummy_df], axis = 1)
iris_df.head()

- E) Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
def prep_iris(df):
    """
    Return a prepared copy of the untransformed iris dataframe.

    Steps applied:
    - drop the species_id and measurement_id columns
    - rename species_name to species
    - append one dummy column per distinct species value

    The dataframe passed in is not modified.
    """
    prepared = (
        df.drop(columns = ['species_id', 'measurement_id'])
          .rename(columns = {'species_name': 'species'})
    )
    species_dummies = pd.get_dummies(prepared['species'])
    return pd.concat([prepared, species_dummies], axis = 1)

In [None]:
iris = acquire.get_iris_data()

In [None]:
prep_iris(iris)

### Exercise 2: Titanic Data
- A) Use the function you defined in acquire.py to load the titanic data set.

In [1]:
import acquire

In [2]:
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


- B) Handle the missing values in the embark_town and embarked columns.

In [3]:
titanic_df = titanic_df.drop(columns = 'embark_town')

In [4]:
titanic_df = titanic_df[~titanic_df.embarked.isnull()]
titanic_df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,1
888,888,0,3,female,,1,2,23.4500,S,Third,,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,1


- C) Create a dummy variable of the embarked column.

In [7]:
dummy_df = pd.get_dummies(titanic_df['embarked'], dummy_na = False)
titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
titanic_df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,alone,C,Q,S
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,0,0,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,0,1,0,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,0,0,0,1
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,1,0,0,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,1,0,0,1
888,888,0,3,female,,1,2,23.4500,S,Third,,0,0,0,1
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,1,1,0,0


- D) Scale the age and fare columns using a min max scaler. 

# Data Exploration Exercises


### Exercise 1: 
- Use the iris dataset. As always, add, commit, and push your changes. Split your data into train, test, and validate samples.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [None]:
#will have to use the acquire instead of loading straight from the dataset
iris = sns.load_dataset('iris')
iris.head()

In [None]:
#splitting the dataframe into 2 subsets:
#test holds .15 of the total rows (test_size = .15),
#while train_validate holds the remaining rows
#random_state is fixed to ensure repeatable results
#stratify = iris.species uses the species column as the class labels for a stratified split

train_validate, test = train_test_split(
    iris, 
    test_size = .15,
    random_state = 123,
    stratify = iris.species)

#splitting train_validate into train and validate:
#validate holds .15 of the train_validate rows (test_size = .15),
#random_state is fixed to ensure repeatable results,
#stratify uses the species column of train_validate as the class labels
train, validate = train_test_split(
    train_validate, 
    test_size = .15,
    random_state = 123,
    stratify = train_validate.species)

In [None]:
#printing the shape (rows, columns) of the train, validate and test dataframes
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

### Exercise 2:
- Create a swarmplot using a melted dataframe of all your numeric variables. The x-axis should be the variable name, the y-axis is the measure. Add another dimension using color to represent species. Document takeaways from this visualization.

In [None]:
train.head()

In [None]:
#melting the train dataframe using 'species' as the identifying variable,
#melting turns all the other columns into one column called variable,
#with all other values listed in another column
train_melt = train.melt(id_vars = 'species')
train_melt

In [None]:
#using the train_melt dataframe to create a swarmplot
sns.swarmplot(data = train_melt, x = 'variable', y = 'value', hue = 'species')
plt.show()

__Takeaways__: 
- Petal length and petal width separate the species fairly clearly.
- Sepal length and sepal width do not separate the species as clearly.
- Sepal width in particular overlaps so heavily across species that we would not be able to draw any clear conclusions from it.

### Exercise 3: 
- Create 4 subplots (2 rows x 2 columns) of scatterplots.
    - sepal_length x sepal_width
    - petal_length x petal_width
    - sepal_area x petal_area
    - sepal_length x petal_length
- What are your takeaways?

In [None]:
train.head()

In [None]:
#creating sepal area by multiplying sepal length by sepal width
train['sepal_area'] = round(train.sepal_length * train.sepal_width, 1)
#creating petal are by multiplying petal length by petal width
train['petal_area'] = round(train.petal_length * train.petal_width, 1)
train.head()

In [None]:
#adjusting the size of the sub-scatterplots
plt.figure(figsize=(16, 9))

#scatterplot 1 
plt.subplot(2, 2, 1)
sns.scatterplot(data = train, x = 'sepal_width', y = 'sepal_length', hue = 'species')

#scatterplot 2
plt.subplot(2, 2, 2)
sns.scatterplot(data = train, x = 'petal_width', y = 'petal_length', hue = 'species')

#scatterplot 3
plt.subplot(2, 2, 3)
sns.scatterplot(data = train, x = 'sepal_area', y = 'petal_area', hue = 'species')

#scatterplot 4
plt.subplot(2, 2, 4)
sns.scatterplot(data = train, x = 'sepal_length', y = 'petal_length', hue = 'species')

plt.show()

__Takeaways__: 

- The scatterplot of sepal length vs sepal width does not seem to be a good determining factor for species because of the overlap between versicolor and virginica
- The other scatterplots seem to be better identifiers, but there are some edge cases where we may predict versicolor or virginica incorrectly
- Lines that distinguish between species can be drawn more easily on petal measurements than on sepal measurements

### Exercise 4: 
- Create a heatmap of each variable layering correlation coefficient on top.

In [None]:
#creating correlation values for the train dataframe
train_corr = train.corr()
#creating a mask for the upper triangle that repeats
mask = np.triu(np.ones_like(train_corr, dtype = bool))
#adjusting the size of the scatterplot
f, ax = plt.subplots(figsize = (10,6))
#creating the heatmap based of the correlation values of train,
#applying the mask,
#and annotating the correlation values respective to the data
sns.heatmap(train_corr, cmap ='coolwarm', center = 0, mask = mask, annot = True)
plt.show()

### Exercise 5: 
- Create a scatter matrix visualizing the interaction of each variable

In [None]:
#creating a scatter matrix using pandas
pd.plotting.scatter_matrix(train, figsize = (20, 10), alpha = .6)
plt.show()

In [None]:
#creating another scatter matrix using seaborns,
#identifying data by species using color
sns.pairplot(train, hue = 'species')
plt.show()

### Exercise 6: 
- Is the sepal length significantly different in virginica compared to versicolor? Run a statistical experiment to test this.

- Make sure to include a null hypothesis, alternative hypothesis, results, and summary.

__A. Plot Distribution__

In [None]:
#creating variables to use to distinguish variance
virginica_sep_len = train[train.species == 'virginica'].sepal_length
versicolor_sep_len = train[train.species == 'versicolor'].sepal_length

In [None]:
#creating a histogram for virginica sepal length
virginica_sep_len.hist()
plt.show()

In [None]:
#creating a histogram for versicolor sepal length
versicolor_sep_len.hist()
plt.show()

__B. Set Hypothesis__

$H_0$: mean sepal length of virginica == mean sepal length of versicolor

$H_a$: mean sepal length of virginica != mean sepal length of versicolor

__C. Set Alpha__

In [None]:
alpha = .05

__D. Verify Assumptions__

- normal distribution: YES
- large enough: YES
- variance: ??

In [None]:
#checking variance
train.groupby('species').sepal_length.var()

- variance: YES

In [None]:
#performing 2 sample t-test with equal variance
t, p = stats.ttest_ind(virginica_sep_len, versicolor_sep_len)
t, p

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null")

- What is your takeaway from this statistical testing?

__Conclusion__: We reject the null hypothesis, meaning that the mean sepal length of virginica is significantly different from the mean sepal length of versicolor. 

### Exercise 7: 
- Create any other visualizations and run any other statistical tests you think will be helpful in exploring this data set.

In [None]:
#creating a boxplot for petal area
sns.boxplot(data = train, y = 'petal_area', x = 'species')
#creating a boxplot for petal areaplt.show()

In [None]:
#creating a boxplot for sepal area
sns.boxplot(data = train, y = 'sepal_area', x = 'species')
plt.show()