In [1]:
import numpy as np
import pandas as pd
import env
import acquire
import prepare

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [2]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_googlesheet = pd.read_csv(csv_export_url)
df_googlesheet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def get_conn(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

In [4]:
df = pd.read_sql('SELECT * FROM passengers', get_conn('titanic_db'))
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### 1. Create a pandas dataframe, df_iris, from this data.

In [5]:
df_iris = data('iris')

In [6]:
df_iris.sample(10)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
91,5.5,2.6,4.4,1.2,versicolor
150,5.9,3.0,5.1,1.8,virginica
101,6.3,3.3,6.0,2.5,virginica
149,6.2,3.4,5.4,2.3,virginica
125,6.7,3.3,5.7,2.1,virginica
132,7.9,3.8,6.4,2.0,virginica
99,5.1,2.5,3.0,1.1,versicolor
66,6.7,3.1,4.4,1.4,versicolor
94,5.0,2.3,3.3,1.0,versicolor
55,6.5,2.8,4.6,1.5,versicolor


In [7]:
# print the first 3 rows
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [8]:
# print the number of rows and columns (shape)
df_iris.shape

(150, 5)

In [9]:
# print the column names
df_iris.columns.to_list()

['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

In [10]:
# print the data type of each column
df_iris.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

In [11]:
# print the summary statistics for each of the numeric variables. 
# Would you recommend rescaling the data based on these statistics?

df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [12]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [13]:
filtered = df_iris.dtypes[df_iris.dtypes == np.object]
filtered

Species    object
dtype: object

### 2. Read Table1_CustDetails the excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel

In [14]:
# assign the first 100 rows to a new dataframe, df_excel_sample
df_excel = pd.read_excel('Spreadsheets_Exercises.xlsx')

In [15]:
# print the number of rows of your original dataframe
print(f'The number of rows in the dataframe: {df_excel.shape[0]}')

The number of rows in the dataframe: 7049


In [16]:
# print the first 5 column names
df_excel.columns[:5]

Index(['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents'], dtype='object')

In [17]:
# print the column names that have a data type of object
[df_excel.dtypes[df_excel.dtypes == np.object]]

[customer_id     object
 gender          object
 partner         object
 dependents      object
 payment_type    object
 churn           object
 dtype: object]

In [18]:
# compute the range for each of the numeric variables.
df_excel.describe()

Unnamed: 0,is_senior_citizen,phone_service,internet_service,contract_type,monthly_charges,total_charges
count,7049.0,7049.0,7049.0,7049.0,7049.0,7038.0
mean,0.162009,1.324585,1.222585,0.690878,64.747014,2283.043883
std,0.368485,0.642709,0.779068,0.833757,30.09946,2266.521984
min,0.0,0.0,0.0,0.0,18.25,18.8
25%,0.0,1.0,1.0,0.0,35.45,401.5875
50%,0.0,1.0,1.0,0.0,70.35,1397.1
75%,0.0,2.0,2.0,1.0,89.85,3793.775
max,1.0,2.0,2.0,2.0,118.75,8684.8


### 3. Read the data from this google sheet into a dataframe, df_google

In [19]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_googlesheet = pd.read_csv(csv_export_url)

In [20]:
# print the first 3 rows
df_googlesheet.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [21]:
# print the number of rows and columns
df_googlesheet.shape

(891, 12)

In [22]:
# print the column names
df_googlesheet.columns.to_list()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [23]:
# print the data type of each column
df_googlesheet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [24]:
# print the summary statistics for each of the numeric variables
df_googlesheet.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [25]:
df_gs = df_googlesheet.select_dtypes(include='object')

In [26]:
column_values = df_gs.values.ravel()

In [27]:
unique_values = pd.unique(column_values)
unique_values

array(['Braund, Mr. Owen Harris', 'male', 'A/5 21171', ..., 'C148',
       'Dooley, Mr. Patrick', '370376'], dtype=object)

In [28]:
titanic = pd.DataFrame(acquire.get_titanic_data())
    

- CodeUp_db successfully accessed ...
- titanic_db SQL query successful ...


In [29]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [30]:
iris = acquire.get_iris_data()

- CodeUp_db successfully accessed ...
- iris_db SQL query successful ...


In [31]:
iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa
