In [1]:
import pandas as pd
import numpy as np
from pydataset import data

import env
import util
import acquire


### Classification: Acquire data


#### Goals

-The data you wish to use in an analysis may be stored in a variety of sources. In this lesson, we will review importing data from a CSV file and via MySQL, and we will also learn how to import data from our local clipboard, a Google Sheets document, and an MS Excel file. We will then select one source to use as we continue through the rest of this module.

#### Methods of Data Acquisition

-read_clipboard: When you have data copied to your clipboard, you can use pandas to read it into a data frame with pd.read_clipboard. This can be useful for quickly transferring data to/from a spreadsheet.

-read_excel: This function can be used to create a data frame based on the contents of an Excel spreadsheet.

-read_csv: Read from a local csv, or from the cloud (Google Sheets or AWS S3).

-read_sql(sql_query, connection_url): Read data using a SQL query to a database. You must have the required drivers installed, and a specially formatted url string must be provided.

    # To talk to a mysql database:
    python -m pip install pymysql mysql-connector
    #the connection url string:
    mysql+pymysql://USER:PASSWORD@HOST/DATABASE_NAME


1. Use a python module containing datasets as the source for the iris data. Create a pandas dataframe, df_iris, from this data.

    -print the first 3 rows

    -print the number of rows and columns (shape)

    -print the column names

    -print the data type of each column

    -print the summary statistics for each of the numeric variables. Would you recommend rescaling the data based on these statistics?

In [2]:
# Exercise 1: load the iris dataset through pydataset's `data` helper (imported at the top).
df_iris = data("iris")

In [3]:
# Exercise 1: first 3 rows, taken as a positional slice.
df_iris.iloc[:3]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [4]:
# Exercise 1: column names.
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [5]:
# Exercise 1: data type of each column (info() also reports non-null counts and memory usage).
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
Sepal.Length    150 non-null float64
Sepal.Width     150 non-null float64
Petal.Length    150 non-null float64
Petal.Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [6]:
# Exercise 1: summary statistics for the numeric columns.
# The four measurements span comparable ranges (overall min 0.1, max 7.9 in the
# output below), so rescaling looks optional here — TODO confirm for the chosen model.
df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


2. Read the Table1_CustDetails sheet from the Excel module dataset, Excel_Exercises.xlsx, into a dataframe, df_excel

    -assign the first 100 rows to a new dataframe, df_excel_sample

    -print the number of rows of your original dataframe

    -print the first 5 column names

    -print the column names that have a data type of object

    -compute the range for each of the numeric variables.



In [7]:
# Exercise 2: read the "Table1_CustDetails" sheet into a DataFrame.
# NOTE(review): the exercise text names Excel_Exercises.xlsx, but this reads
# my_telco_churn.xlsx from the working directory — confirm this is the intended file.
# The sheet also yields non-customer columns ("Unnamed: 13", "phone_service.1",
# "(Multiple Items)" in the dtypes output further down), presumably from a pivot
# table placed beside the data — verify against the workbook.
df_cust = pd.read_excel("my_telco_churn.xlsx", sheet_name="Table1_CustDetails")

In [10]:
# Exercise 2: keep the first 100 rows as a separate sample frame.
df_excel_sample = df_cust.iloc[:100]

In [11]:
# Exercise 2: number of rows in the full (unsampled) frame.
len(df_cust)

7049

In [12]:
# Exercise 2: first 5 column names as a plain Python list.
df_cust.columns[:5].tolist()

['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents']

In [13]:
# Inspect the dtype of every column before deciding which ones to treat as labels.
df_cust.dtypes

customer_id           object
gender                object
is_senior_citizen      int64
partner               object
dependents            object
phone_service          int64
internet_service       int64
contract_type          int64
payment_type          object
monthly_charges      float64
total_charges        float64
tenure               float64
churn                 object
Unnamed: 13          float64
phone_service.1       object
(Multiple Items)      object
dtype: object

In [19]:
# Cast the integer-coded columns to object so they are handled as labels:
# they drop out of describe() and are picked up by select_dtypes("object") below.
df_cust = df_cust.astype(
    {
        column: "object"
        for column in (
            "is_senior_citizen",
            "phone_service",
            "internet_service",
            "contract_type",
        )
    }
)

df_cust.dtypes

customer_id           object
gender                object
is_senior_citizen     object
partner               object
dependents            object
phone_service         object
internet_service      object
contract_type         object
payment_type          object
monthly_charges      float64
total_charges        float64
tenure               float64
churn                 object
Unnamed: 13          float64
phone_service.1       object
(Multiple Items)      object
dtype: object

In [20]:
# Summary statistics for the remaining numeric (float64) columns; the integer-coded
# columns were cast to object in the previous cell, so they are not summarised.
# "Unnamed: 13" has a count of 0, i.e. it holds no values at all.
df_cust.describe()

Unnamed: 0,monthly_charges,total_charges,tenure,Unnamed: 13
count,7049.0,7038.0,7049.0,0.0
mean,64.747014,2283.043883,32.379866,
std,30.09946,2266.521984,24.595524,
min,18.25,18.8,0.0,
25%,35.45,401.5875,8.733456,
50%,70.35,1397.1,28.683425,
75%,89.85,3793.775,55.229399,
max,118.75,8684.8,79.341772,


In [21]:
# Exercise 2: columns whose dtype is object, kept as their own frame.
# (Filtering df_cust.dtypes with a boolean mask would list the same column names.)
df_cust_obj = df_cust.select_dtypes(include=["object"])
df_cust_obj.iloc[:5]

Unnamed: 0,customer_id,gender,is_senior_citizen,partner,dependents,phone_service,internet_service,contract_type,payment_type,churn,phone_service.1,(Multiple Items)
0,7569-NMZYQ,Female,0,Yes,Yes,2,2,2,Bank transfer (automatic),No,internet_service,(Multiple Items)
1,8984-HPEMB,Female,0,No,No,2,2,2,Electronic check,No,,
2,5734-EJKXG,Female,0,No,No,2,2,1,Electronic check,No,Row Labels,
3,5989-AXPUC,Female,0,Yes,No,2,2,2,Mailed check,No,0-1,
4,8199-ZLLSA,Male,0,No,No,2,2,1,Bank transfer (automatic),Yes,1-2,


In [27]:
# Exercise 2: range (max - min) of each numeric column.
# "number" matches every numeric dtype, so the selection does not depend on the
# exact widths pandas inferred when reading the sheet (int32, float32, ...).
# Columns cast to object earlier stay excluded, and the all-NaN "Unnamed: 13"
# column yields NaN.
df_cust_num = df_cust.select_dtypes(include="number")

df_cust_num.max() - df_cust_num.min()

monthly_charges     100.500000
total_charges      8666.000000
tenure               79.341772
Unnamed: 13                NaN
dtype: float64

3. Read the data from this Google Sheet into a dataframe, df_google

    -print the first 3 rows

    -print the number of rows and columns

    -print the column names

    -print the data type of each column

    -print the summary statistics for each of the numeric variables

    -print the unique values for each of your categorical variables

#### Testing new funcs

In [None]:
# Browser ("edit") URL of the source spreadsheet.
sheet_url = "https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357"

# Swap the editor suffix for the CSV export endpoint, leaving the gid value unchanged.
edit_suffix, export_suffix = "/edit#gid=", '/export?format=csv&gid='
csv_export_url = sheet_url.replace(edit_suffix, export_suffix)

# Exercise 3: read the exported CSV and show the first 3 rows.
df_google = pd.read_csv(csv_export_url)
df_google.iloc[:3]

In [None]:
# Exercise 3: number of rows and columns.
df_google.shape

In [None]:
# Exercise 3: column names.
df_google.columns

In [None]:
# Exercise 3: data type of each column.
df_google.info()

In [None]:
# Exercise 3: summary statistics for the numeric columns.
df_google.describe()

In [None]:
# First two rows, for a quick look at the raw values.
df_google.iloc[:2]

In [None]:
# Transform data: store the label-like columns as pandas categoricals.
df_google = df_google.astype(
    dict.fromkeys(
        ["Name", "Survived", "Pclass", "Sex", "Cabin", "Ticket", "Embarked"],
        "category",
    )
)
df_google.dtypes

In [None]:
# Exercise 3: unique values of the Survived column.
df_google["Survived"].unique()

In [None]:
# Exercise 3: unique values of the Pclass column.
df_google["Pclass"].unique()

In [None]:
# Exercise 3: unique values of the Name column.
df_google["Name"].unique()

In [None]:
# Exercise 3: unique values of the Sex column.
df_google["Sex"].unique()

In [None]:
# Exercise 3: unique values of the Ticket column.
df_google["Ticket"].unique()

In [None]:
# Exercise 3: unique values of the Cabin column.
df_google["Cabin"].unique()

In [None]:
# Exercise 3: unique values of the Embarked column.
df_google["Embarked"].unique()

In a new python module, acquire.py:

1. get_titanic_data: returns the titanic data from the codeup data science database as a pandas data frame.

2. get_iris_data: returns the data from the iris_db on the codeup data science database as a pandas data frame. The returned data frame should include the actual name of the species in addition to the species_ids.

In [None]:
# Preview the first five rows returned by acquire.get_titanic_data().
acquire.get_titanic_data().iloc[:5]
    

In [None]:
# Preview the first five rows returned by acquire.get_iris_data().
acquire.get_iris_data().iloc[:5]