In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

#import my modules
import acquire as a
import wrangle as w

## Exercises II

As a customer analyst for Telco, you want to know who has spent the most money with the company over their lifetime. You have monthly charges and tenure, so you think you will be able to use those two attributes as features to estimate total charges. You need to do this within an average of $5.00 per customer.

In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Telco data from the telco_churn database in the Codeup database server.

**1. Acquire customer_id, monthly_charges, tenure, and total_charges from the telco_churn database for all customers with a 2-year contract.**

In [2]:
#acquire my df using my function but this gives me all the columns 
df = a.get_telco()

In [3]:
#check info
df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,3,0016-QLJIS,Female,0,Yes,Yes,65,Yes,...,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,4,1,3,0017-DINOC,Male,0,No,No,54,No,...,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,3,1,3,0019-GFNTW,Female,0,No,No,56,No,...,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,4,1,3,0056-EPFBG,Male,0,Yes,Yes,20,No,...,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,3,1,3,0078-XZMHT,Male,0,Yes,No,72,Yes,...,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
payment_type_id,7043.0,2.315633,1.148907,1.0,1.0,2.0,3.0,4.0
internet_service_type_id,7043.0,1.872923,0.737796,1.0,1.0,2.0,2.0,3.0
contract_type_id,7043.0,1.690473,0.833755,1.0,1.0,1.0,2.0,3.0
senior_citizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
monthly_charges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


In [6]:
#checking the information in this column
df.contract_type_id.value_counts()

1    3875
3    1695
2    1473
Name: contract_type_id, dtype: int64

In [7]:
df.contract_type.value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: contract_type, dtype: int64

In [8]:
# Keep only the two-year-contract customers (contract_type_id == 3) and the four columns of interest.
# A single .loc call selects rows and columns together instead of chained indexing, and .copy()
# makes telco_df an independent frame so later cleaning never writes into a slice of df.
telco_df = df.loc[
    df.contract_type_id == 3,
    ['customer_id', 'monthly_charges', 'tenure', 'total_charges'],
].copy()
telco_df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0016-QLJIS,90.45,65,5957.9
1,0017-DINOC,45.2,54,2460.55
2,0019-GFNTW,45.05,56,2560.1
3,0056-EPFBG,39.4,20,825.4
4,0078-XZMHT,85.15,72,6316.2


In [9]:
telco_df.shape

(1695, 4)

____________

In [10]:
#other way to do it is if I have a generic function so I can use a different query
query = """
        SELECT 
            customer_id, 
            monthly_charges, 
            tenure, 
            total_charges
        FROM customers
        WHERE contract_type_id = 3;
        """

df2 = a.get_data_from_sql('telco_churn', query)

In [11]:
df2.head(1)

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25


In [12]:
df2.shape

(1695, 4)

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 53.1+ KB


In [14]:
df2.describe()

Unnamed: 0,monthly_charges,tenure
count,1695.0,1695.0
mean,60.770413,56.735103
std,34.678865,18.209363
min,18.4,0.0
25%,24.025,48.0
50%,64.35,64.0
75%,90.45,71.0
max,118.75,72.0


___________________

**Takeaways**
- customer_id and total_charges are object data type; total_charges should be float type.
- tenure has a minimum value of 0. that means new customers
- I don't seem to have any Null values

_____________________

**2. Using your acquired Telco data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.**

In [15]:
#checking information of the columns. we noticed  total_charges is object type
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1695 entries, 0 to 6154
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 66.2+ KB


In [16]:
#drop duplicates
telco_df = telco_df.drop_duplicates()

In [17]:
# Find the total number of Null values in each column of our DataFrame.
telco_df.isnull().sum()

customer_id        0
monthly_charges    0
tenure             0
total_charges      0
dtype: int64

In [18]:
# Check for any Null values in each column of our DataFrame.

telco_df.isnull().any()

customer_id        False
monthly_charges    False
tenure             False
total_charges      False
dtype: bool

In [19]:
# Return the names for any columns in our DataFrame with any Null values.
# Checked on telco_df (the frame being cleaned), matching the two null checks above.

telco_df.columns[telco_df.isnull().any()]

Index([], dtype='object')

In [20]:
#trying to change total_charges to float
#telco_df['total_charges'].astype('float')
#when we run the code to convert to float, we get an error (could not convert string to float: ''), there is a space 
#in some values

In [21]:
#these are the observations that have space in total charges
#these customers are new customers because they have a tenure of 0 and have not made their first payment yet.
telco_df[telco_df['total_charges']== ' ']

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
85,1371-DWPAZ,56.05,0,
156,2775-SEFEE,61.9,0,
236,4075-WKNIU,73.35,0,
255,4472-LVYGI,52.55,0,
339,5709-LVOEQ,80.85,0,
5681,2520-SGTTA,20.0,0,
5717,3115-CZMZD,20.25,0,
5727,3213-VVOLG,25.35,0,
5798,4367-NUYAO,25.75,0,
6007,7644-OMVMY,19.85,0,


In [22]:

# I will write '0' into total_charges only for the customers whose value is a blank ' '.
# .loc targets exactly that column on exactly those rows, instead of assigning a whole
# row-filtered frame back onto itself. The value stays a string until the float conversion below.
telco_df.loc[telco_df['total_charges'] == ' ', 'total_charges'] = '0'

In [23]:
#checking the info
telco_df[telco_df['total_charges']== '0']

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
85,1371-DWPAZ,56.05,0,0
156,2775-SEFEE,61.9,0,0
236,4075-WKNIU,73.35,0,0
255,4472-LVYGI,52.55,0,0
339,5709-LVOEQ,80.85,0,0
5681,2520-SGTTA,20.0,0,0
5717,3115-CZMZD,20.25,0,0
5727,3213-VVOLG,25.35,0,0
5798,4367-NUYAO,25.75,0,0
6007,7644-OMVMY,19.85,0,0


In [24]:
#now I can convert total_charges to float
telco_df['total_charges']= telco_df['total_charges'].astype('float')

In [25]:
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1695 entries, 0 to 6154
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 66.2+ KB


In [26]:
telco_df.shape

(1695, 4)

____________

What if, instead, I change the tenures that are 0 to 1, since these customers have probably been with the company for about a month? Their blank total charges can then be filled with one month of charges.

In [27]:
df2.tenure.value_counts().sort_index().head()

0    10
1     2
2     1
3     3
4     4
Name: tenure, dtype: int64

In [28]:
# Replace any tenures of 0 with 1.

df2.tenure = df2.tenure.replace(0, 1)

In [29]:

# Validate my tenure count for value 1; I went from two to 12.

df2.tenure.value_counts().sort_index().head()

1    12
2     1
3     3
4     4
5     1
Name: tenure, dtype: int64

In [30]:
# These observations also need total_charges handled. 
# The other tenure 1 observations have same monthly and total charges.

df2[df2.tenure == 1]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
188,1099-GODLO,20.35,1,20.35
234,1371-DWPAZ,56.05,1,
416,2520-SGTTA,20.0,1,
453,2775-SEFEE,61.9,1,
505,3115-CZMZD,20.25,1,
524,3213-VVOLG,25.35,1,
678,4075-WKNIU,73.35,1,
716,4367-NUYAO,25.75,1,
726,4472-LVYGI,52.55,1,
941,5709-LVOEQ,80.85,1,


In [31]:
# Replace the blank total_charges with the monthly_charge for tenure == 1.
# NOTE(review): this runs on the full `df`, not on `df2` where tenure 0 was replaced with 1 above,
# so the rows filled here still have tenure == 0 in `df` — confirm which frame was intended.

df.total_charges = np.where(df.total_charges==' ', df.monthly_charges, df.total_charges)

In [32]:
# Validate my changes.

df[df.tenure == 1]

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
803,2,1,2,3069-SSVSN,Female,0,No,No,1,No,...,No,No,No,No,25.85,25.85,No,One year,DSL,Mailed check
1139,4,1,2,8985-OOPOS,Female,0,No,No,1,Yes,...,Yes,Yes,Yes,No,74.10,74.1,No,One year,DSL,Credit card (automatic)
1201,1,1,1,0023-HGHWL,Male,1,No,No,1,No,...,No,No,No,Yes,25.10,25.1,Yes,Month-to-month,DSL,Electronic check
1202,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,No,...,No,No,No,No,30.50,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)
1205,2,1,1,0082-LDZUE,Male,0,No,No,1,Yes,...,No,No,No,Yes,44.30,44.3,No,Month-to-month,DSL,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7025,2,3,1,9629-NHXFW,Female,0,Yes,Yes,1,Yes,...,No internet service,No internet service,No internet service,Yes,19.40,19.4,No,Month-to-month,,Mailed check
7027,4,3,1,9708-KFDBY,Male,0,No,No,1,Yes,...,No internet service,No internet service,No internet service,No,20.55,20.55,No,Month-to-month,,Credit card (automatic)
7028,2,3,1,9753-OYLBX,Female,0,No,No,1,Yes,...,No internet service,No internet service,No internet service,No,20.50,20.5,No,Month-to-month,,Mailed check
7038,2,3,1,9962-BFPDU,Female,0,Yes,Yes,1,Yes,...,No internet service,No internet service,No internet service,No,20.05,20.05,No,Month-to-month,,Mailed check


**3. End with a wrangle.py file that contains the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values. Name your final function wrangle_telco.**

In [33]:

# Handle the data type for total_charges.

df.total_charges = df.total_charges.astype(float)

In [34]:
# Validate my data type conversion.

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

____________

In [35]:
#creating my functions

In [36]:
def clean_telco(df):
    '''
    Build the cleaned two-year-contract Telco DataFrame.

    Starting from the previously acquired telco df, this function:
    - keeps only customers with a 2-year contract (contract_type_id == 3),
    - keeps only customer_id, monthly_charges, tenure, and total_charges,
    - drops any duplicate observations,
    - replaces blank (empty or whitespace-only) total_charges with 0,
    - converts total_charges to a float type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain contract_type_id plus the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame; the input df is left unmodified.
    '''
    keep_cols = ['customer_id', 'monthly_charges', 'tenure', 'total_charges']
    # Select rows and columns in one .loc call (no chained indexing); .copy() guarantees an
    # independent frame so the column assignment below never writes into a slice of df.
    telco_df = df.loc[df.contract_type_id == 3, keep_cols].drop_duplicates().copy()

    total_charges = telco_df['total_charges']
    if not pd.api.types.is_numeric_dtype(total_charges):
        # Blank strings cannot be parsed as floats. Treat any empty / whitespace-only value
        # as '0' (not just the exact single-space string) before converting.
        total_charges = total_charges.replace(r'^\s*$', '0', regex=True)
    telco_df['total_charges'] = total_charges.astype('float')

    return telco_df

In [37]:
def split_data(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Take in a DataFrame and return train, validate, and test DataFrames.

    The data is split twice with train_test_split: first `test_size` of the rows are held out
    as test, then `validate_size` of the remaining rows are held out as validate. With the
    defaults (.2 and .3) that is roughly 56% train, 24% validate, and 20% test.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    test_size : float, default .2
        Share of df reserved for the test set.
    validate_size : float, default .3
        Share of the non-test rows reserved for the validate set.
    random_state : int, default 123
        Seed passed to both splits so the result is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, validate, test). The shape of each is also printed.
    '''
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state)
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state)
    for label, part in (('train', train), ('validate', validate), ('test', test)):
        print(f'{label} -> {part.shape}')
    return train, validate, test

In [38]:
def wrangle_telco():
    '''
    Acquire the Telco data and return it cleaned.

    Acquires the telco data with the get_telco function, then passes it to clean_telco, which
    creates a new df with only customer_id, monthly_charges, tenure, and total_charges for
    customers with a 2-year contract, drops any duplicate observations, and converts
    total_charges to a float type.

    Returns
    -------
    pandas.DataFrame
        The cleaned telco DataFrame.
    '''
    # The acquire module is imported as `a` at the top of this notebook, so it must be
    # referenced through that alias; the bare name `acquire` is not defined here.
    df = a.get_telco()
    return clean_telco(df)
    

In [39]:
#using my functions that are in wrangle.py

In [40]:
df = w.wrangle_telco()

In [41]:
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0016-QLJIS,90.45,65,5957.9
1,0017-DINOC,45.2,54,2460.55
2,0019-GFNTW,45.05,56,2560.1
3,0056-EPFBG,39.4,20,825.4
4,0078-XZMHT,85.15,72,6316.2


In [42]:
df.shape

(1695, 4)

In [43]:
train, validate, test = w.split_data(df)

train -> (949, 4)
validate -> (407, 4)
test -> (339, 4)


## Exercises III - Challenge

Let's set up an example scenario as perspective for our regression exercises using the Zillow dataset.

As a Codeup data science graduate, you want to show off your skills to the Zillow data science team in hopes of getting an interview for a position you saw pop up on LinkedIn. You thought it might look impressive to build an end-to-end project in which you use some of their Kaggle data to predict property values using some of their available features; who knows, you might even do some feature engineering to blow them away. Your goal is to predict the values of single unit properties using the observations from 2017.

In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Zillow data from the zillow database in the Codeup database server.

**1. Acquire bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, and fips from the zillow database for all 'Single Family Residential' properties.**

In [44]:
zillow_df= a.get_new_zillow()

In [45]:
# Keep an untouched backup because it takes a long time to get the data from SQL.
# .copy() is needed for a real backup: plain assignment would only give the same object a second name.
z_df = zillow_df.copy()

In [46]:
zillow_df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [47]:

zillow_df.shape

(2152863, 7)

In [48]:
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 115.0 MB


In [49]:
# show_counts replaces the null_counts argument, which was deprecated and later removed from pandas
zillow_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
dtypes: float64(7)
memory usage: 115.0 MB


In [50]:
# NOTE(review): `df` is the Telco frame returned by w.wrangle_telco() above, not the Zillow data;
# this section probably meant zillow_df.describe().T — confirm.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
monthly_charges,1695.0,60.770413,34.678865,18.4,24.025,64.35,90.45,118.75
tenure,1695.0,56.735103,18.209363,0.0,48.0,64.0,71.0,72.0
total_charges,1695.0,3706.934336,2579.517834,0.0,1269.675,3593.8,5988.8,8672.45


2. Using your acquired Zillow data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

In [51]:
# checking the info 
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   bedroomcnt                    float64
 1   bathroomcnt                   float64
 2   calculatedfinishedsquarefeet  float64
 3   taxvaluedollarcnt             float64
 4   yearbuilt                     float64
 5   taxamount                     float64
 6   fips                          float64
dtypes: float64(7)
memory usage: 115.0 MB


In [52]:
# this shows us the non-null count of every column
# (show_counts replaces the null_counts argument, which was deprecated and later removed from pandas)
zillow_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
dtypes: float64(7)
memory usage: 115.0 MB


In [53]:
## this function was shared by a classmate . I add total of duplicated values
def miss_dup_values(df):
    '''
    Summarize the missing values and duplicated rows of a DataFrame.

    Prints the number of columns, how many of them contain missing values, and the count and
    percentage of duplicated rows. Returns a table indexed by column name with the columns
    'Missing Values' and '% of Total Values' (rounded to one decimal), restricted to the columns
    that have missing values and sorted by percentage, highest first.
    '''
    n_rows = len(df)

    # Missing values per column, as a count and as a share of all rows
    null_counts = df.isnull().sum()
    null_pct = 100 * null_counts / n_rows

    # Fully duplicated rows, as a count and as a share of all rows
    dup = df.duplicated().sum()
    dup_percent = 100 * dup / n_rows

    # Assemble the two-column summary, labelling the columns as they are concatenated
    summary = pd.concat(
        [null_counts, null_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that actually have missing values, highest percentage first
    has_missing = summary['% of Total Values'] != 0
    summary = (
        summary[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {summary.shape[0]} columns that have missing values.")
    print("  ")
    print(f"** There are {dup} duplicate rows that represents {round(dup_percent, 2)}% of total Values**")

    return summary

In [54]:
#using the function above
miss_dup_values(zillow_df)

Your selected dataframe has 7 columns.
There are 6 columns that have missing values.
  
** There are 11644 duplicate rows that represents 0.54% of total Values**


Unnamed: 0,Missing Values,% of Total Values
yearbuilt,9337,0.4
calculatedfinishedsquarefeet,8484,0.4
taxamount,4442,0.2
taxvaluedollarcnt,493,0.0
bedroomcnt,11,0.0
bathroomcnt,11,0.0


In [55]:
# this shows us the non-null count of every column
# (show_counts replaces the null_counts argument, which was deprecated and later removed from pandas)
zillow_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
dtypes: float64(7)
memory usage: 115.0 MB


In [56]:
zillow_df['bedroomcnt'].nunique()

19

In [57]:
zillow_df['bedroomcnt'].value_counts().sort_index()

0.0      13187
1.0      23166
2.0     335473
3.0     964298
4.0     634289
5.0     150866
6.0      25166
7.0       4807
8.0       1107
9.0        291
10.0       121
11.0        34
12.0        12
13.0        16
14.0         7
15.0         6
16.0         2
18.0         3
25.0         1
Name: bedroomcnt, dtype: int64

In [58]:
zillow_df[zillow_df['bedroomcnt']== 0].head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
5,0.0,0.0,,124.0,,,6037.0


In [59]:
zillow_df.shape

(2152863, 7)

In [60]:
# drop duplicates
zillow_df = zillow_df.drop_duplicates()
zillow_df.shape

(2141219, 7)

In [61]:
#drop nulls
zillow_df = zillow_df.dropna(how='any',axis=0)

In [62]:
#size after the drops
zillow_df.shape

(2130214, 7)

In [63]:
# Re-run the missing/duplicate summary on the cleaned frame. The helper defined earlier in this
# notebook is miss_dup_values; missing_values_table is not defined here and raised a NameError.
miss_dup_values(zillow_df)

In [None]:
# confirm that we don't have nulls
# (show_counts replaces the null_counts argument, which was deprecated and later removed from pandas)
zillow_df.info(show_counts=True)

In [None]:
zillow_df.describe().T

**takeaways**
there are houses with 0 bedrooms, bathrooms and calculatedfinishedsquarefeet = 1 ??


In [None]:
# For every column, show the counts of its first 26 distinct values in sorted order.
cols = list(zillow_df.columns)
for col in cols:
    counts_by_value = zillow_df[col].value_counts().sort_index()
    print(col, counts_by_value.head(26), sep='\n')

In [None]:
#bedrooms, yearbuilt and fips can be converted to int
zillow_df[['bedroomcnt', 'yearbuilt', 'fips']] = zillow_df[['bedroomcnt', 'yearbuilt', 'fips']].astype(int)

In [None]:
zillow_df.info()

In [None]:
zillow_df[zillow_df.calculatedfinishedsquarefeet < 10]

In [None]:
#this makes no sense: a house with more than one bedroom that has less than 10 square feet

In [None]:
zillow_df.nunique()

In [None]:
zillow_df[zillow_df.calculatedfinishedsquarefeet < 500].count()

In [None]:
zillow_df[(zillow_df.bedroomcnt == 0) & (zillow_df.bathroomcnt == 0)].sort_values('calculatedfinishedsquarefeet').head(50)

**takeaways**
- dropped duplicated rows
- dropped rows that have nulls
- there are houses with 0 bedrooms, bathrooms and calculatedfinishedsquarefeet = 1,2, ??



3. Store all of the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values in your wrangle.py file. Name your final function wrangle_zillow.