In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

#import my modules
import acquire as a
import wrangle as w

## Exercises II

As a customer analyst for Telco, you want to know who has spent the most money with the company over their lifetime. You have monthly charges and tenure, so you think you will be able to use those two attributes as features to estimate total charges. You need to do this within an average of $5.00 per customer.

In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Telco data from the telco_churn database in the Codeup database server.

**1. Acquire customer_id, monthly_charges, tenure, and total_charges from the telco_churn database for all customers with a 2-year contract.**

In [None]:
#acquire my df using my function but this gives me all the columns 
df = a.get_telco()

In [None]:
#check info
df.head()

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
#checking the information in this column
df.contract_type_id.value_counts()

In [None]:
df.contract_type.value_counts()

In [None]:
#keep only the customers with a 2 year contract (contract_type_id == 3)
#and only the four columns needed for this exercise, in a single .loc selection
telco_df = df.loc[
    df.contract_type_id == 3,
    ['customer_id', 'monthly_charges', 'tenure', 'total_charges'],
]
telco_df.head()

In [None]:
telco_df.shape

____________

In [None]:
#another way to do it: filter the rows and columns in SQL instead of in pandas.
#the query below selects only the four columns of interest from the customers table
#for rows where contract_type_id = 3, and is passed to a generic helper in acquire
#(presumably the first argument is the database name -- confirm against acquire.py)
query = """
        SELECT 
            customer_id, 
            monthly_charges, 
            tenure, 
            total_charges
        FROM customers
        WHERE contract_type_id = 3;
        """

df2 = a.get_data_from_sql('telco_churn', query)

In [None]:
df2.head(1)

In [None]:
df2.shape

In [None]:
df2.info()

In [None]:
df2.describe()

___________________

**Takeaways**
- customer_id and total_charges are object data type; total_charges should be a float type.
- tenure has a minimum value of 0, which means those are new customers.
- I don't seem to have any Null values.

_____________________

**2. Using your acquired Telco data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.**

In [None]:
#checking information of the columns. we noticed  total_charges is object type
telco_df.info()

In [None]:
#drop duplicates
telco_df = telco_df.drop_duplicates()

In [None]:
# Find the total number of Null values in each column of our DataFrame.
telco_df.isnull().sum()

In [None]:
# Check for any Null values in each column of our DataFrame.

telco_df.isnull().any()

In [None]:
# Return the names for any columns in our DataFrame with any Null values.
# This inspects telco_df (the two-year-contract frame being cleaned in this section),
# to stay consistent with the two null checks directly above.

telco_df.columns[telco_df.isnull().any()]

In [None]:
#trying to change total_charges to float
#telco_df['total_charges'].astype('float')
#when we run the code to convert to float, we get an error (could not convert string to float: ''), there is a space 
#in some values

In [None]:
#these are the observations that have a single space in total_charges
#these customers are new customers because they have a tenure of 0 and have not made their first payment. 
telco_df[telco_df['total_charges']== ' ']

In [None]:

# I will put a '0' in total_charges only for the customers whose value is a single blank space.
# .loc with a row mask and a column label changes just that column, so the other
# columns of those rows are left untouched.
telco_df.loc[telco_df['total_charges'] == ' ', 'total_charges'] = '0'

In [None]:
#checking the info
telco_df[telco_df['total_charges']== '0']

In [None]:
#now I can convert total_charges to float
telco_df['total_charges']= telco_df['total_charges'].astype('float')

In [None]:
telco_df.info()

In [None]:
telco_df.shape

____________

What if, for the customers whose tenure is 0, I change the tenure to 1, since they have probably been customers for about a month? Their blank total charges can then be filled with one month of charges.

In [None]:
df2.tenure.value_counts().sort_index().head()

In [None]:
# Replace any tenures of 0 with 1.

df2.tenure = df2.tenure.replace(0, 1)

In [None]:

# Validate my tenure count for value 1; I went from two to 12.

df2.tenure.value_counts().sort_index().head()

In [None]:
# These observations also need total_charges handled. 
# The other tenure 1 observations have same monthly and total charges.

df2[df2.tenure == 1]

In [None]:
# Replace the blank total_charges with the monthly_charge for tenure == 1.

df.total_charges = np.where(df.total_charges==' ', df.monthly_charges, df.total_charges)

In [None]:
# Validate my changes.

df[df.tenure == 1]

**3. End with a wrangle.py file that contains the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values. Name your final function wrangle_telco.**

In [None]:

# Handle the data type for total_charges.

df.total_charges = df.total_charges.astype(float)

In [None]:
# Validate my data type conversion.

df.info()

____________

In [None]:
#creating my functions

In [None]:
def clean_telco(df):
    '''
    Reduce the previously acquired telco DataFrame to the data needed for this exercise.

    Keeps customer_id, monthly_charges, tenure, and total_charges for the customers
    with a 2-year contract (contract_type_id == 3), drops duplicate observations,
    and converts total_charges to a float type. Blank total_charges values
    (an empty string or any whitespace-only string) are treated as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns customer_id, monthly_charges, tenure,
        total_charges, and contract_type_id.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame with the four columns listed above. The rows keep
        their original index labels and the input DataFrame is not modified.
    '''
    keep_cols = ['customer_id', 'monthly_charges', 'tenure', 'total_charges']
    # one .loc call selects the 2-year-contract rows and the wanted columns together
    telco_df = df.loc[df.contract_type_id == 3, keep_cols].drop_duplicates()
    # going through str lets the same code handle a column that is already numeric;
    # stripping first means '', ' ' and longer runs of spaces are all recognised as blank
    charges = telco_df['total_charges'].astype(str).str.strip().replace('', '0')
    # assign returns a new frame, so only the total_charges column is rewritten
    return telco_df.assign(total_charges=charges.astype('float'))

In [None]:
def split_data(df):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    20% of the rows are held out as test; 30% of the remaining rows become
    validate and the rest become train. Both splits use random_state=123.
    The shape of each piece is printed before the three frames are returned
    in the order train, validate, test.
    '''
    # first cut: hold out the test set
    remainder, test = train_test_split(df, test_size=.2, random_state=123)
    # second cut: divide what is left into train and validate
    train, validate = train_test_split(remainder, test_size=.3, random_state=123)
    for label, part in (('train', train), ('validate', validate), ('test', test)):
        print(f'{label} -> {part.shape}')
    return train, validate, test

In [None]:
def wrangle_telco():
    '''
    Acquire the telco data and return it as a cleaned DataFrame.

    The data is acquired with get_telco from the acquire module (imported in this
    notebook under the alias `a`) and then handed to clean_telco, which keeps
    customer_id, monthly_charges, tenure, and total_charges for customers with a
    2-year contract, drops duplicate observations, and converts total_charges
    to a float type.

    Returns
    -------
    pandas.DataFrame
        The cleaned telco DataFrame produced by clean_telco.
    '''
    # use the alias `a` the acquire module is imported under; the bare name
    # `acquire` is not defined in this notebook
    return clean_telco(a.get_telco())
    

In [None]:
#using my functions that are in wrangle.py

In [2]:
df = w.wrangle_telco()

In [3]:
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0016-QLJIS,90.45,65,5957.9
1,0017-DINOC,45.2,54,2460.55
2,0019-GFNTW,45.05,56,2560.1
3,0056-EPFBG,39.4,20,825.4
4,0078-XZMHT,85.15,72,6316.2


In [4]:
df.shape

(1695, 4)

In [5]:
train, validate, test = w.split_data(df)

train -> (949, 4)
validate -> (407, 4)
test -> (339, 4)


## Exercises III - Challenge

Let's set up an example scenario as perspective for our regression exercises using the Zillow dataset.

As a Codeup data science graduate, you want to show off your skills to the Zillow data science team in hopes of getting an interview for a position you saw pop up on LinkedIn. You thought it might look impressive to build an end-to-end project in which you use some of their Kaggle data to predict property values using some of their available features; who knows, you might even do some feature engineering to blow them away. Your goal is to predict the values of single unit properties using the observations from 2017.

In these exercises, you will complete the first step toward the above goal: acquire and prepare the necessary Zillow data from the zillow database in the Codeup database server.

**1. Acquire bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, and fips from the zillow database for all 'Single Family Residential' properties.**

In [None]:
zillow_df= a.get_new_zillow()

In [None]:
#I'm keeping a backup because it takes a long time to get the db from sql
#.copy() makes an independent DataFrame; a plain assignment would only be a second
#name for the same object, so changes to zillow_df would show up in z_df as well
z_df = zillow_df.copy()

In [None]:
zillow_df.head()

In [None]:

zillow_df.shape

In [None]:
zillow_df.info()

In [None]:
# show_counts replaces the null_counts argument, which newer pandas versions no longer accept
zillow_df.info(show_counts=True)

In [None]:
# summary statistics for the Zillow data (df holds the telco data at this point)
zillow_df.describe().T

2. Using your acquired Zillow data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

In [None]:
# checking the info 
zillow_df.info()

In [None]:
# this shows us the non-null counts for every column
# (show_counts replaces the null_counts argument, which newer pandas versions no longer accept)
zillow_df.info(show_counts=True)

In [None]:
def missing_values_table(df):
    '''
    Report missing values per column and duplicate rows for a DataFrame.

    Prints the number of columns, how many of them have missing values, and the
    count and percentage of duplicate rows. Returns a DataFrame indexed by column
    name with 'Missing Values' and '% of Total Values', limited to the columns
    that have missing values and sorted by percentage, highest first.
    '''
    n_rows = len(df)

    # missing values per column, as a count and as a share of all rows
    null_counts = df.isnull().sum()
    null_pct = 100 * null_counts / n_rows

    # fully duplicated rows, as a count and as a share of all rows
    n_dupes = df.duplicated().sum()
    dupes_pct = 100 * n_dupes / n_rows

    # build the table with its final column names directly
    summary = pd.DataFrame({'Missing Values': null_counts,
                            '% of Total Values': null_pct})

    # drop the columns with nothing missing, then order by share of missing values
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False).round(1)

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    print(f"There are {n_dupes} duplicate rows that represents {round(dupes_pct, 2)}% of total Values")

    return summary

In [None]:
missing_values_table(zillow_df)

In [None]:
zillow_df.duplicated().sum()

In [None]:
len(zillow_df)

In [None]:
# Find the total number of Null values in each column of the Zillow DataFrame.
zillow_df.isnull().sum()

3. Store all of the necessary functions to automate your process from acquiring the data to returning a cleaned dataframe with no missing values in your wrangle.py file. Name your final function wrangle_zillow.