In [None]:
# 'Unnamed: 0' duplicates the CaseOrder column, so remove it from the frame
df = df.drop(labels=['Unnamed: 0'], axis='columns')

In [None]:
# map each generic survey column (item1..item8) to a descriptive column name
survey_dict = dict(
    item1='timely_responses',
    item2='timely_fixes',
    item3='timely_replacements',
    item4='reliability',
    item5='options',
    item6='respectful_response',
    item7='courteous_exchange',
    item8='evidence_of_active_listening',
)

# apply the mapping; columns not listed in survey_dict keep their names
df = df.rename(columns=survey_dict)

In [None]:
# Collect the rows whose zip code is shorter than 5 characters so the
# affected records can be inspected.

# get a list of all unique zip codes
zips = list(df['Zip'].unique())

# Select every short-zip row with a single boolean mask. This replaces the
# previous loop that grew the frame with DataFrame.append, a method that was
# removed in pandas 2.0 and that re-copied the frame on every iteration.
# Rows are returned in their original df order rather than grouped by zip.
short_zip_mask = df['Zip'].astype(str).str.len() < 5
incorrect_zips = df.loc[short_zip_mask].copy()

# function to fill zip codes less than 5 characters long with leading zeros
def fill_zeros(x):
    """Return ``x`` as a string left-padded with zeros to at least 5 characters.

    Parameters
    ----------
    x : str or any value convertible with ``str()``
        The zip code to pad.

    Returns
    -------
    str
        The zero-padded text. Values that already have 5 or more characters
        are returned unchanged (previously values of 6 or more characters
        fell through to ``pass`` and came back as ``None``).
    """
    # str.zfill never truncates, so longer values pass through untouched
    return str(x).zfill(5)
    
# zero filling works on text, so cast the zip codes to strings first
zip_text = df['Zip'].astype(str)

# run every zip code through fill_zeros and store the result back on the frame
df['Zip'] = zip_text.apply(fill_zeros)


 
* For null `Children` values, I randomly assigned 0 or 1 child to the households with missing data. Over half of the customers had one of these two values for the number of children, with 0 being the most common value and 1 being the median value, so these two values were assigned at random. This eliminated the null values while keeping a distribution of children values similar to the one we had to begin with.

In [None]:
# flag the rows where the number of children is missing
children_missing = df['Children'].isna()

# draw a 0 or 1 at random for each flagged row and write the draws into those rows
df.loc[children_missing, 'Children'] = np.random.choice([0, 1], int(children_missing.sum()))

In [None]:
# candidate ages: the integers 18 through 88 (np.arange excludes the stop value)
# NOTE(review): confirm whether 89 was meant to be a possible age
age_range = np.arange(18, 89)

# flag the rows with a missing age and fill each one with a random candidate age
age_missing = df['Age'].isna()
df.loc[age_missing, 'Age'] = np.random.choice(age_range, int(age_missing.sum()))

In [None]:
# For each Yes/No column, the probabilities of drawing "Yes" and "No" (in that order).
# NOTE(review): these weights are hard-coded; presumably they were measured from
# the observed Yes/No distribution of each column — verify against the data.
yes_no_weights = [
    ('Techie', [0.167088, .832912]),
    ('Phone', [.91, 0.09]),
    ('TechSupport', [0.374514, 0.625486]),
]

# fill the missing entries of each column with weighted random Yes/No draws
for column, weights in yes_no_weights:
    is_missing = df[column].isna()
    df.loc[is_missing, column] = np.random.choice(['Yes', 'No'], int(is_missing.sum()), p=weights)

In [None]:
# replace the missing values of Income, Tenure and Bandwidth_GB_Year
# with the median of the respective column
for column in ['Income', 'Tenure', 'Bandwidth_GB_Year']:
    is_missing = df[column].isna()
    df.loc[is_missing, column] = df[column].median()

In [None]:
# group the columns by the data type they should be converted to
string_cols = ['Customer_id', 'Interaction', 'City', 'State', 'County', 'Zip']
float_cols = ['Lat', 'Lng', 'Income', 'Outage_sec_perweek', 'Tenure', 'MonthlyCharge',
              'Bandwidth_GB_Year']
int_cols = ['Population', 'Children', 'Age', 'Email', 'Contacts', 'Yearly_equip_failure',
            'timely_responses', 'timely_fixes', 'timely_replacements', 'reliability', 'options',
            'respectful_response', 'courteous_exchange', 'evidence_of_active_listening']
category_cols = ['Area', 'Timezone', 'Job', 'Education', 'Employment', 'Marital', 'Gender',
                 'Churn', 'Techie', 'Contract', 'Port_modem', 'Tablet', 'InternetService',
                 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                 'PaymentMethod']

# build the column -> dtype mapping from the groups above
column_types = {
    **dict.fromkeys(string_cols, 'string'),
    **dict.fromkeys(float_cols, float),
    **dict.fromkeys(int_cols, int),
    **dict.fromkeys(category_cols, 'category'),
}

# convert every listed column in one call
df = df.astype(column_types, copy=False)

# display the data types after conversion (get_dtypes is defined elsewhere in the notebook)
get_dtypes(df)

In [None]:
# get a list of all possible areas
areas = df['Area'].unique()

# A population of 0 is treated as a missing value in this cell, so only
# non-zero populations count as known data.
population_known = df['Population'] != 0

# Map each area to the mean of its KNOWN populations. The zero placeholders
# are excluded so that they do not pull the imputed value downward.
area_dict = {}
for area in areas:
    area_dict[area] = df.loc[population_known & (df['Area'] == area), 'Population'].mean()

# Fill the zero populations with the mean of their area, rounded to a whole
# number so an integer Population column is not pushed to float.
for area, mean_population in area_dict.items():
    if pd.isna(mean_population):
        # no known population for this area: nothing to impute from
        continue
    df.loc[~population_known & (df['Area'] == area), 'Population'] = int(round(mean_population))

In [None]:
# negative outage durations are replaced with 0; all other values are left as they are
outage_negative = df['Outage_sec_perweek'] < 0
df['Outage_sec_perweek'] = df['Outage_sec_perweek'].mask(outage_negative, 0)