# Replacing Bad Values

This is US wind turbine data. The numeric fields use -9999 as a null value for missing data. 
Leaving -9999 in numeric fields badly distorts summary statistics such as totals and means,
so we should replace it with a real missing-value marker such as `np.nan`, which the pandas `sum` and `mean` functions skip automatically.  
Below, the mean of the `t_hh` column is computed before and after replacing -9999 with `np.nan`; the two results are very different. 
janitor's `find_replace` method makes the replacement easy. 

In [1]:
import pandas as pd
import janitor
import numpy as np

In [64]:
# Source: us_wind.csv from the rfordatascience/tidytuesday repository (2018-11-06).
# No na_values are passed, so the -9999 sentinels load as ordinary numbers.
WIND_CSV_URL = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-06/us_wind.csv"
wind = pd.read_csv(WIND_CSV_URL)

In [65]:
# Preview the first five rows of the raw frame; note the -9999.0 entries.
wind.head(n=5)

Unnamed: 0,case_id,faa_ors,faa_asn,usgs_pr_id,t_state,t_county,t_fips,p_name,p_year,p_tnum,...,t_hh,t_rd,t_rsa,t_ttlh,t_conf_atr,t_conf_loc,t_img_date,t_img_srce,xlong,ylat
0,3073429,missing,missing,4960,CA,Kern County,6029,251 Wind,1987,194,...,-9999.0,-9999.0,-9999.0,-9999.0,2,3,1/1/2012,NAIP,-118.360725,35.083778
1,3071522,missing,missing,4997,CA,Kern County,6029,251 Wind,1987,194,...,-9999.0,-9999.0,-9999.0,-9999.0,2,3,1/1/2012,NAIP,-118.361168,35.081512
2,3073425,missing,missing,4957,CA,Kern County,6029,251 Wind,1987,194,...,-9999.0,-9999.0,-9999.0,-9999.0,2,3,1/1/2012,NAIP,-118.36042,35.084709
3,3071569,missing,missing,5023,CA,Kern County,6029,251 Wind,1987,194,...,-9999.0,-9999.0,-9999.0,-9999.0,2,3,7/31/2016,Digital Globe,-118.364029,35.079418
4,3005252,missing,missing,5768,CA,Kern County,6029,251 Wind,1987,194,...,-9999.0,-9999.0,-9999.0,-9999.0,2,3,11/23/2017,Digital Globe,-118.354286,35.085594


In [66]:
# Mean of t_hh while the -9999 sentinels are still counted as real values.
wind["t_hh"].mean()

-1069.986537767466

In [68]:
# The dataset marks missing numeric values with -9999. Name the sentinel once
# instead of repeating the magic number in every call.
MISSING_SENTINEL = -9999.0

# Columns in which the sentinel is swapped for NaN.
sentinel_columns = [
    'usgs_pr_id',
    'p_tnum',
    'p_cap',
    't_cap',
    't_hh',
    't_rd',
    't_rsa',
    't_ttlh',
]

# Call find_replace once per column, in the listed order, feeding each
# returned frame into the next call -- equivalent to chaining the calls by hand.
wind2 = wind
for column in sentinel_columns:
    wind2 = wind2.find_replace(column, {MISSING_SENTINEL: np.nan})

In [69]:
# Preview the first five rows after the sentinel replacement.
wind2.head(n=5)

Unnamed: 0,case_id,faa_ors,faa_asn,usgs_pr_id,t_state,t_county,t_fips,p_name,p_year,p_tnum,...,t_hh,t_rd,t_rsa,t_ttlh,t_conf_atr,t_conf_loc,t_img_date,t_img_srce,xlong,ylat
0,3073429,missing,missing,4960.0,CA,Kern County,6029,251 Wind,1987,194,...,,,,,2,3,1/1/2012,NAIP,-118.360725,35.083778
1,3071522,missing,missing,4997.0,CA,Kern County,6029,251 Wind,1987,194,...,,,,,2,3,1/1/2012,NAIP,-118.361168,35.081512
2,3073425,missing,missing,4957.0,CA,Kern County,6029,251 Wind,1987,194,...,,,,,2,3,1/1/2012,NAIP,-118.36042,35.084709
3,3071569,missing,missing,5023.0,CA,Kern County,6029,251 Wind,1987,194,...,,,,,2,3,7/31/2016,Digital Globe,-118.364029,35.079418
4,3005252,missing,missing,5768.0,CA,Kern County,6029,251 Wind,1987,194,...,,,,,2,3,11/23/2017,Digital Globe,-118.354286,35.085594


In [70]:
# Mean of t_hh after the sentinel replacement; mean() skips NaN by default.
wind2["t_hh"].mean()

77.31203064391