# annual_outliers_py

---

## Determine growth and decline outliers for hpi change over time for each year individually

In [6]:
# Dependencies and Setup
import pandas as pd

# Import hpi-over-time data from cleaned file
file_path = "data/cleaned data/cleaned_hpi_price_data.csv"
hpi_df = pd.read_csv(file_path)

# Check that data seems to have loaded correctly
hpi_df.head()

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI from 2012,HPI with 2012 base,normalized_sale_price,zip_code,normalized_sale_price_2012,result
0,1001.0,1984,.,100.0,279.95,0.357207,58369.68857,1001.0,163405.943152,58369.68857
1,1001.0,1985,16.00,116.0,279.95,0.41436,67708.838742,1001.0,163405.943152,67708.838742
2,1001.0,1986,14.21,132.48,279.95,0.473227,77328.163418,1001.0,163405.943152,77328.163418
3,1001.0,1987,21.08,160.41,279.95,0.572995,93630.817436,1001.0,163405.943152,93630.817436
4,1001.0,1988,17.63,188.68,279.95,0.673977,110131.928394,1001.0,163405.943152,110131.928394


### Inspect the data for some general properties

In [7]:
# Find the most recent year that has data
last_year = hpi_df['Year'].max()

# Find the oldest year that has data
first_year = hpi_df['Year'].min()

# Count how many unique years have data
year_count = hpi_df['Year'].unique().size

# Count how many unique zip codes and collect them into a list for iterating
zip_count = hpi_df['zip_code'].unique().size
zip_list = hpi_df['zip_code'].unique()

# Display some information that has been inspected
print(f"There is data for {year_count} years, from {first_year} to {last_year}, covering {zip_count} unique zip codes")

print(hpi_df.dtypes)

# NOTE:  Different zip codes have different years available

There is data for 48 years, from 1975 to 2022, covering 13000 unique zip codes
Five-Digit ZIP Code           float64
Year                            int64
Annual Change (%)              object
HPI                           float64
HPI from 2012                 float64
HPI with 2012 base            float64
normalized_sale_price         float64
zip_code                      float64
normalized_sale_price_2012    float64
result                        float64
dtype: object


In [8]:
# Create a new dataframe that removes rows with no Annual Change value
annual_growth_df = hpi_df.loc[hpi_df['Annual Change (%)'] != '.']

# Change Annual Change (%) into a number for doing math operations
annual_growth_df['Annual Change (%)'] = annual_growth_df['Annual Change (%)'].astype(float)

# Sort the dataframe by year ascending so calculated results will tabulate in an intuitive way
annual_growth_df = annual_growth_df.sort_values(by=['Year'], ascending=True)

# Check that the data type change was successful
annual_growth_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annual_growth_df['Annual Change (%)'] = annual_growth_df['Annual Change (%)'].astype(float)


Five-Digit ZIP Code           float64
Year                            int64
Annual Change (%)             float64
HPI                           float64
HPI from 2012                 float64
HPI with 2012 base            float64
normalized_sale_price         float64
zip_code                      float64
normalized_sale_price_2012    float64
result                        float64
dtype: object

---

## Determine growth and decline outliers for each year individually

In [9]:
# Create empty dataframes to store results in
annual_outlier_zips_df = pd.DataFrame()
annual_outlier_values_df = pd.DataFrame()

# Collect the years in the dataframe that contain Annual Change data into a list for iterating
years_list = annual_growth_df['Year'].unique()

# Iterate through the years and deterimine outliers for that year only
for year in years_list:
    
    # Collect the current year only for each iteration
    current_year_df = annual_growth_df.loc[annual_growth_df['Year']==year]
    
    # Determine outliers using upper and lower bounds for that year and collect their zip codes
    quartiles = current_year_df['Annual Change (%)'].quantile([0.25, 0.50, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    
    # Collect the zip codes that are outliers
    outlier_zips = current_year_df.loc[(current_year_df['Annual Change (%)'] > upper_bound) 
                                     | (current_year_df['Annual Change (%)'] < lower_bound)]['zip_code']
    
    # This series ends up with name 'zip_code', but gets changed to the current year for appending to the results dataframe
    outlier_zips.name = year
    
    # Add outlier zip codes to dataframe under the current year
    annual_outlier_zips_df = pd.concat([annual_outlier_zips_df, outlier_zips], axis=1)
    
    # Collect the Annual Change values for the outlying zip codes
    outlier_values = current_year_df.loc[(current_year_df['Annual Change (%)'] > upper_bound) 
                                     | (current_year_df['Annual Change (%)'] < lower_bound)]['Annual Change (%)']
    
    # This series ends up with name 'zip_code', but gets changed to the current year for appending to the results dataframe
    outlier_values.name = year
    
    # Add outlier values to dataframe under the current year
    annual_outlier_values_df = pd.concat([annual_outlier_values_df, outlier_values], axis=1)
      
# Save the results to a file
annual_outlier_zips_df.to_csv('data/analysis/annual_outlier_zips.csv')
annual_outlier_values_df.to_csv('data/analysis/annual_outlier_values.csv')

# Inspect the results
print(annual_outlier_zips_df.head())
print(annual_outlier_values_df.head())

           1976  1977  1978  1979  1980  1981  1982  1983  1984  1985  ...  \
155792  33156.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
389923  91301.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
170785  34982.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
423155  95070.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
333135  75243.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

        2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  
155792   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
389923   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
170785   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
423155   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
333135   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  

[5 rows x 47 columns]
         1976  1977  1978  1979  1980  1981  1982  1983  1984  1985  ...  \
155792 -10.70   Na