### Import packages

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In this lab, we will review:

1. Plotting
2. Recoding
3. Correlation

### Read the data file

In [None]:
# Load the property dataset from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='clean_property_data.csv')
df.head(n=5)

In [None]:
# Number of rows in the full dataset
df.shape[0]

In [None]:
# Drop price outliers with the 1.5 x IQR rule.
var = df['price_000']

# Quartiles of the price; q_50 is the median.
q_25, q_50, q_75 = np.quantile(var, [0.25, 0.5, 0.75])

# Interquartile range; this should match scipy.stats.iqr().
iqr_calc = q_75 - q_25

# Keep only rows strictly inside (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
lower_bound = q_25 - 1.5 * iqr_calc
upper_bound = q_75 + 1.5 * iqr_calc
inside_bounds = (var > lower_bound) & (var < upper_bound)
df_new = df[inside_bounds]

# Rows remaining after the outliers are removed
df_new.shape[0]

## Recoding

##### Creating dummy SES variable

In [None]:
# Number of rows in each SES category, ordered by category value
ses_counts = df.groupby(by='ses').size()
ses_counts

In [None]:
# Same counts via value_counts(), which orders the categories by frequency rather than by value
df['ses'].value_counts()

In [None]:
# Binary SES indicator: 1 when ses is 5 or 6, otherwise 0
in_top_categories = df['ses'].isin([5, 6])
df['ses_dummy'] = in_top_categories.astype(int)

In [None]:
# Alternatively, build the same dummy with .loc:
# start every row at 0, then set the rows where ses is 5 or 6 to 1.
df['ses_dummy'] = 0  # create the dummy variable (was misspelled 'sea_dummy', which added a stray column)
df.loc[(df['ses'] == 5) | (df['ses'] == 6), 'ses_dummy'] = 1

# Preview a few rows instead of dumping the whole frame
df.head()

In [None]:
# Sanity check: how many rows fall into each value of ses_dummy
ses_dummy_counts = df.groupby(by='ses_dummy').size()
ses_dummy_counts

##### Create binary density variable

In [None]:
# Median of population density; used as the cut-off for the binary variable
pop_dens_values = df['pop_dens']
pop_dens_med = np.median(pop_dens_values)
pop_dens_med

In [None]:
# Binary density indicator: 1 when pop_dens is at or above the median, otherwise 0
at_or_above_median = df['pop_dens'] >= pop_dens_med
df['pop_dens_dummy'] = at_or_above_median.astype(int)

In [None]:
# Sanity check: how many rows fall into each value of pop_dens_dummy
pop_dens_dummy_counts = df.groupby(by='pop_dens_dummy').size()
pop_dens_dummy_counts

##### Recode the number of bathrooms variable

In [None]:
# Number of rows for each distinct number of bathrooms
num_bath_counts = df.groupby(by='num_bath').size()
num_bath_counts

In [None]:
# Top-coded bathroom category: values of 3 or more become 3, smaller values are kept as they are
three_or_more = df['num_bath'] >= 3
df['num_bath_cat'] = np.where(three_or_more, 3, df['num_bath'])

##### Standardizing the Area variable

<img src = 'standarization.jpg' width = 500>
source: https://365datascience.com/standardization/

In [None]:
### Summary statistics of the raw area variable (count, mean, std, min, quartiles, max)
df['area_m2'].describe()

In [None]:
# Mean and standard deviation of the raw area; both are reused to standardise it.
x = df['area_m2']

area_mean = np.mean(x)
# np.std defaults to ddof=0 (population SD), so it can differ slightly
# from the `std` reported by .describe(), which uses ddof=1.
area_SD = np.std(x)

print(f'area mean: {round(area_mean, 2)}')
print(f'area SD: {round(area_SD, 2)}')

In [None]:
# z-score: subtract the mean, then divide by the standard deviation
df['area_stand'] = df['area_m2'].sub(area_mean).div(area_SD)

In [None]:
# Summary statistics of the standardised area variable
df['area_stand'].describe()

In [None]:
# Check the standardisation: the standardised variable should have
# mean ~0 and SD ~1 (np.std uses ddof=0, matching how area_stand was built).
x = df['area_stand']

area_std_mean = np.mean(x)
area_std_SD = np.std(x)

# Labels name the standardised variable so this output cannot be
# confused with the raw-area mean/SD printed earlier.
print('standardized area mean:', round(area_std_mean, 2))
print('standardized area SD:', round(area_std_SD, 2))

##### Compare area histograms

In [None]:
# Histogram of the raw area, then of the standardised area (250 bins each).
# Each histogram is shown as its own figure.
for area_column in ('area_m2', 'area_stand'):
    plt.hist(df[area_column], bins=250)
    plt.show()

##### Compare density histograms

In [None]:
# Histogram of the continuous population density (250 bins) ...
density_values = df['pop_dens']
plt.hist(density_values, bins=250)
plt.show()

# ... followed by its 0/1 recode, drawn with matplotlib's default binning
density_flags = df['pop_dens_dummy']
plt.hist(density_flags)
plt.show()

## Scatterplots

##### Scatterplot price vs SES (6 categories)

In [None]:
# Price on the horizontal axis against (1) the original SES variable
# and (2) the binary SES dummy, one figure per comparison.
x = df['price_000']
y1, y2 = df['ses'], df['ses_dummy']

for y_values in (y1, y2):
    plt.scatter(x, y_values)
    plt.show()

### Two continuous variables

In [None]:
# List the column names currently in df, including the recoded variables added to it
df.columns

In [None]:
# We could also plot using seaborn (imported as `sns` in the imports cell at the top).

# Set the seaborn theme and font scale before the figure is created, so all
# of its settings are in place when the figure is built.
# NOTE: sns.set() is global; it also restyles every plot drawn after this cell.
sns.set(font_scale=2)

plt.figure(figsize=(12, 8))
sns.scatterplot(x='area_m2', y='price_000', data=df)

# Explicit font sizes below override the global font scale for this figure.
plt.xlabel('area(m2)', fontsize=12)
plt.ylabel('price(000)', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.title('Relationship between price and area', fontsize=18);

https://seaborn.pydata.org/generated/seaborn.scatterplot.html

## Pearson Correlation

In [None]:
# Create pairwise correlation matrix (default method is Pearson correlation).
# Only numeric/boolean columns are correlated: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises if any are present.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
numeric_columns.corr()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html

In [None]:
# Create correlation heatmap.
# 'Unnamed: 0' and 'id' are dropped first (presumably row identifiers - confirm),
# then only numeric/boolean columns are kept so .corr() does not raise on
# non-numeric columns under pandas >= 2.0.
heatmap_corr = (
    df.drop(['Unnamed: 0', 'id'], axis=1)
      .select_dtypes(include=['number', 'bool'])
      .corr()
)

plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_corr, cmap="crest");

https://seaborn.pydata.org/generated/seaborn.heatmap.html