In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Data source: https://www.kaggle.com/muonneutrino/us-census-demographic-data
# Disable display truncation so every row and column of a frame is shown.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
census = pd.read_csv('acs2015_county_data.csv')
census.head()

In [None]:
# Column dtypes, to see which columns are numeric and which are not.
census.dtypes

In [None]:
# Summary statistics of the numeric columns.
census.describe()

In [None]:
# Number of missing values per column.
census.isnull().sum()

In [None]:
# Drop every row that has at least one missing value.
# Rebinding (instead of inplace=True) keeps this cell idempotent, and resetting
# the index removes the label gaps left by the dropped rows, so frames built
# positionally from this data later on still line up when joined on the index.
census = census.dropna().reset_index(drop=True)

## Investigating the Income Variable in more detail

In [None]:
# Quick pandas histogram of the Income column.
census['Income'].hist()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of picture.
sns.histplot(x=census['Income'], kde=True, stat='density')
plt.show()

In [None]:
# Pass the values as `x` explicitly: a bare positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot. With x= the box is drawn along the horizontal axis.
sns.boxplot(x=census['Income'])

In [None]:
# scipy's describe() reports summary statistics for the Income values.
from scipy import stats
stats.describe(census['Income'])

# Some Options for Standardizing Data

### Dropping Outliers

In [None]:
# Outlier fences: 1.5 * IQR below the first and above the third quartile.
q1, q3 = np.percentile(census['Income'], [25, 75])
iqr = q3 - q1
lower_limit = q1 - 1.5 * iqr
upper_limit = q3 + 1.5 * iqr

In [None]:
# Keep only the Income values strictly inside the two fences.
income = census['Income']
inside_fences = (income > lower_limit) & (income < upper_limit)
income_pruned = income[inside_fences]

In [None]:
# Pass the values as `x` explicitly: a bare positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=income_pruned)

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of picture.
sns.histplot(x=income_pruned, kde=True, stat='density')
plt.show()

In [None]:
# Summary statistics of Income after the outliers were removed.
stats.describe(income_pruned)

### Log Transformation

In [None]:
# Natural-log transform of Income.
income_log = np.log(census['Income'])
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of picture.
sns.histplot(x=income_log, kde=True, stat='density')
plt.show()

In [None]:
# Pass the values as `x` explicitly: a bare positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=income_log)

In [None]:
# Summary statistics of the log-transformed Income.
stats.describe(income_log)

### Box Cox transformation

In [None]:
# Print the scipy docstring for boxcox (interactive reference only).
help(stats.boxcox)

In [None]:
# Box-Cox transform of Income. The call returns two values: the transformed
# data (income_bc) and, per the scipy docs, the lambda it fitted (lmbda).
income_bc, lmbda = stats.boxcox(census['Income'])
income_bc

In [None]:
# Summary statistics of the Box-Cox-transformed Income.
stats.describe(income_bc)

In [None]:
# Pass the values as `x` explicitly: a bare positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=income_bc)
plt.show()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement that draws the same kind of picture.
sns.histplot(x=income_bc, kde=True, stat='density')
plt.show()

### Scikit Learn StandardScaler
Standard Scaler changes each *feature column*. It transforms the data to have mean 0 and standard deviation 1 (e.g. normal distribution becomes a standard normal distribution). It is often used in classification problems. StandardScaler and other scalers that work feature-wise are preferred when you are interested in the relationship between different variables (for example head length and body length of crocodiles).

In [None]:
from sklearn.preprocessing import StandardScaler
# Numeric columns only; rows with missing values are dropped before fitting.
X = census.select_dtypes(include = np.number).dropna()
# Standardizing data: the scaler is fitted on X and then applied to it.
transformer = StandardScaler().fit(X)
# transform() returns a bare array, so re-attach X's column names and index
# to keep the scaled values identifiable. Preview the first rows only.
X_standardized = pd.DataFrame(transformer.transform(X), columns=X.columns, index=X.index)
X_standardized.head()

### Scikit Learn Normalizer

Normalizer rescales each sample (row) individually (!) to unit norm (L2 by default), so for non-negative data every value ends up between 0 and 1.
This means that the distribution of the data at the *feature* level will change drastically. Normalizer and other scalers that work sample-wise should be preferred when you are interested in how one observation is linked to another, instead of exploring relationships between variables.

Example: when we are working with text data and want to compare documents with one another, e.g. which topic each document covers or what the word frequencies in each document are.

In [None]:
# Import Normalizer and print its docstring (interactive reference only).
from sklearn.preprocessing import Normalizer
help(Normalizer)

In [None]:
# Numeric columns only.
X = census.select_dtypes(include = np.number)
# Normalizing data: the transformer is fitted on X and then applied to it.
transformer = Normalizer().fit(X)
# transform() returns a bare array, so re-attach X's column names and index
# to keep the normalized values identifiable. Preview the first rows only.
X_normalized = pd.DataFrame(transformer.transform(X), columns=X.columns, index=X.index)
X_normalized.head()

### Other options: Creating a categorical variable out of a numerical one

In [None]:
# Bucket Income into three equal-width bins and count the rows per bin.
census['IncomeRange'] = pd.cut(census['Income'], bins=3)
census['IncomeRange'].value_counts()

In [None]:
# Bucket Income at the given quantile edges (roughly thirds) and count the
# rows per bucket.
quantile_edges = [0, 0.3333, 0.6666, 1]
census['IncomePercentile'] = pd.qcut(census['Income'], q=quantile_edges)
census['IncomePercentile'].value_counts()

In [None]:
# Flag rows that combine high unemployment with the lowest income bucket.
# NOTE: the ACS `Unemployment` column is a percentage (values like 7.6), not a
# fraction, so the cut-off has to be 7 rather than 0.07 - the latter would flag
# practically every row.
HIGH_UNEMPLOYMENT_PCT = 7.0
is_high_unemployment = census['Unemployment'] > HIGH_UNEMPLOYMENT_PCT
# Category code 0 is the first (lowest) IncomePercentile bucket.
is_lowest_income = census['IncomePercentile'].cat.codes == 0
census['HighUnemploymentLowIncome'] = np.where(is_high_unemployment & is_lowest_income, 1, 0)
census['HighUnemploymentLowIncome'].value_counts()

### Checking Correlations

In [None]:
# Correlate the numeric columns only: the frame also holds string and
# categorical columns, which DataFrame.corr() rejects in pandas >= 2.0
# (older versions silently skipped them, so the picture is unchanged there).
numeric_corr = census.select_dtypes(include = np.number).corr()
plt.figure(figsize=(15,15))
sns.heatmap(numeric_corr)

Features with high correlation:
- TotalPop, Men and Women, and Citizen and Employed
- Poverty and ChildPoverty

Remember: we want high correlation with the target variable, but low correlation of the features to one another!

### Encoding Categorical Variables

In [None]:
# extracting only categorical variables
cat = census.select_dtypes(include = 'object')
print(cat.columns)

In [None]:
# Number of rows per State.
cat['State'].value_counts()

#### LabelEncoder

In [None]:
# encoding one variable at a time
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(cat['State'])

In [None]:
# Show the learned classes, their integer codes, and the class -> code mapping.
state_labels = le.classes_
state_ids = le.transform(state_labels)
print(state_labels)
print(state_ids)
dict(zip(state_labels, state_ids))

In [None]:
# Round trip: state name -> integer code, then integer code -> state name.
print(le.transform(['Wyoming']))
print(le.inverse_transform([51]))

The **LabelEncoding** method creates a single numerical column in which each number is mapped to a certain category. ML models can deal with this, since we are now looking at a numerical column - on the downside, algorithms might interpret it as a numerical (ordered, linear) variable instead of a categorical one. Another approach is OneHotEncoding, which gives us a separate dummy (0/1) column for each category.

#### OneHotEncoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

# fit_transform returns a sparse matrix, so densify it. Reuse cat's index:
# without it the frame gets a fresh 0..n-1 index, and joining it back to `cat`
# (whose labels have gaps after the earlier dropna) would pair up wrong rows.
enc_df = pd.DataFrame(enc.fit_transform(cat).toarray(), index=cat.index)

In [None]:
# Preview the one-hot encoded columns.
enc_df.head()

In [None]:
# Row-wise total of the one-hot columns, i.e. how many indicators are set per row.
enc_df.sum(axis = 1)

In [None]:
# merge with main df bridge_df on key values
cat = cat.join(enc_df)
cat.head()

#### Get Dummies

In [None]:
# pandas alternative: expand only the State column into indicator columns.
pd.get_dummies(cat, columns=['State'])

### Storing models, encodings etc: Pickle

In [None]:
import pickle
# Pickle uses a binary protocol, hence mode 'wb' (write binary).
# The context manager closes the file even if dump() raises.
with open('encoding_state.pkl', 'wb') as pkl_file:
    pickle.dump(enc, pkl_file)

In [None]:
# Print the docstring of pickle.dump (interactive reference only).
help(pickle.dump)

In [None]:
# Read the stored encoder back ('rb' = read binary); the context manager
# closes the file afterwards.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('encoding_state.pkl', 'rb') as pkl_file:
    model = pickle.load(pkl_file)

In [None]:
# Display the object that was loaded from the pickle file.
model

In [None]:
# The loaded encoder is already fitted: call transform() so the stored category
# mapping is applied as-is (fit_transform would refit and throw it away).
# The raw categorical columns are rebuilt from `census`, because `cat` may have
# been widened with encoded columns earlier in the notebook.
model.transform(census.select_dtypes(include = 'object'))