# Feature Scaling

In [None]:
# Some of this code is from 1nhee/space on github

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Build two small toy frames: one used for fitting (train) and one that is
# only ever transformed (test). Both share the same three columns.
data_train = pd.DataFrame(dict(
    age=list(range(20, 41, 5)),
    income=list(range(50000, 90001, 10000)),
    height=[1.6, 1.7, 1.8, 1.5, 1.55],
))
data_test = pd.DataFrame(dict(
    age=[18, 5, 20, 30, 17],
    income=list(range(9000, 4999, -1000)),
    height=[1.2, 2.0, 1.9, 1.5, 1.6],
))


In [None]:
# Display the raw training frame (age, income, height).
data_train

In [None]:
# Display the raw test frame (age, income, height).
data_test

In [None]:

# Only these two columns are scaled; 'height' is not touched in this cell.
numeric_cols = ['age', 'income']

# Standardization. The scaler is fitted on the training frame only, and the
# same fitted scaler is then applied to both the training and the test frame.
scaler1 = StandardScaler()
scaler1.fit(data_train[numeric_cols])
std_cols = ['age_std_scale', 'income_std_scale']
data_train[std_cols] = scaler1.transform(data_train[numeric_cols])
data_test[std_cols] = scaler1.transform(data_test[numeric_cols])

# Min-max scaling, following the same fit-on-train / transform-both pattern.
scaler2 = MinMaxScaler()
scaler2.fit(data_train[numeric_cols])
minmax_cols = ['age_minmax_scale', 'income_minmax_scale']
data_train[minmax_cols] = scaler2.transform(data_train[numeric_cols])
data_test[minmax_cols] = scaler2.transform(data_test[numeric_cols])


In [None]:
# Display the training frame, now including the scaled columns.
data_train

In [None]:
# Display the test frame, scaled with the scalers fitted on the training data.
data_test

In [None]:
# We can peek at the mean and the standard deviation stored by the standard
# scaler (computed from the data it was fitted on, before the transformation).
print(scaler1.mean_)
print(scaler1.scale_)

In [None]:
# We can peek at the minimum and the maximum stored by the min-max scaler
# (computed from the training data before the transformation).
print(scaler2.data_min_)
print(scaler2.data_max_)

In [None]:
# Min-max scaling is simple enough to do "manually": start from the
# per-column minimum and maximum of the training data.
train_features = data_train[["age", "income"]]
min_ = train_features.min()
max_ = train_features.max()
print(min_)
print(max_)

In [None]:
# Manual min-max scaling of the training frame: (x - min) / (max - min),
# using the training minimum and maximum.
value_range = max_ - min_
scaled_train = (data_train[["age", "income"]] - min_) / value_range
data_train[["age_minmax_scale2", "income_minmax_scale2"]] = scaled_train
data_train

In [None]:
# Manual min-max scaling of the test frame. The minimum and maximum come from
# the training data, so test values may land outside [0, 1].
value_range = max_ - min_
scaled_test = (data_test[["age", "income"]] - min_) / value_range
data_test[["age_minmax_scale2", "income_minmax_scale2"]] = scaled_test
data_test

# Outlier detection

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Toy data for the z-score example: 'a' counts 1..16, 'b' starts with a short
# descending run (8, 6, 4, 2) and then rises in steps of 2 from 10 to 32.
data = pd.DataFrame(dict(
    a=list(range(1, 17)),
    b=[8, 6, 4, 2] + list(range(10, 33, 2)),
))

data


## Outlier detection using z-score

In [None]:

# Compute z-scores with the standard scaler: each column is centred on its
# mean and divided by its standard deviation, so the scaled value of a row
# is its z-score.
scaler = StandardScaler()
scaler.fit(data[['a', 'b']])
data[['a_scaled', 'b_scaled']] = scaler.transform(data[['a', 'b']])

# Message fixed: the closing parenthesis was missing from the printed text.
print("Data after z-score computation (standard scaler)")
data

In [None]:
# Treat every value whose |z-score| is not below 1.6 as an outlier and blank
# it out (NaN). The result goes into new columns rather than overwriting the
# originals, which is always the safer choice.
z_threshold = 1.6
for col in ("a", "b"):
    is_inlier = data[col + "_scaled"].abs() < z_threshold
    data[col + "_outliers_removed"] = data[col].where(is_inlier)
data

In [None]:
# Drop every row that contains at least one missing value.
# (axis="columns" would drop columns instead.)
data_rows_removed = data.dropna(axis="index")
data_rows_removed

## Outlier detection using IQR

In [None]:
# Fresh data containing a few obvious outliers (500 in 'a'; -100 and 100 in 'b').
data = pd.DataFrame(dict(
    a=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 500, 12, 13, 14, 15, 16],
    b=[8, 6, 4, 2, 10, -100, 14, 16, 18, 20, 22, 24, 26, 28, 30, 100],
))

# One row per requested quantile (index 0.25 and 0.75), one column per feature.
# Make sure you understand what is printed.
quantiles = data.quantile(q=[0.25, 0.75])
quantiles

In [None]:
# Interquartile range per column: third quartile minus first quartile.
first_quartile = quantiles.loc[0.25]
third_quartile = quantiles.loc[0.75]
IQR = third_quartile - first_quartile
IQR

In [None]:
# IQR outlier removal with alpha = 1.5: keep a value only if it lies within
# [Q1 - alpha * IQR, Q3 + alpha * IQR]; everything else becomes NaN.
# This time the columns are overwritten in place.
alpha = 1.5
for col in ("a", "b"):
    lower_bound = quantiles.loc[0.25, col] - IQR[col] * alpha
    upper_bound = quantiles.loc[0.75, col] + IQR[col] * alpha
    data[col] = data[col].where(data[col].between(lower_bound, upper_bound))

In [None]:
# Display the frame; values outside the IQR bounds now show as NaN.
data

In [None]:
# Drop the rows in which any column was blanked out as an outlier.
data_rows_removed = data.dropna(axis="index")
data_rows_removed

# Missing Value Handling

In [None]:
# Fresh data with missing entries: 'a' and 'b' have a few gaps, while 'c' is
# mostly missing (the same 8-value pattern, repeated twice).
c_pattern = [100, None, None, None, 5, None, None, None]
data = pd.DataFrame(dict(
    a=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None, 12, 13, 14, 15, 16],
    b=[8, 6, 4, 2, 10, None, 14, 16, 18, 20, 22, 24, 26, 28, 30, None],
    c=c_pattern * 2,
))



In [None]:
# Statistics can be computed on columns that contain missing data;
# the missing entries are simply ignored.
mean_a = data['a'].mean()
std_b = data['b'].std()
print(mean_a)
print(std_b)

## Removing columns whose number of missing values exceeds a threshold


In [None]:
# Count the missing values in each column.
missing_mask = data.isna()
num_missing = missing_mask.sum()
num_missing

In [None]:
# Keep only the columns that have at most `max_missing` missing values.
max_missing = 6
keep_column = num_missing <= max_missing
data_cols_removed = data.loc[:, keep_column]
data_cols_removed

## Imputation with mean and median

In [None]:
# Mean imputation: each gap is filled with the mean of its own column.
column_means = data_cols_removed.mean()
data_cols_removed_mean_imp = data_cols_removed.fillna(value=column_means)
data_cols_removed_mean_imp

In [None]:
# Median imputation: each gap is filled with the median of its own column.
column_medians = data_cols_removed.median()
data_cols_removed_median_imp = data_cols_removed.fillna(value=column_medians)
data_cols_removed_median_imp

# Feature Transformations

In [None]:
# Fresh, fully populated data for the transformation examples.
data = pd.DataFrame(dict(
    a=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 12, 13, 14, 15, 16],
    b=[8, 6, 4, 2, 10, 19, 14, 16, 18, 20, 22, 24, 26, 28, 30, 21],
))

In [None]:
# Logarithm of 'a', and a Box-Cox transformation of 'b' with lambda = 2,
# i.e. (b ** lambda - 1) / lambda.
# A ready-made Box-Cox is also available here:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html
box_cox_lambda = 2
data["log_a"] = np.log(data["a"])
data["box_cox2_b"] = (data["b"] ** box_cox_lambda - 1) / box_cox_lambda
data


In [None]:
# Bucketization/discretization into a fixed number of equally spaced bins.
n_bins = 5
data["a_bucketized"] = pd.cut(data["a"], bins=n_bins)
data

In [None]:
# Bucketization/discretization with explicitly chosen bin endpoints.
# NOTE(review): the last endpoint is 10, so any value of 'a' above 10 lies
# outside every bin and pd.cut yields NaN for it. Confirm this is intended.
bin_edges = [-100, 3.5, 9.5, 10]
data["a_bucketized2"] = pd.cut(data["a"], bins=bin_edges)
data

In [None]:
# Interaction variable: the element-wise product of 'a' and 'b'.
data["ab"] = data["a"].mul(data["b"])
data

In [None]:
# Notice that the bucketized columns (the ones produced by pd.cut) are
# categorical.
data.dtypes

# Dealing with categorical variables

In [None]:
# Small frame mixing a text column ('boro') with two numeric ones.
data = pd.DataFrame(dict(
    boro=['Brooklyn', 'Bronx', 'Bronx', 'Manhattan', 'Queens', 'Brooklyn'],
    salary=list(range(1000, 6001, 1000)),
    satisfaction=[4, 3, 3, 5, 1, 2],
))
data.dtypes

In [None]:
# Start by casting both columns to the 'category' dtype.
for col in ("boro", "satisfaction"):
    data[col] = data[col].astype("category")
data.dtypes

In [None]:
# Categories inferred from the data: only values that actually occur in the
# column are listed.
data["satisfaction"].cat.categories

In [None]:
# "Staten Island" never appears in the data, so categories inferred from the
# values would miss it. When the categories should be fixed in advance
# (before seeing the data), declare the full list and cast to that dtype.
all_boros = ['Manhattan', 'Bronx', 'Brooklyn', "Queens", "Staten Island"]
boro_type = pd.CategoricalDtype(categories=all_boros)
data["boro"] = data["boro"].astype(boro_type)
data.dtypes

In [None]:
# Ordinal encoding.
# Note that the encoding is based on the order that we determined in the
# previous cell: each value's code is its position in the category list of
# the 'boro' column.
data["boro_ordinal"] = data["boro"].cat.codes
data

In [None]:
# One-hot encoding: 'boro' and 'satisfaction' are each replaced by one
# indicator column per category; the remaining columns are passed through.
data_dummies = pd.get_dummies(data, columns=['boro', 'satisfaction'])
data_dummies

In [None]:
# Impact (stats) encoding: the mean salary of each boro.
# 'boro' is categorical, so `observed` is passed explicitly: its default for
# categorical groupers differs between pandas versions. observed=False keeps
# one row per declared category, including categories that have no rows in
# the data (their mean is NaN).
stats = data['salary'].groupby(data['boro'], observed=False).agg(['mean'])
stats

In [None]:
# Replace each boro by its mean salary.
# Mapping a categorical column may come back as a categorical, depending on
# the mapped values, so the result is cast to float to guarantee that the
# encoded feature is numeric.
mapper = stats['mean'].to_dict()
data["boro_impact"] = data["boro"].map(mapper).astype(float)

In [None]:
# Display the frame with the new 'boro_impact' column.
data

# Feature Selection (Filter Method)

In [None]:
# Synthetic data whose label y is 0, 1, 2, ..., 99:
#   x1 is the label plus uniform noise in [0, 10)
#   x2 is minus the label plus uniform noise in [0, 20)
#   x3 is purely random, unrelated to the label
n_samples = 100
label = np.arange(n_samples)
data = pd.DataFrame({
    'x1': label + 10 * np.random.rand(n_samples),
    'x2': 20 * np.random.rand(n_samples) - label,
    'x3': np.random.rand(n_samples),
    'y': label,
})
data.head()

## Let's compute different types of correlation between all features, and y

In [None]:
# Pairwise Pearson correlation between all columns (features and y).
pearson = data.corr(method='pearson')
pearson

In [None]:
# Pairwise Spearman (rank) correlation between all columns (features and y).
spearman = data.corr(method="spearman")
spearman

In [None]:
# Now we can rank the features x1,x2,x3 in decreasing abs-value-of-corr-with-y
# order.
# (You can do the same thing using Pearson)
sorted = spearman['y'].abs().sort_values(ascending=False)
sorted

In [None]:
# Select the first two.  Don't forget to skip the first in the sorted list, which
# is y itself.
chosen_features = sorted.iloc[1:3].index
chosen_features

# Handling Imbalanced Data

In [None]:
# Imbalanced toy data: y is True for roughly 10% of the rows and False for
# the remaining ~90%.
n_rows = 1000
data = pd.DataFrame({
    'x': np.random.rand(n_rows),
    'y': np.random.rand(n_rows) > 0.9,
})
data['y'].value_counts()

In [None]:
# Collect the index labels of the negative (False) and positive (True) rows.
is_positive = data['y']
false_indices = data.index[~is_positive]
true_indices = data.index[is_positive]

In [None]:
# Subsample the abundant class down to the size of the rare class, so that
# the two classes end up with the same number of rows. The sample size is
# taken from the data instead of being hard-coded, because the number of
# positive rows is random.
# replace=False means that no row is drawn more than once.
n_keep = min(len(true_indices), len(false_indices))
subsampled_false_indices = np.random.choice(false_indices, size=n_keep, replace=False)
subsampled_data = pd.concat([data.loc[subsampled_false_indices], data.loc[true_indices]])
subsampled_data['y'].value_counts()

In [None]:
# Oversample the rare class up to the size of the abundant class, so that the
# two classes end up with the same number of rows. The target size is taken
# from the data instead of being hard-coded.
# replace has to be True here: there is no way to create more positive
# samples without repetition.
n_target = len(false_indices)
oversampled_true_indices = np.random.choice(true_indices, size=n_target, replace=True)
oversampled_data = pd.concat([data.loc[false_indices], data.loc[oversampled_true_indices]])
oversampled_data['y'].value_counts()

# Splitting to Train/Val/Test

In [None]:
from sklearn.model_selection import train_test_split
# Random data with two features and a roughly balanced boolean label.
n_rows = 1000
data = pd.DataFrame({
    'x1': np.random.rand(n_rows),
    'x2': np.random.rand(n_rows),
    'y': np.random.rand(n_rows) > 0.5,
})
# Hold out 10% of the rows as the test set; the fixed seed makes the split
# repeatable.
train, test = train_test_split(data, test_size=0.1, random_state=111)

In [None]:
# Size of the full data set before splitting.
data.shape

In [None]:
# Size of the part that remains for training after the test split.
train.shape

In [None]:
# Size of the held-out test set.
test.shape

In [None]:
# Let's also get a validation set: split off 10% of the remaining training
# rows. `train` is rebound to the part that is left.
train, val = train_test_split(train, test_size=0.1, random_state=222)

In [None]:
# Size of the training set after the validation rows were split off.
train.shape

In [None]:
# Size of the validation set.
val.shape

In [None]:
# We can also do this by shuffling and then slicing.
# The cut points are derived from the number of rows (70% train, 15%
# validation, 15% test) rather than hard-coded, so the cell keeps working if
# the size of the data changes. The shuffle is seeded, like the
# train_test_split calls in this section, so the split is repeatable.
data_shuffled = data.sample(frac=1, random_state=333)  # random shuffle in pandas
n_rows = len(data_shuffled)
train_end = n_rows * 70 // 100
val_end = n_rows * 85 // 100
train = data_shuffled.iloc[:train_end]
val = data_shuffled.iloc[train_end:val_end]
test = data_shuffled.iloc[val_end:]

In [None]:
# Size of the training slice.
train.shape

In [None]:
# Size of the validation slice.
val.shape

In [None]:
# Size of the test slice.
test.shape