# Remove Outliers

####  Goals
* Deskew and Scale the entire DataFrame together
* Use Tukey's Method to define outliers
* Remove outliers from encoded dataset and from dataset.

#### Output
* removed outliers from all datasets.

In [1]:
cd ..

/home/jovyan/dsi/CAPSTONE


In [2]:
%run lib/__init__.py
%matplotlib inline

In [3]:
# the whole dataset. numeric and categorical together.
commute_df = pd.read_pickle('./data/dropped_correlated_features_df.pkl')
commute_df.shape

(1038, 102)

In [4]:
commute_stats_df = pd.read_pickle('./data/commute_stats_dropped_correlated_features_df.pkl')
commute_stats_df.shape

(55, 8)

## 1. Identify Numerical Features in `commute_df`
* Make a DataFrame only containing numerical features

In [5]:
numerical_columns = list(commute_stats_df.index)
len(numerical_columns)

55

In [6]:
numeric_df = commute_df[numerical_columns]
numeric_df.shape

(1038, 55)

In [7]:
# add an amount to everything so that boxcox will work.
numeric_df += 1E-9

## 2. Deskew and Scale Numeric Features
* On numeric features only.
* MinMaxScaler first to ensure values are all positive.

In [8]:
from lib.preprocessing import BoxCoxTransformer

In [9]:
pipeline = Pipeline([
    ('boxcox'  , BoxCoxTransformer()),
    ('ss'      , StandardScaler())
])

In [10]:
numeric_deskewed_scaled = pipeline.fit_transform(numeric_df)

  llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))
  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)
  p = (x - v) * tmp2 - (x - w) * tmp1
  tmp2 = 2.0 * (tmp2 - tmp1)


In [11]:
numeric_deskewed_scaled_df = pd.DataFrame(numeric_deskewed_scaled, 
                                          columns=numeric_df.columns,
                                          index=numeric_df.index)
numeric_deskewed_scaled_df.head()

Unnamed: 0,Response_Rate,Total_Employees,Goal_VMT,Total_VMT,Total_Goal_VMT,Goal_NDAT_Rate_(Worksite_only),Total_Goal_NDAT_Trips,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),GHGforAgg_(Pounds),Total_Weekly_Trips,...,num_employees_using_bike_subsidy,num_employees_using_other_transportation_subsidy,num_parking_spaces_reserved_for_employee_usage,num_HOV_parking_spaces,num_shared_parking_spaces,cost_of_program_in_past_year,cost_of_meeting_program_requirements,cost_of_financial_incentives_subsidies_paid_to_employees,cost_of_facility_upkeep,cost_of_other
0,-0.368481,-0.784134,0.489597,-0.36269,0.194172,0.365784,2.196472,-0.261459,-1.0,-1.0,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
1,-0.385399,-0.339094,0.489597,0.035944,0.194172,0.365784,-0.454947,-0.06661,-1.0,-1.0,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
2,-1.845091,0.470419,0.489597,0.098877,0.194172,0.365784,-0.454947,0.288229,-1.0,-1.0,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
3,-0.452584,0.903457,0.489597,0.736971,0.194172,0.365784,-0.454947,0.617888,-1.0,-1.0,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
4,0.052133,0.147377,-2.045258,-1.069809,-1.977427,-2.078697,-0.454947,-0.702807,-1.0,-1.0,...,-0.470293,-0.371696,0.853722,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307


## 3. Encode Categorical
* Delete UUID columns, do not want to predict on UUID
* Encode `commute_df`, then put deskewed/scaled numeric features into the dataset in appropriate columns.

In [12]:
commute_df.drop(['UUID'], axis=1, inplace=True)

In [13]:
commute_dummies = pd.get_dummies(commute_df)
commute_dummies.head()

Unnamed: 0,Response_Rate,Total_Employees,Goal_VMT,Total_VMT,Total_Goal_VMT,Goal_NDAT_Rate_(Worksite_only),Total_Goal_NDAT_Trips,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),GHGforAgg_(Pounds),Total_Weekly_Trips,...,worksites_transportation_program_priorities_next_six_months-3_na,worksites_transportation_program_priorities_next_six_months-3_none,worksites_transportation_program_priorities_next_six_months-3_offering an electric bike for staff to use instead of SOV,worksites_transportation_program_priorities_next_six_months-3_post CTR program on board,worksites_transportation_program_priorities_next_six_months-3_promotions,worksites_transportation_program_priorities_next_six_months-3_providing more information regarding the program,worksites_transportation_program_priorities_next_six_months-3_put together one event to promote alternative transportation options,worksites_transportation_program_priorities_next_six_months-3_shared ride commutes,worksites_transportation_program_priorities_next_six_months-3_unknown,worksites_transportation_program_priorities_next_six_months-3_walk
0,73.2,183.0,5.7,763.0,625.66,48.8,264.5,284.4,1502.02,542.0,...,0,0,0,0,0,0,0,0,0,0
1,73.0,240.0,5.7,1141.8,625.66,48.8,0.0,349.6,2250.73,854.0,...,0,0,0,0,0,0,0,0,0,0
2,52.8,436.0,5.7,1219.0,625.66,48.8,0.0,515.1,2405.8,1145.0,...,0,0,0,0,0,0,0,0,0,0
3,72.2,650.0,5.7,2438.8,625.66,48.8,0.0,749.1,4779.11,2338.0,...,0,0,0,0,0,0,0,0,0,0
4,78.0,337.0,0.0,390.0,0.0,0.0,0.0,181.0,767.52,751.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
for col in numeric_deskewed_scaled_df.columns:
    commute_dummies[col] = numeric_deskewed_scaled_df[col]

In [15]:
commute_dummies.head()

Unnamed: 0,Response_Rate,Total_Employees,Goal_VMT,Total_VMT,Total_Goal_VMT,Goal_NDAT_Rate_(Worksite_only),Total_Goal_NDAT_Trips,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),GHGforAgg_(Pounds),Total_Weekly_Trips,...,worksites_transportation_program_priorities_next_six_months-3_na,worksites_transportation_program_priorities_next_six_months-3_none,worksites_transportation_program_priorities_next_six_months-3_offering an electric bike for staff to use instead of SOV,worksites_transportation_program_priorities_next_six_months-3_post CTR program on board,worksites_transportation_program_priorities_next_six_months-3_promotions,worksites_transportation_program_priorities_next_six_months-3_providing more information regarding the program,worksites_transportation_program_priorities_next_six_months-3_put together one event to promote alternative transportation options,worksites_transportation_program_priorities_next_six_months-3_shared ride commutes,worksites_transportation_program_priorities_next_six_months-3_unknown,worksites_transportation_program_priorities_next_six_months-3_walk
0,-0.368481,-0.784134,0.489597,-0.36269,0.194172,0.365784,2.196472,-0.261459,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
1,-0.385399,-0.339094,0.489597,0.035944,0.194172,0.365784,-0.454947,-0.06661,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
2,-1.845091,0.470419,0.489597,0.098877,0.194172,0.365784,-0.454947,0.288229,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.452584,0.903457,0.489597,0.736971,0.194172,0.365784,-0.454947,0.617888,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0
4,0.052133,0.147377,-2.045258,-1.069809,-1.977427,-2.078697,-0.454947,-0.702807,-1.0,-1.0,...,0,0,0,0,0,0,0,0,0,0


## 4. Outliers
* Identify outliers in the dataset using Tukey's Method.
* Count outliers in each feature.
* Remove outliers that appear in 4 or more features (this number was chosen as the result represents about 2% of the data).

In [16]:
numeric_deskewed_scaled_df.shape

(1038, 55)

In [17]:
def display_outliers(dataframe, col, tukey_value):
    Q1 = np.percentile(dataframe[col], 25)
    Q3 = np.percentile(dataframe[col], 75)
    tukey_window = tukey_value*(Q3-Q1)
    less_than_Q1 = dataframe[col] < (Q1 - tukey_window)
    greater_than_Q3 = dataframe[col] > (Q3 + tukey_window)
    tukey_mask = (less_than_Q1 | greater_than_Q3)
    return dataframe[tukey_mask]

In [18]:
outliers = []
for col in numeric_deskewed_scaled_df:
    outlier_df = display_outliers(numeric_deskewed_scaled_df, col, 2.25)
    print(col, outlier_df.shape)
    outliers.append((col, outlier_df))

Response_Rate (4, 55)
Total_Employees (0, 55)
Goal_VMT (189, 55)
Total_VMT (0, 55)
Total_Goal_VMT (193, 55)
Goal_NDAT_Rate_(Worksite_only) (193, 55)
Total_Goal_NDAT_Trips (178, 55)
Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e) (0, 55)
GHGforAgg_(Pounds) (0, 55)
Total_Weekly_Trips (0, 55)
Weekly_Carpool_Trips (2, 55)
Weekly_Vanpool_Trips (0, 55)
Weekly_1-Motorcycle_Trips (0, 55)
Weekly_2-Motorcycle_Trips (250, 55)
Weekly_Bus_Trips (11, 55)
Weekly_Train/Lightrail/Street_Car_Trips (78, 55)
Weekly_Bike_Trips (84, 55)
Weekly_Walk_Trips (59, 55)
Weekly_Telework_Trips (4, 55)
Weekly_CWW_Days (0, 55)
Weekly_Overnight_Business_Trip (0, 55)
Weekly_Did_Not_Work (30, 55)
Weekly_Used_Ferry_as_Walk-on_Passenager (0, 55)
Weekly_Boarded_Ferry_with_Car/Bus/Van (0, 55)
Weekly_Other_Trips (7, 55)
Alone_Share (0, 55)
Carpool_Share (0, 55)
Van_Share (0, 55)
Motorcycle_Share (0, 55)
Bus_Share (0, 55)
Train_Share (0, 55)
Bike_Share (1, 55)
Walk_Share (0, 55)
Tele_Share (0, 55)
CWW

In [19]:
from collections import Counter
outlier_indices = [list(out[1].index) for out in outliers]
outlier_indices = [outlier_index for sublist in outlier_indices for outlier_index in sublist]

cn = Counter(outlier_indices)

outlier_counts = pd.DataFrame([{'id': key, 'count': val} for key, val in cn.items()])
outlier_counts.set_index('id', inplace=True)
outlier_counts.sort_values('count', ascending=False, inplace=True)
outlier_counts[outlier_counts['count'] > 5].count()

count    27
dtype: int64

In [20]:
(27/1042)*100

2.5911708253358925

In [21]:
outliers = outlier_counts[outlier_counts['count'] > 5]
outliers

Unnamed: 0_level_0,count
id,Unnamed: 1_level_1
657,11
658,10
660,9
659,9
594,7
596,7
29,7
350,7
593,7
468,7


In [22]:
outlier_indices = list(outliers.index)

In [23]:
removed_outliers_full_df = commute_dummies.drop(outlier_indices)
removed_outliers_full_df.shape

(1011, 1174)

In [24]:
numeric_removed_outliers_df = numeric_deskewed_scaled_df.drop(outlier_indices)
numeric_removed_outliers_df.shape

(1011, 55)

In [25]:
removed_outliers_raw_df = commute_df.drop(outlier_indices)
removed_outliers_raw_df.shape

(1011, 101)

## 5. Pickling and Saving

In [26]:
# full dataframe, deskewed and scaled, with removed outliers.
removed_outliers_full_df.to_pickle('./data/removed_outliers_full_df.pkl')

# numeric dataframe, deskewed and scaled, with removed outliers
numeric_removed_outliers_df.to_pickle('./data/numeric_removed_outliers_df.pkl')

# raw dataframe with removed outliers to deskew and scale again.
removed_outliers_raw_df.to_pickle('./data/removed_outliers_raw_df.pkl')