# Remove Outliers

####  Goals
* Deskew and Scale the entire DataFrame together
* Use Tukey's Method to define outliers
* Remove outliers from the encoded dataset, the numeric-only dataset, and the raw dataset.

#### Output
* Three pickled datasets (encoded, numeric-only, and raw) with the outlier rows removed.

In [1]:
cd ..

/home/jovyan/Capstone


In [2]:
%run lib/__init__.py
%matplotlib inline

In [3]:
# Load the whole dataset: numeric and categorical features together.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load files produced by this project's own earlier notebooks.
commute_df = pd.read_pickle('./data/dropped_correlated_features_df.pkl')
commute_df.shape

(1038, 87)

In [4]:
# Per-feature statistics table; its index is used in the next section as the
# list of numeric column names in commute_df.
commute_stats_df = pd.read_pickle('./data/commute_stats_dropped_correlated_features_df.pkl')
commute_stats_df.shape

(40, 8)

## 1. Identify Numerical Features in `commute_df`
* Make a DataFrame only containing numerical features

In [5]:
# The stats table is indexed by feature name, so its index doubles as the
# list of numeric columns to pull out of commute_df.
numerical_columns = commute_stats_df.index.tolist()
len(numerical_columns)

40

In [6]:
# Take an explicit copy of the numeric columns: numeric_df is modified in place
# by the following cell, and an explicit copy makes it unambiguous that
# commute_df itself is never touched by that modification.
numeric_df = commute_df[numerical_columns].copy()
numeric_df.shape

(1038, 40)

In [7]:
# Add a tiny amount to every numeric value so that boxcox will work.
# The shifted frame is rebuilt from commute_df (rather than `numeric_df += ...`)
# so that re-running this cell does not keep stacking the offset.
numeric_df = commute_df[numerical_columns] + 1E-9

## 2. Deskew and Scale Numeric Features
* On numeric features only.
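* A small constant (1e-9) is added to every value beforehand so that Box-Cox will work; the pipeline itself is Box-Cox (deskew) followed by StandardScaler (scale), with no MinMaxScaler step.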

In [8]:
from lib.preprocessing import BoxCoxTransformer

In [9]:
# Two-stage preprocessing, applied in order: the project's BoxCoxTransformer
# (the deskew step), then StandardScaler (the scale step).
preprocessing_steps = [
    ('boxcox', BoxCoxTransformer()),
    ('ss', StandardScaler()),
]
pipeline = Pipeline(preprocessing_steps)

In [10]:
# Fit the Box-Cox + StandardScaler pipeline on the numeric features and
# transform them in the same call; the result is wrapped back into a
# DataFrame in the next cell.
numeric_deskewed_scaled = pipeline.fit_transform(numeric_df)

In [11]:
# Re-attach the original row labels and column names to the transformed
# values so they can be aligned with the other DataFrames later on.
numeric_deskewed_scaled_df = pd.DataFrame(
    data=numeric_deskewed_scaled,
    index=numeric_df.index,
    columns=numeric_df.columns,
)
numeric_deskewed_scaled_df.head()

Unnamed: 0,Response_Rate,Total_Employees,VMT/ Employee,Goal_VMT,Total_VMT,Total_Goal_VMT,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),Daily_Roundtrip_GHG_Per_Employee_(Pounds),Weekly_CWW_Days,Weekly_Overnight_Business_Trip,...,num_employees_using_bike_subsidy,num_employees_using_other_transportation_subsidy,num_parking_spaces_reserved_for_employee_usage,num_HOV_parking_spaces,num_shared_parking_spaces,cost_of_program_in_past_year,cost_of_meeting_program_requirements,cost_of_financial_incentives_subsidies_paid_to_employees,cost_of_facility_upkeep,cost_of_other
0,-0.368481,-0.784134,0.367011,0.489597,-0.36269,0.194172,-0.261459,0.388165,0.660962,-1.427542,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
1,-0.385399,-0.339094,0.269204,0.489597,0.035944,0.194172,-0.06661,0.288905,-1.251008,0.416486,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
2,-1.845091,0.470419,-0.087518,0.489597,0.098877,0.194172,0.288229,-0.058381,-1.251008,0.55759,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
3,-0.452584,0.903457,-0.117914,0.489597,0.736971,0.194172,0.617888,-0.104605,0.556009,0.819191,...,2.12808,2.691003,-1.170573,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307
4,0.052133,0.147377,-1.219819,-2.045258,-1.069809,-1.977427,-0.702807,-1.204844,-1.251008,0.868288,...,-0.470293,-0.371696,0.853722,-0.681112,-0.643447,-1.443943,-1.21186,-1.105346,-0.631492,-0.425307


## 3. Encode Categorical
* Delete the UUID column; we do not want to predict on UUID.
* Encode `commute_df`, then put deskewed/scaled numeric features into the dataset in appropriate columns.

In [12]:
# Drop the UUID identifier so it is not one-hot encoded or used as a feature.
# Rebinding with errors='ignore' (instead of an inplace drop) keeps this cell
# safe to re-run: a second inplace drop would raise KeyError because the
# column is already gone.
commute_df = commute_df.drop(['UUID'], axis=1, errors='ignore')

In [13]:
# One-hot encode the categorical columns. The numeric columns still hold their
# raw (unscaled) values at this point; they are overwritten in the next cell.
commute_dummies = pd.get_dummies(commute_df)
commute_dummies.head()

Unnamed: 0,Response_Rate,Total_Employees,VMT/ Employee,Goal_VMT,Total_VMT,Total_Goal_VMT,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),Daily_Roundtrip_GHG_Per_Employee_(Pounds),Weekly_CWW_Days,Weekly_Overnight_Business_Trip,...,worksites_transportation_program_priorities_next_six_months-3_na,worksites_transportation_program_priorities_next_six_months-3_none,worksites_transportation_program_priorities_next_six_months-3_offering an electric bike for staff to use instead of SOV,worksites_transportation_program_priorities_next_six_months-3_post CTR program on board,worksites_transportation_program_priorities_next_six_months-3_promotions,worksites_transportation_program_priorities_next_six_months-3_providing more information regarding the program,worksites_transportation_program_priorities_next_six_months-3_put together one event to promote alternative transportation options,worksites_transportation_program_priorities_next_six_months-3_shared ride commutes,worksites_transportation_program_priorities_next_six_months-3_unknown,worksites_transportation_program_priorities_next_six_months-3_walk
0,73.2,183.0,7.0,5.7,763.0,625.66,284.4,13.8,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,73.0,240.0,6.6,5.7,1141.8,625.66,349.6,13.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,52.8,436.0,5.3,5.7,1219.0,625.66,515.1,10.5,0.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,72.2,650.0,5.2,5.7,2438.8,625.66,749.1,10.2,1.0,13.0,...,0,0,0,0,0,0,0,0,0,0
4,78.0,337.0,2.5,0.0,390.0,0.0,181.0,4.9,0.0,16.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Overwrite each raw numeric column in the encoded frame with its
# deskewed/scaled counterpart (assignment aligns on the row index).
for feature_name, scaled_values in numeric_deskewed_scaled_df.items():
    commute_dummies[feature_name] = scaled_values

In [15]:
# Check that the numeric columns now show the deskewed/scaled values.
commute_dummies.head()

Unnamed: 0,Response_Rate,Total_Employees,VMT/ Employee,Goal_VMT,Total_VMT,Total_Goal_VMT,Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e),Daily_Roundtrip_GHG_Per_Employee_(Pounds),Weekly_CWW_Days,Weekly_Overnight_Business_Trip,...,worksites_transportation_program_priorities_next_six_months-3_na,worksites_transportation_program_priorities_next_six_months-3_none,worksites_transportation_program_priorities_next_six_months-3_offering an electric bike for staff to use instead of SOV,worksites_transportation_program_priorities_next_six_months-3_post CTR program on board,worksites_transportation_program_priorities_next_six_months-3_promotions,worksites_transportation_program_priorities_next_six_months-3_providing more information regarding the program,worksites_transportation_program_priorities_next_six_months-3_put together one event to promote alternative transportation options,worksites_transportation_program_priorities_next_six_months-3_shared ride commutes,worksites_transportation_program_priorities_next_six_months-3_unknown,worksites_transportation_program_priorities_next_six_months-3_walk
0,-0.368481,-0.784134,0.367011,0.489597,-0.36269,0.194172,-0.261459,0.388165,0.660962,-1.427542,...,0,0,0,0,0,0,0,0,0,0
1,-0.385399,-0.339094,0.269204,0.489597,0.035944,0.194172,-0.06661,0.288905,-1.251008,0.416486,...,0,0,0,0,0,0,0,0,0,0
2,-1.845091,0.470419,-0.087518,0.489597,0.098877,0.194172,0.288229,-0.058381,-1.251008,0.55759,...,0,0,0,0,0,0,0,0,0,0
3,-0.452584,0.903457,-0.117914,0.489597,0.736971,0.194172,0.617888,-0.104605,0.556009,0.819191,...,0,0,0,0,0,0,0,0,0,0
4,0.052133,0.147377,-1.219819,-2.045258,-1.069809,-1.977427,-0.702807,-1.204844,-1.251008,0.868288,...,0,0,0,0,0,0,0,0,0,0


## 4. Outliers
* Identify outliers in the dataset using Tukey's Method.
* Count outliers in each feature.
* Remove rows that are outliers in more than 4 features, i.e. 5 or more (this threshold was chosen because the resulting 31 rows represent about 3% of the data).

In [16]:
# Sanity check: (rows, numeric features) going into outlier detection.
numeric_deskewed_scaled_df.shape

(1038, 40)

In [17]:
def display_outliers(dataframe, col, tukey_value):
    """Return the rows of ``dataframe`` whose ``col`` value is a Tukey outlier.

    A value is an outlier when it lies strictly below ``Q1 - k * IQR`` or
    strictly above ``Q3 + k * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of the column, ``IQR = Q3 - Q1`` and ``k = tukey_value``.

    Despite the name, nothing is displayed; the matching rows are returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to inspect. It is not modified.
    col : column label
        Column whose values are tested.
    tukey_value : float
        Multiplier ``k`` applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` (all columns, original index labels) that
        are outliers in ``col``. Rows with a missing value in ``col`` are
        never reported as outliers.
    """
    # Quartiles are computed on the non-missing values only: a single NaN would
    # otherwise turn both percentiles into NaN and silently flag nothing.
    observed = dataframe[col].dropna()
    if observed.empty:
        # No usable values, so no fences can be computed and nothing is flagged.
        return dataframe.iloc[0:0]

    q1, q3 = np.percentile(observed, [25, 75])
    fence = tukey_value * (q3 - q1)

    # Comparisons against NaN are False, so missing values fall out of the mask.
    below_lower_fence = dataframe[col] < (q1 - fence)
    above_upper_fence = dataframe[col] > (q3 + fence)
    return dataframe[below_lower_fence | above_upper_fence]

In [20]:
# For every numeric feature, collect the rows that fall outside the Tukey
# fences (multiplier 1.70) and report how many rows were flagged.
outliers = []
for feature in numeric_deskewed_scaled_df.columns:
    flagged_rows = display_outliers(numeric_deskewed_scaled_df, feature, 1.70)
    print(feature, flagged_rows.shape)
    outliers.append((feature, flagged_rows))

Response_Rate (13, 40)
Total_Employees (1, 40)
VMT/ Employee (0, 40)
Goal_VMT (189, 40)
Total_VMT (4, 40)
Total_Goal_VMT (198, 40)
Total_Annual_Greenhouse_Gas_Emissions_-__All_Employees_(Metric_Tons_CO2e) (5, 40)
Daily_Roundtrip_GHG_Per_Employee_(Pounds) (0, 40)
Weekly_CWW_Days (0, 40)
Weekly_Overnight_Business_Trip (0, 40)
Weekly_Did_Not_Work (38, 40)
Alone_Share (0, 40)
Carpool_Share (1, 40)
Van_Share (0, 40)
Motorcycle_Share (0, 40)
Bus_Share (0, 40)
Train_Share (1, 40)
Bike_Share (77, 40)
Walk_Share (52, 40)
Tele_Share (138, 40)
CWW_Share (0, 40)
Used_Ferry_Share (0, 40)
Boarded_Ferry_Share (0, 40)
Other_Share (97, 40)
num_employees_at_worksite (29, 40)
num_employees_arrive_6-9_am (34, 40)
num_employees_using_ORCA_business (0, 40)
num_employees_using_other_mass_transit_subsidy (0, 40)
num_employees_using_vanpool_carshare_subsidy (0, 40)
num_employees_using_drive_alone_parking_subsidy (0, 40)
num_employees_using_bike_subsidy (188, 40)
num_employees_using_other_transportation_subsidy

In [21]:
from collections import Counter

# Flatten the per-feature outlier frames into one list of row labels; a row
# appears once for every feature in which it was flagged.
outlier_indices = [row_id for _, flagged in outliers for row_id in flagged.index]

# Number of features in which each row was flagged.
cn = Counter(outlier_indices)

outlier_counts = (
    pd.DataFrame([{'id': row_id, 'count': hits} for row_id, hits in cn.items()])
    .set_index('id')
    .sort_values('count', ascending=False)
)

# How many rows are outliers in more than 4 features?
outlier_counts[outlier_counts['count'] > 4].count()

count    31
dtype: int64

In [22]:
# Percentage of rows flagged for removal. Computed from the data rather than
# from hard-coded counts so it stays correct if the inputs or threshold change.
n_flagged = len(outlier_counts[outlier_counts['count'] > 4])
n_flagged / len(numeric_deskewed_scaled_df) * 100

2.9865125240847785

In [23]:
# Keep only the rows flagged as outliers in more than 4 features; these are
# the rows that get removed below.
# NOTE(review): this rebinds `outliers`, which until now was the list of
# (column, DataFrame) tuples built by the per-feature loop. Re-running the
# counting cell after this one will not work; consider a distinct name.
outliers = outlier_counts[outlier_counts['count'] > 4]
outliers

Unnamed: 0_level_0,count
id,Unnamed: 1_level_1
660,8
463,6
734,6
657,6
658,6
639,5
178,5
94,5
736,5
107,5


In [24]:
# Row labels of the records to remove from every dataset.
outlier_indices = [row_label for row_label in outliers.index]

In [25]:
# Drop the flagged rows (by index label) from the encoded dataset whose
# numeric columns hold the deskewed/scaled values.
removed_outliers_full_df = commute_dummies.drop(outlier_indices)
removed_outliers_full_df.shape

(1007, 1159)

In [26]:
# Drop the same rows from the numeric-only deskewed/scaled dataset.
numeric_removed_outliers_df = numeric_deskewed_scaled_df.drop(outlier_indices)
numeric_removed_outliers_df.shape

(1007, 40)

In [27]:
# Drop the same rows from the unscaled, un-encoded data (commute_df with the
# UUID column already removed) so it can be deskewed and scaled again later.
removed_outliers_raw_df = commute_df.drop(outlier_indices)
removed_outliers_raw_df.shape

(1007, 86)

## 5. Pickling and Saving

In [28]:
# full dataframe, deskewed and scaled, with removed outliers.
removed_outliers_full_df.to_pickle('./data/removed_outliers_full_df.pkl')

# numeric dataframe, deskewed and scaled, with removed outliers
numeric_removed_outliers_df.to_pickle('./data/numeric_removed_outliers_df.pkl')

# raw dataframe with removed outliers to deskew and scale again.
removed_outliers_raw_df.to_pickle('./data/removed_outliers_raw_df.pkl')