## enron_outliers.py

In [None]:
%matplotlib inline

import math
import pickle
import sys

import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit


### read in data dictionary, convert to numpy array
# NOTE(security): pickle can execute arbitrary code while loading, so this
# file must come from a trusted source.
# The context manager closes the file handle as soon as the data is loaded.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)

### plot every data point; each row is read as (salary, bonus), i.e. in the
### same order as the `features` list passed to featureFormat above
for salary, bonus in data:
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

## outlier_cleaner.py

In [None]:
def outlier_cleaner(predictions, ages, net_worths):
    """
        Clean away the 10% of points that have the largest
        residual errors (difference between the prediction
        and the actual net worth).

        All three arguments are expected to be numpy arrays holding the
        same number of points (they are combined element-wise and
        flattened with ``.flatten()``).

        Return a list of tuples named cleaned_data where 
        each tuple is of the form (age, net_worth, error).
        The list is sorted by error, smallest first, and holds
        floor(0.9 * n) points, where n is the number of input points.
    """
    residual_errors = abs(net_worths - predictions)

    # Integer arithmetic yields an exact floor(0.9 * n) without any
    # floating point rounding.
    num_to_keep = (residual_errors.size * 9) // 10

    # sorted() is stable, so points with equal error keep their input order.
    cleaned_data = sorted(
        zip(ages.flatten(), net_worths.flatten(), residual_errors.flatten()),
        key=lambda point: point[2],
    )

    # Slicing keeps exactly num_to_keep points; comparing an index with
    # "<=" against the count would keep one point too many.
    return cleaned_data[:num_to_keep]

In [None]:
import numpy as np

def outlier_cleaner_iqr(predictions, ages, net_worths):
    """
        Filter points with the classic 1.5 * IQR rule instead of a fixed
        percentage.

        The absolute residual |net_worth - prediction| is computed for every
        point; a point is kept only when its residual lies inside
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both ends inclusive), where Q1/Q3
        are the 25th/75th percentiles of the residuals.

        Return a list of tuples named cleaned_data where 
        each tuple is of the form (age, net_worth, error).
        Points keep their input order.
    """
    abs_residuals = abs(net_worths - predictions)

    q1 = np.percentile(abs_residuals, 25)
    q3 = np.percentile(abs_residuals, 75)

    # Distance from each quartile to its fence: 1.5 times the IQR.
    whisker = (q3 - q1) * 1.5
    low_fence = q1 - whisker
    high_fence = q3 + whisker

    cleaned_data = []
    points = zip(ages.flatten(), net_worths.flatten(), abs_residuals.flatten())
    for point in points:
        # point[2] is the absolute residual of this point
        if low_fence <= point[2] <= high_fence:
            cleaned_data.append(point)
    return cleaned_data

## outlier_removal_regression.py

In [None]:
%matplotlib inline

import random

import numpy
import matplotlib.pyplot as plt
import pickle
from sklearn import linear_model

# CAL: Don't need this since I'm putting everything in one notebook
# from outlier_cleaner import outlierCleaner

### load up some practice data with outliers in it
# NOTE(security): pickle can execute arbitrary code while loading, so these
# files must come from a trusted source.
# The context managers close each file handle as soon as its data is loaded.
with open("practice_outliers_ages.pkl", "rb") as ages_file:
    ages = pickle.load(ages_file)
with open("practice_outliers_net_worths.pkl", "rb") as net_worths_file:
    net_worths = pickle.load(net_worths_file)

### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

# CHANGED: train_test_split is imported from sklearn.model_selection
# (it used to live in sklearn.cross_validation).
from sklearn.model_selection import train_test_split

### hold out 10% of the points as a test set; random_state makes the split repeatable
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages,
                                                                            net_worths,
                                                                            test_size=0.1,
                                                                            random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
reg = linear_model.LinearRegression().fit(ages_train, net_worths_train)
try:
    ### regression line over the full data set
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    print("NameError")
plt.scatter(ages, net_worths)
plt.show()

## Uses the method that's covered in Udacity

In [None]:
%matplotlib inline

### find the outliers using the current fit, then drop them
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlier_cleaner( predictions, ages_train, net_worths_train )
except NameError:
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### nothing to refit when the cleaner handed back no points
if not cleaned_data:
    print("outlier_cleaner() is returning an empty list, no refitting to be done")
else:
    ### split the (age, net_worth, error) tuples into three parallel sequences
    ages, net_worths, errors = zip(*cleaned_data)
    ### back to 2D column arrays: one row per point, one feature column
    ages       = numpy.array(ages).reshape(-1, 1)
    net_worths = numpy.array(net_worths).reshape(-1, 1)

    ### refit on the cleaned points and draw the new regression line
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
        print("you don't seem to have regression imported/created,")
        print("   or else your regression object isn't named reg")
        print("   either way, only draw the scatter plot of the cleaned data")

    plt.scatter(ages, net_worths)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    plt.show()

## Uses IQR filtering method

In [None]:
%matplotlib inline

### identify and remove the most outlier-y points (1.5 * IQR rule)
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlier_cleaner_iqr( predictions, ages_train, net_worths_train )
except NameError:
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")
    
### only run this code if cleaned_data is returning data
if cleaned_data:
    ### split the (age, net_worth, error) tuples into three parallel sequences
    ages, net_worths, errors = zip(*cleaned_data)
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

    ### refit your cleaned data!
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
        print("you don't seem to have regression imported/created,")
        print("   or else your regression object isn't named reg")
        print("   either way, only draw the scatter plot of the cleaned data")
    plt.scatter(ages, net_worths)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    plt.show()
else:
    ### name the function this cell actually calls, so the message is not misleading
    print("outlier_cleaner_iqr() is returning an empty list, no refitting to be done")