In [1]:
# gathering data from a pickled file (dictionary of dictionaries)
import joblib
import numpy as np

# Load the dataset through a context manager so the file handle is closed
# as soon as loading finishes.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load dataset files from a trusted source.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = joblib.load(dataset_file)

In [15]:
# each top-level key of enron_data is one data point (person), so the
# dictionary length is the dataset size
n_people = len(enron_data)
print("Enron dataset has %d data points (people)" % n_people)

Enron dataset has 146 data points (people)


In [29]:
# inspect a single record to see how many features a person carries
# (assumes every person has the same feature keys -- TODO confirm)
sample_record = enron_data['METTS MARK']
print("Each person has %d features" % len(sample_record))

Each person has 21 features


In [41]:
# collect the names of everyone whose 'poi' flag is truthy, then report
# how many persons of interest (POI) that is
pois = [
    name
    for name, record in enron_data.items()
    if record['poi']
]
print("There are %d POIs" % len(pois))

There are 18 POIs


In [54]:
# count the lines of the POI names text file
poi_txt = "../final_project/poi_names.txt"
# a context manager guarantees the file handle is closed after counting
with open(poi_txt, 'r') as poi_names:
    n_names = sum(1 for _ in poi_names)
# the "- 2" presumably discounts two non-name header lines at the top of
# the file -- TODO confirm against the file contents
print("There are %d names in poi txt file" % (n_names - 2))

There are 35 names in poi txt file


In [59]:
# query the 'total_stock_value' recorded for James Prentice
# (bare last expression, so the notebook displays the value)
enron_data['PRENTICE JAMES']['total_stock_value']

1095040

In [62]:
# query the number of messages sent from Wesley Colwell to POIs
# (stored under the 'from_this_person_to_poi' feature)
enron_data['COLWELL WESLEY']['from_this_person_to_poi']

11

In [70]:
# query the value of the stock options exercised by Jeffrey K Skilling
# (the 'exercised_stock_options' feature, not all stock holdings)
enron_data['SKILLING JEFFREY K']['exercised_stock_options']

19250000

In [72]:
# query the 'total_payments' recorded for Andrew S Fastow
enron_data['FASTOW ANDREW S']['total_payments']

2424083

In [74]:
# query the 'total_payments' recorded for Kenneth L Lay
enron_data['LAY KENNETH L']['total_payments']

103559793

In [75]:
# query the 'total_payments' recorded for Jeffrey K Skilling
enron_data['SKILLING JEFFREY K']['total_payments']

8682716

In [78]:
# tally the people whose 'salary' field holds something other than the
# 'NaN' placeholder string
count_has_salary = sum(
    1
    for record in enron_data.values()
    if record['salary'] != 'NaN'
)
print(count_has_salary)

95


In [80]:
# tally the people whose 'email_address' field holds something other than
# the 'NaN' placeholder string
records_with_email = [
    record
    for record in enron_data.values()
    if record['email_address'] != 'NaN'
]
count_has_email = len(records_with_email)
print(count_has_email)

111


In [82]:
# preparing data to be fed to scikit-learn
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False ):
    """ convert dictionary to numpy array of features

        dictionary: mapping of data point key -> {feature name: value}
        features: sequence of feature names to extract; each becomes one
            column, in the order given
        remove_NaN=True will convert "NaN" string to 0.0
            (with remove_NaN=False, "NaN" becomes float nan, which never
            compares equal to 0 and is therefore treated as non-zero by
            the two filters below)
        remove_all_zeroes=True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes=True will omit any data points for which
            any of the features you seek are 0.0

        returns a numpy array with one row per kept data point; if a
        requested feature is missing from a data point, prints an error
        and returns None
    """

    return_list = []

    for key in dictionary.keys():
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        ### keep the data point unless an enabled filter rejects it;
        ### starting from True means that disabling remove_all_zeroes
        ### no longer drops every data point
        append = True

        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        ### (tmp_list only holds floats, so comparing against 0 suffices)
        if remove_all_zeroes and not any(item != 0 for item in tmp_list):
            append = False

        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and 0 in tmp_list:
            append = False

        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)


def targetFeatureSplit( data ):
    """
        split each row of a featureFormat-style array into its first
        entry (the quantity to predict) and the remaining entries
        (the features used to predict it)

        returns a pair of lists: (targets, features), with one element
        per row of data, in the same order

        (lists are returned rather than arrays; sklearn can generally
        handle both input formats when training/predicting)
    """

    # single pass over data: peel the leading value off every row
    split_rows = [(row[0], row[1:]) for row in data]

    target = [first for first, _ in split_rows]
    features = [rest for _, rest in split_rows]

    return target, features

In [99]:
# count the people whose 'total_payments' is the 'NaN' placeholder, then
# report that count and its share of the whole dataset
total_no_payment = sum(
    1
    for record in enron_data.values()
    if record['total_payments'] == 'NaN'
)
missing_share = float(total_no_payment) / len(enron_data)
print("{0} no payments info".format(total_no_payment))
# NOTE(review): the value printed here is a fraction of all entries, not a
# count, although the message text says "entries"
print("{0} entries have no total payments info".format(missing_share))

21 no payments info
0.14383561643835616 entries have no total payments info


In [98]:
# count the POIs whose 'total_payments' is the 'NaN' placeholder
total_no_payment = sum(
    1
    for record in enron_data.values()
    if record['total_payments'] == 'NaN' and record['poi'] == True
)
# NOTE(review): the printed value is a fraction whose denominator is the
# whole dataset (not the number of POIs), and the message text does not
# mention POIs -- confirm this is the intended statistic
missing_share = float(total_no_payment) / len(enron_data)
print("{0} entries have no total payments info".format(missing_share))

0.0 entries have no total payments info
