# A New Enron Feature Quiz

In [5]:
import os
import zipfile
import reader
import poi_emails

In [6]:
def getToFromStrings(f):
    """
    Extract the To, From and CC e-mail address lists from an open e-mail file.

    The file is rewound to its start, reader.getAddresses() is asked for the
    three raw header strings, and each string is then handed to
    reader.parseAddresses(). Per the course notes, getAddresses() reads the
    opening lines of the e-mail and parseAddresses() turns one header string
    into a list of addresses.

    Returns a (to_emails, from_emails, cc_emails) tuple.
    """
    # Always start from the top of the file, whatever was read before.
    f.seek(0)

    header_strings = reader.getAddresses(f)
    to_string, from_string, cc_string = header_strings

    return (
        reader.parseAddresses(to_string),
        reader.parseAddresses(from_string),
        reader.parseAddresses(cc_string),
    )


def poiFlagEmail(f):
    """ given an email file f,
        return a trio of booleans (to_poi, from_poi, cc_poi) saying whether
        that email is addressed to, sent from, or cc'ing a poi """

    to_emails, from_emails, cc_emails = getToFromStrings(f)

    ### poi_emails.poiEmails() supplies the POI addresses to match against
    ### (per the course notes: a list of all POIs' email addresses).
    poi_email_list = poi_emails.poiEmails()

    def _mentions_poi(addresses):
        ### An empty or missing address list can never involve a POI;
        ### otherwise a single matching address is enough, and any()
        ### stops scanning at the first hit.
        return bool(addresses) and any(
            address in poi_email_list for address in addresses
        )

    ### The same membership test is applied to each header in turn.
    to_poi = _mentions_poi(to_emails)
    cc_poi = _mentions_poi(cc_emails)
    from_poi = _mentions_poi(from_emails)

    return to_poi, from_poi, cc_poi

# Visualizing Your New Feature

In [3]:
import pickle
from get_data import getData

def computeFraction( poi_messages, all_messages ):
    """ given a number of messages to/from POI (numerator)
        and number of all messages to/from a person (denominator),
        return the fraction of messages to/from that person
        that are from/to a POI

        returns 0.0 when either count is the string 'NaN' (the marker for
        a missing value) or when the denominator is zero, so the result
        is always a float and no ZeroDivisionError can escape
    """
    if all_messages == 'NaN' or poi_messages == 'NaN':
        return 0.0

    total = float(all_messages)
    if total == 0.0:
        ### no messages at all: define the fraction as 0 instead of dividing
        return 0.0

    return float(poi_messages) / total

data_dict = getData()
submit_dict = {}

### For every person: print a blank separator line, then the two POI
### fractions, store both fractions back on the person's record and
### collect them in submit_dict.
for name in data_dict:

    data_point = data_dict[name]

    ### blank line between people
    print("")

    ### share of this person's received mail that came from a POI
    fraction_from_poi = computeFraction( data_point["from_poi_to_this_person"],
                                         data_point["to_messages"] )
    print(fraction_from_poi)
    data_point["fraction_from_poi"] = fraction_from_poi

    ### share of this person's sent mail that went to a POI
    fraction_to_poi = computeFraction( data_point["from_this_person_to_poi"],
                                       data_point["from_messages"] )
    print(fraction_to_poi)

    ### submit_dict stores the fractions under the raw-count key names
    submit_dict[name] = {
        "from_poi_to_this_person": fraction_from_poi,
        "from_this_person_to_poi": fraction_to_poi,
    }

    data_point["fraction_to_poi"] = fraction_to_poi
    
    
#####################

def submitDict():
    """ return the module-level submit_dict, which the loop above fills
        with one entry of POI fractions per person """
    return submit_dict



0.0470879801735
0.0344827586207

0.0
0.0

0.0
0.0

0.0130890052356
0.0

0.0306220095694
0.65625

0.0
0.0

0.0
0.0

0.0246284501062
0.541666666667

0.0187234042553
0.0139794967381

0.0492730210016
0.216216216216

0.078125
1.0

0.108108108108
0.0

0.010101010101
0.142857142857

0.013978088402
0.342105263158

0.0
0.0

0.0
0.0

0.136518771331
0.275

0.0882352941176
0.0

0.0968992248062
0.339285714286

0.0
0.0

0.0
0.0

0.0253353204173
0.0

0.0302227573751
0.112268518519

0.0
0.0

0.0
0.0

0.0291834833903
0.0309585975382

0.0
0.0

0.0104438642298
0.0

0.0
0.0

0.014312383323
0.2

0.0263554216867
0.585365853659

0.0
0.0

0.0196855775803
0.0888786553074

0.00698080279232
0.368421052632

0.0
0.0

0.0
0.0

0.030303030303
0.037037037037

0.0
0.0

0.0105042016807
0.0

0.031746031746
0.0

0.0174459176553
0.0093023255814

0.0
0.0

0.0237420269313
0.222222222222

0.072737291638
0.158994197292

0.0
0.0

0.0
0.0

0.0
0.0

0.0
0.0

0.0
0.0

0.0689045936396
0.0

0.0
0.0

0.00488481087861
0.053497942386

# Number of Features and Overfitting

In [7]:
import pickle
import numpy

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
numpy.random.seed(42)

words_file = "your_word_data.pkl"
authors_file = "your_email_authors.pkl"

### NOTE: pickle.load can execute arbitrary code, so only load pickle files
### that you created yourself or otherwise trust.
### The files are opened in the same mode ("r") as before; the with-blocks
### only make sure both handles are closed once the data has been read.
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

### the vectorizer is fitted on the training split only and then reused,
### unchanged, to transform the test split
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]

### your code goes here


In [27]:
### sanity check: number of rows left in the truncated training matrix
print(len(features_train))

150


# Accuracy of Your Overfit Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [29]:
### fit a decision tree with default settings on the training features
clf = DecisionTreeClassifier().fit(features_train, labels_train)

### report test accuracy two ways as a cross-check: from explicit
### predictions via accuracy_score, and via the classifier's own score()
pred = clf.predict(features_test)
acc = accuracy_score(labels_test, pred)
print(acc)
print(clf.score(features_test, labels_test))

1.0
1.0


_Note: The grader expected an accuracy of `0.948236632537`. This notebook produces `1.0` instead, and I could not reproduce the expected value._

# Identify the Most Powerful Features

In [33]:
### feature_importances_ has one entry per TF-IDF column, so the word that
### belongs to a column index has to be looked up in the fitted vectorizer's
### vocabulary; `words` is not defined in any other cell of this notebook.
words = vectorizer.get_feature_names()

### list every feature the tree actually used (non-zero importance)
for index, importance in enumerate(clf.feature_importances_):
    if importance > 0.0:
        print("feature no %s" % index)
        print("importance %s" % importance)
        print("word %s" % words[index])

_Note: At this point, I had to give up on the mini project. Something must have gone wrong in an earlier lesson._