In [1]:
import pandas as pd
from sklearn import preprocessing

### Data pre-processing

Only alter the data pre-processing code if you have completed the challenge for that section.

In [2]:
# Location of the "adult" data file that is downloaded below
csv_file_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names are passed to read_csv explicitly via `names=`
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "target"
]


data_original = pd.read_csv(csv_file_uri, names=column_names, index_col=False)

# True  -> replace every text column by one integer-coded column (LabelEncoder)
# False -> replace every text column by one 0/1 indicator column per category (get_dummies)
USE_LABEL_ENCODER = False


if USE_LABEL_ENCODER:

    # drop() returns a new frame, so we always have the original data to refer to.
    # The US weights (fnlwgt) don't have any value for us.
    data = data_original.drop(columns=["fnlwgt"])

    def convert_target_variable(text):
        """Turn the raw target text into a binary value: 0 for " <=50K", 1 for anything else."""
        # The comparison keeps the leading space that the raw values carry.
        return 0 if text == " <=50K" else 1

    data["target"] = data["target"].apply(convert_target_variable)

    encoded_columns = []
    # select_dtypes returns a separate frame, so adding columns to `data`
    # inside the loop is safe. Selecting "everything that is not a number"
    # (instead of comparing against "object") also picks up text columns
    # that pandas stores with a dedicated string dtype.
    for c in data.select_dtypes(exclude="number").columns:
        encoded_name = "{}_encoded".format(c)
        if encoded_name in data.columns:
            print("{}_encoded already exists".format(c))
            continue
        encoder = preprocessing.LabelEncoder()
        data[encoded_name] = encoder.fit_transform(data[c].values)
        encoded_columns.append(c)

    # The text columns are replaced by their *_encoded counterparts
    print("Dropping the encoded columns {}".format(encoded_columns))
    data = data.drop(columns=encoded_columns)

else:

    # drop() returns a new frame, so we always have the original data to refer to.
    # The US weights (fnlwgt) don't have any value for us.
    data_pre_dummies = data_original.drop(columns=["fnlwgt"])

    # dtype=int keeps the indicator columns numeric (0/1). Since pandas 2.0
    # get_dummies returns booleans by default, which would turn the feature
    # matrix into an object array and the labels into True/False.
    data = (
        pd.get_dummies(data_pre_dummies, dtype=int)
        # One indicator is enough for a binary target: keep only the " >50K" one ...
        .drop(columns=["target_ <=50K"])
        # ... and call it "target"
        .rename(columns={"target_ >50K": "target"})
    )

In [3]:
def anything(string):
    """Return `string` with every "$" character removed."""
    # Splitting on "$" and joining the pieces back together drops each "$"
    return "".join(string.split("$"))

print(anything("$1.0"))

1.0


---
### Model

Now that all of the data is numerical, we can plug it into pretty much any classification model. First we'll convert the data into a matrix of features - the data we want to predict from - and an array of labels - the target variable that indicates whether someone makes more than 50k or not.

In [4]:
# Every column except the label is used as a feature
feature_columns = list(data.columns)
del feature_columns[feature_columns.index("target")]

In [5]:
# Feature matrix: the values of all feature columns
X = data.loc[:, feature_columns].values
# Label array: the values of the target column
y = data.loc[:, "target"].values

print(X.shape, y.shape, sep="\n")

(32561, 107)
(32561,)


In [6]:
# Quick tips on how to index matrices/arrays
# The pattern is X[rows, columns]; a bare ":" selects everything along that axis.

# The first ROW (row 0, all columns)
print(X[0,:])

# The first TWO ROWS (rows 0 and 1, all columns)
print(X[:2,:])

# The last ROW (negative indices count from the end)
print(X[-1,:])

# The first 3 ROWS with only the last TWO COLUMNS
print(X[:3,-2:])

[  39   13 2174    0   40    0    0    0    0    0    0    0    1    0
    0    0    0    0    0    0    0    0    0    1    0    0    0    0
    0    0    0    0    0    0    1    0    0    0    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    1    0    0
    0    0    0    0    0    0    1    0    1    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    1    0    0]
[[  39   13 2174    0   40    0    0    0    0    0    0    0    1    0
     0    0    0    0    0    0    0    0    0    1    0    0    0    0
     0    0    0    0    0    0    1    0    0    0    1    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    1    0    0
     0    0    0    0    0    0    1    0    1    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0   

Let's use a model from **scikit-learn**: LogisticRegression

For those interested, see the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression).

In [7]:
from sklearn import linear_model

# Create the logistic regression classifier (a classification model, despite
# the word "regression" in its name).
# The solver is pinned on purpose: the outputs recorded in this notebook were
# produced with the library's old default (shown as solver='warn', which
# selected liblinear). Newer scikit-learn releases default to a different
# solver, which gives different coefficients on these unscaled features.
clf = linear_model.LogisticRegression(solver="liblinear")

In [8]:
# MAGIC
# 
# Train the model using the training sets:
# fit() estimates the model coefficients from the features X and the labels y.
clf.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Position of the row to predict (-1 = the last row)
IX = -1

# Make predictions using the testing set.
# For now we'll use the last value of the training set.
# The model expects a 2-D array (one row per sample), hence the reshape.
sample = X[IX, :].reshape(1, -1)
pred = clf.predict(sample)

# Print the data
# X was built row by row from data_original, so the same *position* selects
# the matching raw record. iloc is positional; feeding it an index label only
# works by accident while the index is the default 0..n-1 range.
print(data_original.iloc[IX])

# and the prediction
print("Predict its a {}".format(pred))

probability = clf.predict_proba(sample)
# The columns of predict_proba are ordered like clf.classes_, so look up the
# column of the predicted class instead of using the class label as a position.
pred_column = list(clf.classes_).index(pred[0])
print("With a probability of {}".format(probability[0, pred_column]))

age                                52
workclass                Self-emp-inc
fnlwgt                         287927
education                     HS-grad
education-num                       9
marital-status     Married-civ-spouse
occupation            Exec-managerial
relationship                     Wife
race                            White
sex                            Female
capital-gain                    15024
capital-loss                        0
hours-per-week                     40
native-country          United-States
target                           >50K
Name: 32560, dtype: object
Predict its a [1]
With a probability of 0.9941014610337585


In [10]:
# Score the model on the very same data it was trained on
training_accuracy = clf.score(X, y)
print("This model has an overall accuracy of {}".format(training_accuracy))

This model has an overall accuracy of 0.8512330702374006


In [11]:
# How did the model do this? It learned one coefficient per feature column.
# print(clf.coef_) shows all of them; here we only summarise them.
coef_summary = clf.coef_[0]

print("The average coefficient has a value of {}".format(coef_summary.mean()))
print("The max and min coefficient are {} and {}".format(coef_summary.max(), coef_summary.min()))
print("The standard deviation of the coefficients is {}".format(coef_summary.std()))

The average coefficient has a value of -0.20262863513587398
The max and min coefficient are 1.0598425080570655 and -1.8116655706602889
The standard deviation of the coefficients is 0.4308833318837741


In [12]:
# Summary statistics of the coefficients (one coefficient per feature column)
coefficients = clf.coef_[0]
mean_coef, std_coef = coefficients.mean(), coefficients.std()
max_coef, min_coef = coefficients.max(), coefficients.min()

# Pretty print with the column names.
# Only the extreme coefficients are shown: those lying more than
# 1.5 standard deviations away from the average coefficient.
for ix, c in enumerate(feature_columns):
    this_coef = coefficients[ix]
    deviation = (this_coef - mean_coef) / std_coef
    if abs(deviation) > 1.5:
        print("Column {} is {}".format(c, this_coef))

Column education_ Doctorate is 0.672107352478155
Column education_ Prof-school is 0.7134442571547988
Column marital-status_ Married-civ-spouse is 0.5632290551855195
Column marital-status_ Never-married is -1.2582861443544797
Column occupation_ Exec-managerial is 0.6061721917361638
Column occupation_ Farming-fishing is -1.0360027590408043
Column occupation_ Other-service is -1.03528273652643
Column occupation_ Tech-support is 0.5255208514307834
Column relationship_ Own-child is -1.4832499622264714
Column relationship_ Wife is 1.0598425080570655
Column sex_ Female is -1.8116655706602889
Column sex_ Male is -0.9239603834974577


**The above is incredibly bad practice (to the point where I should be fired for even showing you). Why?**