In [None]:
import pandas as pd
from sklearn import preprocessing

### Data pre-processing

Only alter the data pre-processing code if you have completed the challenge for that section.

In [None]:
# The data URI
csv_file_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Passing names= supplies the header, so the first line of the file is read as data
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "target"
]


# index_col=False forces pandas not to use the first column as the index
data_original = pd.read_csv(csv_file_uri, names=column_names, index_col=False)

# True  -> replace every text column with integer codes (LabelEncoder)
# False -> replace every text column with 0/1 indicator columns (pd.get_dummies)
USE_LABEL_ENCODER = False


if USE_LABEL_ENCODER:

    # drop() returns a new frame, so we always have the original data to refer to.
    # Drop the US weights (don't have any value)
    data = data_original.drop(columns=["fnlwgt"])

    # Create a function that changes the text to a simple binary value
    def convert_target_variable(text):
        """Return 0 for the " <=50K" label (note the leading space) and 1 otherwise."""
        return 0 if text == " <=50K" else 1

    data["target"] = data["target"].apply(convert_target_variable)

    # Add a "<column>_encoded" integer column for every text column and
    # remember which originals to drop afterwards.
    encoded_columns = []
    for c in data.columns:
        if data[c].dtype == "object":
            if "{}_encoded".format(c) not in data.columns:
                encoder = preprocessing.LabelEncoder()
                data["{}_encoded".format(c)] = encoder.fit_transform(data[c].values)
                encoded_columns.append(c)
                encoder = None
            else:
                print("{}_encoded already exists".format(c))

    print("Dropping the encoded columns {}".format(encoded_columns))
    data = data.drop(columns=encoded_columns)

else:

    # drop() returns a new frame, so we always have the original data to refer to.
    # Drop the US weights (don't have any value)
    data_pre_dummies = data_original.drop(columns=["fnlwgt"])

    # dtype=int keeps the indicator columns (and therefore the target) numeric
    # 0/1. Without it, pandas >= 2.0 produces boolean columns, and a boolean
    # target breaks any later code that uses the predicted label as an index.
    data = (
        pd.get_dummies(data_pre_dummies, dtype=int)
        # The two target indicators are complementary; keep only " >50K"
        .drop(columns=["target_ <=50K"])
        # Rename the target
        .rename(columns={"target_ >50K": "target"})
    )

In [None]:
def anything(string):
    """Return ``string`` with every "$" character removed."""
    # Cut the text at each "$" and glue the remaining pieces back together
    pieces = string.split("$")
    return "".join(pieces)


print(anything("$1.0"))

---
### Model

Now that the data is numerical, we can plug it into pretty much any classification model. First we'll convert the data into a matrix with our features - that is, the data we want to predict from - and an array with our labels - the target variable that indicates whether someone makes more than 50k or not.

In [None]:
# Every column except the label column is used as a model input
feature_columns = list(data.columns)
feature_columns.remove("target")

In [None]:
# Features: a 2-D array with one row per record and one column per feature
X = data.loc[:, feature_columns].values
# Labels: a 1-D array holding the target value of each record
y = data.loc[:, "target"].values

# Both are taken from the same frame, so the row counts must match
print(X.shape)
print(y.shape)

In [None]:
# Quick tips on how to index matrices/arrays
# The pattern is X[rows, columns]; a bare ":" means "everything on this axis".

# The first ROW (row 0, all columns)
print(X[0,:])

# The first TWO ROWS (rows 0 and 1; the stop value 2 is excluded)
print(X[:2,:])

# The last ROW (negative indices count from the end)
print(X[-1,:])

# The first 3 ROWS with only the last TWO COLUMNS
print(X[:3,-2:])

Let's use a model from **scikit-learn**: LogisticRegression

For those interested in the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [None]:
from sklearn import linear_model

# Create the logistic regression classifier (default settings, not yet trained)
clf = linear_model.LogisticRegression()

In [None]:
# MAGIC
# 
# Train the model: fit the classifier on the whole feature matrix X and its
# labels y. Nothing is held out, so every row is training data.
clf.fit(X, y)

In [None]:
# Position of the row to predict (negative values count from the end)
IX = -1

# Make predictions using the testing set.
# For now we'll use the last value of the training set.
# The single row is reshaped to a 2-D (1, n_features) array before it is
# handed to the model.
sample = X[IX, :].reshape(1, -1)
pred = clf.predict(sample)

# Print the data
# IX is a position, so index by position directly instead of translating it
# through the index labels first.
print(data_original.iloc[IX])

# and the prediction
print("Predict its a {}".format(pred))

# predict_proba returns one column per entry of clf.classes_, in that order.
# Look the predicted label up in clf.classes_ instead of using the label
# itself as a column number: that shortcut is only right when the labels are
# exactly the integers 0..k-1, and a boolean label would act as a mask.
probability = clf.predict_proba(sample)
predicted_column = list(clf.classes_).index(pred[0])
print("With a probability of {}".format(probability[0, predicted_column]))

In [None]:
# How good is the model? Score it on the same data it was trained on.
training_accuracy = clf.score(X, y)
print("This model has an overall accuracy of {}".format(training_accuracy))

In [None]:
# How did the model do this? We can see the coefficients
# for each column using
# print(clf.coef_)

# Summarise the first row of clf.coef_ instead of printing every value
coefficients = clf.coef_[0]

print("The average coefficient has a value of {}".format(coefficients.mean()))
print("The max and min coefficient are {} and {}".format(coefficients.max(), coefficients.min()))
print("The standard deviation of the coefficients is {}".format(coefficients.std()))

In [None]:
coefficients = clf.coef_[0]

mean_coef, std_coef = coefficients.mean(), coefficients.std()
max_coef, min_coef = coefficients.max(), coefficients.min()

# Pretty print with the column names.
# Only the very extreme coefficients are printed: those lying more than
# 1.5 standard deviations away from the mean coefficient.
for c, this_coef in zip(feature_columns, coefficients):
    distance_in_stds = (this_coef - mean_coef) / std_coef
    if abs(distance_in_stds) > 1.5:
        print("Column {} is {}".format(c, this_coef))

**The above is incredibly bad practice (up to the point where I should be fired for even showing you). Why?**