In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # advanced plotting library
#
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

from tensorflow import keras

In [57]:
# Additional functions
def split_labels(data, label_feature):
    """
    Separate one column of a data set from the rest of it.

    Returns a ``(features, labels)`` tuple: ``features`` is ``data`` with the
    ``label_feature`` column removed, and ``labels`` is that column selected
    from ``data``. ``data`` itself is left unchanged.
    """
    features = data.drop(columns=label_feature)
    labels = data[label_feature]
    return features, labels

In [58]:
# Column Transformers
class DropBadRowsTransformer(BaseEstimator, TransformerMixin):
    """
    Row filter that keeps only the rows whose ``column_to_drop_from`` value
    equals ``value_to_drop``.

    NOTE: despite its name, ``value_to_drop`` is the value that is *kept*;
    every row holding any other value in that column is removed.

    The rows are removed **in place** from ``X`` (and from ``y`` when it is
    given) during ``fit()``, because ``fit()`` is the method that receives
    both objects. ``transform()`` returns its input unchanged.

    Parameters
    ----------
    column_to_drop_from : column label
        Column of ``X`` that is inspected.
    value_to_drop : object
        Rows whose value in that column differs from this are dropped.
    """

    def __init__(self, column_to_drop_from, value_to_drop):
        super().__init__()
        self.column_to_drop_from = column_to_drop_from
        self.value_to_drop = value_to_drop

    def fit(self, X, y=None):
        """Drop the non-matching rows from ``X`` and ``y`` in place; return ``self``."""
        rows_to_drop = X[X[self.column_to_drop_from] != self.value_to_drop].index
        X.drop(rows_to_drop, inplace=True)
        # y is optional in the signature, so only touch it when it was supplied.
        if y is not None:
            y.drop(rows_to_drop, inplace=True)
        return self

    def transform(self, X, y=None):
        """Return ``X`` as-is; the filtering already happened in ``fit()``."""
        return X

class ProtectXy(BaseEstimator, TransformerMixin):
    """
    Guard for data sets that later pipeline steps modify in place.

    Other transformers in this notebook mutate ``X`` and ``y`` inside
    ``fit()`` (a workaround for ``transform()`` having no ``y`` parameter).
    To keep repeated fits from compounding those mutations, this step:

    * on the first ``fit()`` with a given ``X``/``y`` pair, remembers the
      objects themselves and stores a copy of each;
    * on a later ``fit()`` with the *same objects* (identity check), writes
      the stored copies back into them.

    NOTE(review): restoring relies on the private pandas method
    ``_update_inplace``; confirm it is still available when upgrading pandas.
    """
    def __init__(self):
        super().__init__()
        self.X_ref_ = None
        self.y_ref_ = None

    def fit(self, X, y=None, **kwargs):
        """Snapshot ``X``/``y`` on first sight, restore them on a repeat fit; return ``self``."""
        if self.X_ref_ is X and self.y_ref_ is y:
            # Same objects as last time: roll them back to the saved snapshot.
            X._update_inplace(self.X_copy_)
            if y is not None:
                y._update_inplace(self.y_copy_)
        else:
            # New objects: remember them and keep a pristine copy of each.
            self.X_ref_, self.y_ref_ = X, y
            self.X_copy_ = X.copy()
            # y is optional in the signature; there is nothing to copy without it.
            self.y_copy_ = None if y is None else y.copy()
        return self

    def transform(self, X):
        """Return ``X`` unchanged; all the work is done in ``fit()`` because it needs ``y``."""
        return X

# class DropColumnsTransformer(BaseEstimator, TransformerMixin):

#     def __init__(self, columns_to_drop):
#         super().__init__()
#         self.columns_to_drop = columns_to_drop

#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         # Drop the columns that we want to drop
#         for column in self.columns_to_drop:
#             X.drop(column, axis=1, inplace=True)
#         return X


class ConvertRealClassificationValuesToHumanReadableStrings(BaseEstimator, TransformerMixin):
    """
    Replace every cell of a DataFrame with its entry in ``label_map``.

    Parameters
    ----------
    label_map : mapping
        Maps each raw cell value to its replacement. A cell value that is not
        a key of the mapping raises ``KeyError`` during ``transform()``.
    """

    def __init__(self, label_map=None):
        super().__init__()
        self.label_map = label_map

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame with each value looked up in ``label_map``."""
        lookup = self.label_map
        # Column-wise Series.map with a function instead of DataFrame.applymap:
        # applymap is deprecated in recent pandas, while this form works on old
        # and new versions alike. Mapping through a function (not the dict
        # itself) keeps the KeyError for unknown values instead of yielding NaN.
        return X.apply(lambda column: column.map(lambda value: lookup[value]))

    
class ClassifierToNumericalValueTransformer(BaseEstimator, TransformerMixin):
    """
    Turn categorical columns into 0/1 flags.

    Parameters
    ----------
    features_to_apply_to : list of column labels
        Columns that get converted.
    classifications_kept : list
        Values that become 1; every other value becomes 0.
    """
    def __init__(self, features_to_apply_to=None, classifications_kept=None):
        super().__init__()
        self.features_to_apply_to = features_to_apply_to
        self.classifications_kept = classifications_kept

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns replaced by 0/1 flags."""
        def as_flag(value):
            if value in self.classifications_kept:
                return 1
            return 0

        converted = X.copy()
        for column_name in self.features_to_apply_to:
            converted[column_name] = converted[column_name].apply(as_flag)
        return converted

    
class AddNewFeatureFromSeveralLabelsTransformer(BaseEstimator, TransformerMixin):
    """
    Add a boolean column that is True where any of several columns equals 1.

    Example (as a one-off expression):
        data['Hallucinogenic User'] = np.any(
            [(data[column] == 1) for column in ['Ketamine', 'LSD', 'Mushrooms']], axis=0)

    Parameters
    ----------
    new_column : column label
        Name of the column to add.
    several_labels : list of column labels
        Columns that are compared against 1.
    """
    def __init__(self, new_column, several_labels):
        super().__init__()
        self.new_column = new_column
        self.several_labels = several_labels

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the ``new_column`` flag."""
        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        X[self.new_column] = np.any([(X[label] == 1) for label in self.several_labels], axis=0)
        return X


# class IsCasualDrugUserTransformer(BaseEstimator, TransformerMixin):
#     """
#     """
#     def __init__(self, several_labels):
#         super().__init__()
#         self.several_labels = several_labels

#     def fit(self, X, y=None, **kwargs):
#         return self

#     def transform(self, X, y=None):
#         # Find all non-hard drug users who use casual drugs
#         X['casual_drug_user'] = np.any([(X[label] == 1) for label in self.several_labels], axis=0) & ~X['hard_drug_user']
#         return X


class IsCollegeEducated():
    """
    Flag education labels that count as a college degree.

    ``transform()`` returns ``X.isin(...)`` against the labels stored in
    ``college_education``.

    NOTE(review): unlike the other transformers in this notebook, this class
    does not derive from BaseEstimator / TransformerMixin.
    """
    def __init__(self):
        super().__init__()
        self.college_education = [
            "Bachelor's",
            "Master's",
            "Ph.D.",
        ]

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the boolean membership mask of ``X`` in ``college_education``."""
        is_degree = X.isin(self.college_education)
        return is_degree

class IsHighSchoolDropout():
    """
    Flag education labels that are treated as leaving school early.

    ``transform()`` returns ``X.isin(...)`` against the labels stored in
    ``dropout_years``.

    NOTE(review): unlike the other transformers in this notebook, this class
    does not derive from BaseEstimator / TransformerMixin.
    """
    def __init__(self):
        super().__init__()
        # TODO: add to the assumptions that early graduations go on to become college educated
        self.dropout_years = [
            "<16 yrs old",
            "16 yrs old",
            "17 yrs old",
        ]

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the boolean membership mask of ``X`` in ``dropout_years``."""
        left_early = X.isin(self.dropout_years)
        return left_early

In [59]:
# Fix the display: show every column of wide frames.
# The fully qualified key is used because the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)
# Load in the data and fix the header names
data = pd.read_csv('drug_consumption_with_headers.data')
data = data.rename(columns=lambda x: x.strip())  # Remove extra spaces from headers
# Split off the label we are trying to predict
X, y = split_labels(data, 'Heroin')
# Convert the y labels to a numerical value: 1.0 for CL3-CL6, 0.0 otherwise
y = y.isin(['CL3', 'CL4', 'CL5', 'CL6']).astype(float)

# Split off the training set from the testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
# Drug-usage columns used as input features ('Heroin' is left out on purpose).
drug_labels = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis',
    'Choc', 'Coke', 'Crack', 'Ecstacy', 'Ketamine', 'Legalh',
    'LSD', 'Meth', 'Mushrooms', 'Nicotine', 'VSA',
]

# Usage classes, from lowest ('CL0') to highest ('CL6'):
#   CL0  Never Used
#   CL1  Used over a Decade Ago
#   CL2  Used in Last Decade
#   CL3  Used in Last Year
#   CL4  Used in Last Month
#   CL5  Used in Last Week
#   CL6  Used in Last Day
user_classifications = ['CL3', 'CL4', 'CL5', 'CL6']

# Encoded 'real' value -> age bracket
label_map_age = {
    -0.95197: '18-24',
    -0.07854: '25-34',
    0.49788: '35-44',
    1.09449: '45-54',
    1.82213: '55-64',
    2.59171: '65+',
}

# Encoded 'real' value -> education level
label_map_edu = {
    -2.43591: '<16 yrs old',
    -1.73790: '16 yrs old',
    -1.43719: '17 yrs old',
    -1.22751: '18 yrs old',
    -0.61113: 'Some college or uni., no cert. or degree',
    -0.05921: 'Prof. cert./diploma',
    0.45468: "Bachelor's",
    1.16365: "Master's",
    1.98437: 'Ph.D.',
}

# Encoded 'real' value -> binary gender flag
label_map_gender = {
    -0.48246: 0,
    0.48246: 1,
}

# Human-readable labels in the insertion order of the maps above
edu_labels = list(label_map_edu.values())
age_labels = list(label_map_age.values())
all_labels = X_train.columns

In [61]:
# Personality-score columns
labels_personality = [
    'Nscore (Real) [neuroticism]',
    'Escore (Real) [Extraversion]',
    'Oscore (Real) [Openness to experience]',
    'Ascore (Real) [Agreeableness]',
    'Cscore (Real) [Conscientiousness]',
    'Impulsive (Real)',
    'SS (Real) [sensation seeing (sic)]',
]

# Column names given to the transformed training matrix, in order:
# the two derived education flags, the education levels, the age brackets,
# gender, the drug columns and finally the personality scores.
labels_remaining = (
    ['is_college_educated', 'is_high_school_dropout']
    + list(label_map_edu.values())
    + list(label_map_age.values())
    + ['Gender (Real)']
    + list(drug_labels)
    + labels_personality
)

In [62]:
# Preprocessing pipeline: keep only the samples whose 'Semer' value is 'CL0',
# then drop the unused columns (including 'Semer') and encode the remaining ones.
pipeline = Pipeline(steps=[
    ('protect_xy', ProtectXy()),
    ('drop_semer_samples', DropBadRowsTransformer(column_to_drop_from='Semer', value_to_drop='CL0')),
    ('col_trans', ColumnTransformer(
        transformers=[
            (
                'drop_unneeded',
                'drop',
                ['Semer', 'ID', 'Country (Real)', 'Ethnicity (Real)'],
            ),
            (
                'edu_feature_pl_college_educated',
                Pipeline(steps=[
                    ('convert_edu_real_values_to_human_readable_string_1',
                     ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
                    ('add_new_feature_college_educated', IsCollegeEducated()),
                ]),
                ['Education (Real)'],
            ),
            (
                'edu_features_pl_highschool_dropout',
                Pipeline(steps=[
                    ('convert_edu_real_values_to_human_readable_string_1',
                     ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
                    ('add_new_feature_highschool_dropout', IsHighSchoolDropout()),
                ]),
                ['Education (Real)'],
            ),
            (
                'edu_encoding_pl',
                Pipeline(steps=[
                    ('convert_edu_real_values_to_human_readable_string_2',
                     ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
                    ('one_hot_encoding_edu', OneHotEncoder(drop=None, categories=[edu_labels])),
                ]),
                ['Education (Real)'],
            ),
            (
                'age',
                Pipeline(steps=[
                    ('convert_age_real_values_to_human_readable_string',
                     ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_age)),
                    ('one_hot_encoding_age', OneHotEncoder(drop=None, categories=[age_labels])),
                ]),
                ['Age (Real)'],
            ),
            (
                'gender',
                Pipeline(steps=[
                    ('convert_gender_real_vals_to_binary',
                     ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_gender)),
                ]),
                ['Gender (Real)'],
            ),
            (
                'drug_classifications_to_users_pl',
                Pipeline(steps=[
                    ('classification_to_num_val',
                     ClassifierToNumericalValueTransformer(
                         features_to_apply_to=drug_labels,
                         classifications_kept=user_classifications,
                     )),
                ]),
                drug_labels,
            ),
        ],
        remainder='passthrough',
    )),
])
X_trans = pd.DataFrame(
    pipeline.fit_transform(X_train, y_train),
    columns=labels_remaining,
)
X_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1501 entries, 0 to 1500
Data columns (total 42 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   is_college_educated                       1501 non-null   float64
 1   is_high_school_dropout                    1501 non-null   float64
 2   <16 yrs old                               1501 non-null   float64
 3   16 yrs old                                1501 non-null   float64
 4   17 yrs old                                1501 non-null   float64
 5   18 yrs old                                1501 non-null   float64
 6   Some college or uni., no cert. or degree  1501 non-null   float64
 7   Prof. cert./diploma                       1501 non-null   float64
 8   Bachelor's                                1501 non-null   float64
 9   Master's                                  1501 non-null   float64
 10  Ph.D.                               

In [63]:
# data = np.array(X_trans.copy()).astype(np.float32)
# data_y = np.array(y_train.copy()).astype(np.float32)
# # import tensorflow as tf
# # X_train = tf.convert_to_tensor(data)
# # y_train = tf.convert_to_tensor(data_y)
# # Jeff took 5000/60000, 1/12 of the original for his validation set. We will take ~1/12 of 1501, 125
# X_valid, X_train_subset = data[:125], data[125:]
# y_valid, y_train_subset = data_y[:125], data_y[125:]

# # Create the model using 2 hidden layers with 300 and 100 neurons using ReLU
# model = keras.models.Sequential()
# model.add(keras.layers.Flatten(input_shape=X_train_subset[0].shape, name="input"))
# model.add(keras.layers.BatchNormalization()) # Add after every layer
# model.add(keras.layers.Dense(300, activation=keras.activations.relu, kernel_initializer="he_normal", name="hidden-1"))
# model.add(keras.layers.BatchNormalization()) # Batch
# model.add(keras.layers.Dense(100, activation=keras.activations.relu, name="hidden-2"))
# model.add(keras.layers.BatchNormalization()) # Batch
# model.add(keras.layers.Dense(10, activation=keras.activations.softmax, name="output"))

# # Compile the model using the SGD optimizer (LR=0.01 - the default), categorical cross-entropy loss, and an accuracy metric
# model.compile(optimizer=keras.optimizers.SGD(lr=0.01), 
#               loss=keras.losses.sparse_categorical_crossentropy, 
#               metrics=['accuracy']
#               )

# # Fit the model to the training data using 10 epochs
# model.fit(X_train_subset, y_train_subset, epochs=10, validation_data=(X_valid, y_valid))

# # Evaluate the testing performance
# model.evaluate(X_test, y_test)

In [64]:
# The cell above keeps breaking, so I'm trying this a different way
# NOTE: `data` is re-bound here from the raw DataFrame to the transformed NumPy array.
data = np.array(X_trans.copy()).astype(np.float32)
data_y = np.array(y_train.copy()).astype(np.float32)
# The features and labels are sliced by position below, so they must line up.
assert len(data) == len(data_y), "X_trans and y_train have different lengths"
# Jeff took 5000/60000, 1/12 of the original for his validation set. We will take ~1/12 of 1501, 125
X_valid, X_train_subset = data[:125], data[125:]
y_valid, y_train_subset = data_y[:125], data_y[125:]

# Create the layers for the deep and wide model
input_ = keras.layers.Input(shape=X_train_subset[0].shape)
hidden1 = keras.layers.Dense(300, activation="relu")
hidden2 = keras.layers.Dense(100, activation="relu")
concat = keras.layers.Concatenate()
# The label is binary (0.0 / 1.0), so two output units are enough.
# The softmax activation is required: sparse_categorical_crossentropy, as used
# below, treats the model output as probabilities (from_logits defaults to False).
output_layer = keras.layers.Dense(2, activation="softmax")

# Create the connections between the layers
output = hidden1(input_)
output = hidden2(output)
output = concat([input_, output])  # wide path: the raw input skips the hidden layers
output = output_layer(output)

# Create the model
model = keras.Model(inputs=[input_], outputs=[output])

# Compile the model
model.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              loss=keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'])

# Fit the model
history = model.fit(X_train_subset, y_train_subset, validation_data=(X_valid, y_valid), epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [65]:
def build_model(n_neurons=100, n_hidden_layers=3, learning_rate=0.01, input_shape=None):
    """
    Build and compile (but do not fit) a fully connected Keras model.

    Parameters
    ----------
    n_neurons : int
        Units in each hidden layer.
    n_hidden_layers : int
        Number of hidden ReLU layers.
    learning_rate : float
        Learning rate of the SGD optimizer.
    input_shape : tuple, optional
        Shape of one input sample. Defaults to the per-sample shape of the
        notebook's ``X_train_subset`` array.

    Returns
    -------
    The compiled model, with a single linear output unit and MSE loss.

    NOTE(review): the notebook's label is binary; a linear output with MSE
    looks like a regression setup - confirm this is intended.
    """
    if input_shape is None:
        # `X_train` is a pandas DataFrame here, so `X_train[0]` would look up a
        # column labelled 0 instead of the first sample. Take the shape from the
        # NumPy training array instead.
        input_shape = X_train_subset.shape[1:]

    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=input_shape))
    for _ in range(n_hidden_layers):
        model.add(keras.layers.Dense(n_neurons, activation="relu"))
    model.add(keras.layers.Dense(1))

    model.compile(optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
                  loss="mse")

    # This method must return the compiled, but not fit, model
    return model