In [228]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # advanced plotting library
#
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

In [229]:
# Additional functions
def split_labels(data, label_feature):
    """
    Separate one feature from the rest of a data set.

    Parameters
    ----------
    data : pandas.DataFrame
        The full data set. It is not modified.
    label_feature : str
        Name of the column to split off.

    Returns
    -------
    tuple
        ``(remaining, labels)`` where ``remaining`` is ``data`` without
        ``label_feature`` and ``labels`` is ``data[label_feature]``.
    """
    remaining = data.drop(columns=label_feature)
    labels = data[label_feature]
    return remaining, labels

In [235]:
# Column Transformers
class DropBadRowsTransformer(BaseEstimator, TransformerMixin):
    """
    Remove, in place, every row whose ``column_to_drop_from`` value differs
    from ``value_to_drop``.

    NOTE: despite its name, ``value_to_drop`` is the value that is *kept*;
    rows holding any other value in that column are the ones removed.

    The rows are removed inside ``fit`` rather than ``transform`` because
    ``fit`` is the only hook that also receives ``y``, and the labels have to
    lose the same rows as the features. As a consequence the objects passed
    to ``fit`` are mutated.

    Parameters
    ----------
    column_to_drop_from : str
        Column of ``X`` that is inspected.
    value_to_drop : object
        Rows are kept only when the inspected column equals this value.
    """

    def __init__(self, column_to_drop_from, value_to_drop):
        super().__init__()
        self.column_to_drop_from = column_to_drop_from
        self.value_to_drop = value_to_drop

    def fit(self, X, y=None):
        """Drop the non-matching rows from ``X`` (and from ``y`` when given), in place."""
        mismatched = X[self.column_to_drop_from] != self.value_to_drop
        rows_to_drop = X.index[mismatched.to_numpy()]
        X.drop(rows_to_drop, inplace=True)
        # y is optional in the estimator API; only touch it when it was supplied.
        if y is not None:
            y.drop(rows_to_drop, inplace=True)
        return self

    def transform(self, X, y=None):
        """Return ``X`` unchanged; the filtering already happened in ``fit``."""
        return X

class ProtectXy(BaseEstimator, TransformerMixin):
    """
    Snapshot ``X`` and ``y`` so that repeated fits start from the same data.

    Other transformers in this notebook modify ``X`` and ``y`` in place inside
    ``fit()`` (a workaround for ``transform()`` having no ``y`` parameter).
    This step makes such a pipeline re-fittable:

    * The first time a given ``(X, y)`` pair is seen, references to both
      objects are remembered and copies of their contents are stored.
    * When ``fit`` is called again with the *same objects* (identity check),
      their contents are overwritten in place with the stored copies, undoing
      whatever in-place changes were made since the snapshot.

    It does not stop later steps from mutating ``X`` and ``y`` after this step
    has run.
    """
    def __init__(self):
        super().__init__()
        # Identity references to the objects most recently snapshotted.
        self.X_ref_ = None
        self.y_ref_ = None

    def fit(self, X, y=None, **kwargs):
        """Restore ``X``/``y`` from the snapshot if already seen, else take a snapshot."""
        already_seen = self.X_ref_ is X and self.y_ref_ is y
        if already_seen:
            # Restore from a fresh copy: _update_inplace hands the donor's
            # internals to the target, so passing the snapshot itself would
            # let later in-place edits leak back into the snapshot.
            X._update_inplace(self.X_copy_.copy())
            if y is not None:
                y._update_inplace(self.y_copy_.copy())
        else:
            self.X_ref_, self.y_ref_ = X, y
            self.X_copy_ = X.copy()
            # y is optional in the estimator API; there is nothing to snapshot without it.
            self.y_copy_ = None if y is None else y.copy()
        return self

    def transform(self, X): return X  # due to the need for modifying and using the y variable, the work is done above

# class DropColumnsTransformer(BaseEstimator, TransformerMixin):

#     def __init__(self, columns_to_drop):
#         super().__init__()
#         self.columns_to_drop = columns_to_drop

#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         # Drop the columns that we want to drop
#         for column in self.columns_to_drop:
#             X.drop(column, axis=1, inplace=True)
#         return X


class ConvertRealClassificationValuesToHumanReadableStrings(BaseEstimator, TransformerMixin):
    """
    Replace every cell of ``X`` by its entry in ``label_map``.

    Parameters
    ----------
    label_map : mapping
        Maps each raw ('real') value found in the data to its replacement.
        A cell value that is not a key of the mapping raises ``KeyError``
        during ``transform``.
    """

    def __init__(self, label_map=None):
        super().__init__()
        self.label_map = label_map

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame in which each value has been looked up in ``label_map``."""
        def to_label(value):
            return self.label_map[value]

        # Replace 'real' scores with their categorical string.
        # DataFrame.applymap is deprecated in favour of DataFrame.map in newer
        # pandas; use the new name when available and fall back otherwise.
        if hasattr(pd.DataFrame, "map"):
            return X.map(to_label)
        return X.applymap(to_label)

    
class ClassifierToNumericalValueTransformer(BaseEstimator, TransformerMixin):
    """
    Binarise classification columns: 1 when the value is one of
    ``classifications_kept``, 0 otherwise.

    Parameters
    ----------
    features_to_apply_to : list of str, optional
        Column headers to which the transformation is applied. ``None``
        (the default) applies it to every column of ``X``.
    classifications_kept : list of str
        Classification levels that are encoded as 1.
    """
    def __init__(self, features_to_apply_to=None, classifications_kept=None):
        super().__init__()
        self.features_to_apply_to = features_to_apply_to
        self.classifications_kept = classifications_kept

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; ``y`` is left untouched. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns replaced by 0/1 integers."""
        X = X.copy()
        if self.features_to_apply_to is None:
            features = list(X.columns)
        else:
            features = self.features_to_apply_to
        for feature in features:
            # Vectorised membership test; int64 matches what the element-wise
            # 1/0 mapping used previously produced.
            X[feature] = X[feature].isin(self.classifications_kept).astype("int64")
        return X

    
class AddNewFeatureFromSeveralLabelsTransformer(BaseEstimator, TransformerMixin):
    """
    Add a boolean column that is True when any of several columns equals 1.

    Example of the equivalent one-off expression:

    # data_headers_modded['Hallucinogenic User'] = np.any([(data_headers_modded[column] == 1) for column in ['Ketamine', 'LSD', 'Mushrooms']], axis=0)

    Parameters
    ----------
    new_column : str
        Name of the column to create (or overwrite).
    several_labels : list of str
        Columns that are combined; each is compared against 1.
    """
    def __init__(self, new_column, several_labels):
        super().__init__()
        self.new_column = new_column
        self.several_labels = several_labels

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``new_column`` added."""
        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        X[self.new_column] = np.any([(X[label] == 1) for label in self.several_labels], axis=0)
        return X


# class IsCasualDrugUserTransformer(BaseEstimator, TransformerMixin):
#     """
#     """
#     def __init__(self, several_labels):
#         super().__init__()
#         self.several_labels = several_labels

#     def fit(self, X, y=None, **kwargs):
#         return self

#     def transform(self, X, y=None):
#         # Find all non-hard drug users who use casual drugs
#         X['casual_drug_user'] = np.any([(X[label] == 1) for label in self.several_labels], axis=0) & ~X['hard_drug_user']
#         return X


class IsCollegeEducated():
    """
    Flag cells that hold a college-level education label.

    ``transform`` returns ``X.isin(...)`` for the labels "Bachelor's",
    "Master's" and "Ph.D.", i.e. a boolean result of the same shape as ``X``.

    NOTE(review): unlike the other transformers in this notebook, this class
    does not inherit from BaseEstimator / TransformerMixin.
    """
    def __init__(self):
        super().__init__()
        # Education labels that count as a completed college degree.
        self.college_education = ["Bachelor's", "Master's", "Ph.D."]

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return True wherever a value of ``X`` is one of the college degrees."""
        degree_labels = self.college_education
        return X.isin(degree_labels)

class IsHighSchoolDropout():
    """
    Flag cells that hold a "left school at 17 or younger" education label.

    ``transform`` returns ``X.isin(...)`` for the labels '<16 yrs old',
    '16 yrs old' and '17 yrs old', i.e. a boolean result of the same shape
    as ``X``.

    NOTE(review): unlike the other transformers in this notebook, this class
    does not inherit from BaseEstimator / TransformerMixin.
    """
    def __init__(self):
        super().__init__()
        # TODO: add to the assumptions that early graduations go on to become college educated
        self.dropout_years = ["<16 yrs old", "16 yrs old", "17 yrs old"]

    def fit(self, X, y=None, **kwargs):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return True wherever a value of ``X`` is one of the dropout labels."""
        dropout_labels = self.dropout_years
        return X.isin(dropout_labels)

In [236]:
# Fix the display: show every column of wide DataFrames.
# The fully qualified option name is used because the bare 'max_columns'
# pattern is ambiguous (and raises) on pandas versions that define more than
# one option ending in 'max_columns'.
pd.set_option('display.max_columns', None)
# Load in the data and fix the header names
data = pd.read_csv('drug_consumption_with_headers.data')
data = data.rename(columns=lambda x: x.strip()) # Remove extra spaces from headers
# Split off the label we are trying to predict
X, y = split_labels(data, 'Heroin')
# Split off the training set from the testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [237]:
# Drug-usage columns used as features. 'Heroin' is deliberately left out of
# this list.
drug_labels = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff',
    'Cannabis', 'Choc', 'Coke', 'Crack', 'Ecstacy',  # 'Heroin',
    'Ketamine', 'Legalh', 'LSD', 'Meth', 'Mushrooms',
    'Nicotine', 'VSA',
]

# Usage classification levels, from lowest ('CL0') to highest ('CL6'):
#   CL0  Never Used
#   CL1  Used over a Decade Ago
#   CL2  Used in Last Decade
#   CL3  Used in Last Year
#   CL4  Used in Last Month
#   CL5  Used in Last Week
#   CL6  Used in Last Day
# Levels listed here (used within the last year) are the ones treated as "user".
user_classifications = ['CL3', 'CL4', 'CL5', 'CL6']

# Raw 'Age (Real)' value -> age bracket.
label_map_age = {
    -0.95197: '18-24',
    -0.07854: '25-34',
    0.49788: '35-44',
    1.09449: '45-54',
    1.82213: '55-64',
    2.59171: '65+',
}

# Raw 'Education (Real)' value -> education level.
label_map_edu = {
    -2.43591: '<16 yrs old',
    -1.73790: '16 yrs old',
    -1.43719: '17 yrs old',
    -1.22751: '18 yrs old',
    -0.61113: 'Some college or uni., no cert. or degree',
    -0.05921: 'Prof. cert./diploma',
    0.45468: "Bachelor's",
    1.16365: "Master's",
    1.98437: "Ph.D.",
}

# Raw 'Gender (Real)' value -> binary flag.
# NOTE(review): which gender each raw value denotes is not visible here — confirm against the data set description.
label_map_gender = {
    -0.48246: 0,
    0.48246: 1,
}

# Human-readable category names, in the insertion order of the maps above.
edu_labels = [*label_map_edu.values()]
age_labels = [*label_map_age.values()]

# Column index of the untransformed training features.
all_labels = X_train.columns

In [238]:
# Personality-score columns.
labels_personality = [
    'Nscore (Real) [neuroticism]',
    'Escore (Real) [Extraversion]',
    'Oscore (Real) [Openness to experience]',
    'Ascore (Real) [Agreeableness]',
    'Cscore (Real) [Conscientiousness]',
    'Impulsive (Real)',
    'SS (Real) [sensation seeing (sic)]',
]

# Column names assigned to the transformed feature matrix. The order has to
# match the order in which the ColumnTransformer emits its outputs:
# the two engineered education flags, the one-hot education and age columns,
# gender, the drug columns, and finally the personality scores.
labels_remaining = [
    'is_college_educated',
    'is_high_school_dropout',
    *edu_labels,
    *age_labels,
    'Gender (Real)',
    *drug_labels,
    *labels_personality,
]

In [240]:
# Create a pipeline that drops any samples with a 'Semer' value that is not CL0 and then drops the 'Semer' column.
#
# Side effect: DropBadRowsTransformer removes rows from the objects passed to
# fit(), so X_train and y_train are modified in place by fit_transform() below.
# ProtectXy snapshots them so that re-fitting this same pipeline object starts
# again from the original rows.
#
# The order of the ColumnTransformer entries determines the column order of the
# output, which `labels_remaining` has to mirror.
pipeline = Pipeline(steps=[
    ('protect_xy', ProtectXy()),
    ('drop_semer_samples', DropBadRowsTransformer('Semer', 'CL0')),
    ('col_trans', ColumnTransformer(transformers=[
        ('drop_unneeded', 'drop', ['Semer', 'ID',  'Country (Real)', 'Ethnicity (Real)']),
        # Boolean flag: education level is a Bachelor's, Master's or Ph.D.
        ('edu_feature_pl_college_educated', Pipeline(steps=[
            ('convert_edu_real_values_to_human_readable_string_1', ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
            ('add_new_feature_college_educated', IsCollegeEducated()),
        ]), ['Education (Real)']),
        # Boolean flag: left school at 17 or younger.
        ('edu_features_pl_highschool_dropout', Pipeline(steps=[
            ('convert_edu_real_values_to_human_readable_string_1', ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
            ('add_new_feature_highschool_dropout', IsHighSchoolDropout()),
        ]), ['Education (Real)']),
        # One-hot columns, one per education level, in `edu_labels` order.
        ('edu_encoding_pl', Pipeline([
            ('convert_edu_real_values_to_human_readable_string_2', ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_edu)),
            ('one_hot_encoding_edu', OneHotEncoder(drop=None, categories=[edu_labels])),
        ]), ['Education (Real)']),
        # One-hot columns, one per age bracket, in `age_labels` order.
        ('age', Pipeline([
            ('convert_age_real_values_to_human_readable_string', ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_age)),
            ('one_hot_encoding_age', OneHotEncoder(drop=None, categories=[age_labels])),
        ]), ['Age (Real)']),
        ('gender', Pipeline(steps=[
            ('convert_gender_real_vals_to_binary', ConvertRealClassificationValuesToHumanReadableStrings(label_map=label_map_gender)),
        ]), ['Gender (Real)']),
        # 1 when the usage level is in `user_classifications`, else 0.
        # The selected columns and `features_to_apply_to` must be the same list,
        # so both use `drug_labels` rather than a second hard-coded copy.
        ('drug_classifications_to_users_pl', Pipeline(steps=[
            ('classification_to_num_val', ClassifierToNumericalValueTransformer(features_to_apply_to=drug_labels, classifications_kept=user_classifications))]),
        drug_labels),
    ], remainder='passthrough')),
])
X_trans = pd.DataFrame(pipeline.fit_transform(X_train, y_train), columns=labels_remaining)
X_trans

Unnamed: 0,is_college_educated,is_high_school_dropout,<16 yrs old,16 yrs old,17 yrs old,18 yrs old,"Some college or uni., no cert. or degree",Prof. cert./diploma,Bachelor's,Master's,Ph.D.,18-24,25-34,35-44,45-54,55-64,65+,Gender (Real),Alcohol,Amphet,Amyl,Benzos,Caff,Cannabis,Choc,Coke,Crack,Ecstacy,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,VSA,Nscore (Real) [neuroticism],Escore (Real) [Extraversion],Oscore (Real) [Openness to experience],Ascore (Real) [Agreeableness],Cscore (Real) [Conscientiousness],Impulsive (Real),SS (Real) [sensation seeing (sic)]
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.05188,-1.23177,0.29338,1.11406,0.25953,0.88113,0.76540
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.52135,0.32197,0.72330,0.13136,1.13407,-0.21712,-0.52593
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.24649,0.00332,1.06238,0.59042,2.04506,-0.71126,-0.52593
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.05188,1.11406,0.88309,0.13136,-0.00665,0.52975,0.07987
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.28554,-1.23177,-0.01928,-2.53830,-2.90161,-0.21712,1.22470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.62967,-0.69509,0.14143,-1.21213,-0.27607,-0.21712,-0.21575
1497,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.34799,0.00332,0.58331,-0.15487,-0.52745,0.19268,1.22470
1498,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.04257,-0.15487,-0.17779,-0.60633,-0.40581,1.86203,1.22470
1499,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.13606,0.16767,-0.45174,0.43852,0.12331,-0.71126,0.40148


In [241]:
# TensorFlow / Keras are first needed in this cell.
import tensorflow as tf
from tensorflow import keras

# Convert a copy of the transformed features to a tensor, leaving `X_trans`
# itself untouched. Note that this rebinds `data`, which previously held the
# raw CSV contents.
data = X_trans.copy()
tensor = tf.convert_to_tensor(data)

2022-05-02 18:53:22.383186: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [242]:
# Create the model using 2 hidden layers with 300 and 100 neurons using ReLU.
# The input shape is taken from a single sample (one row) of `tensor`.
# NOTE(review): the output layer has 10 softmax units, while the usage levels
# documented earlier in this notebook run from CL0 to CL6 — confirm that 10 is
# the intended number of target classes.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=tensor[0].shape, name="input"),
    keras.layers.Dense(300, activation=keras.activations.relu, name="hidden-1"),
    keras.layers.Dense(100, activation=keras.activations.relu, name="hidden-2"),
    keras.layers.Dense(10, activation=keras.activations.softmax, name="output")
])