### **Dataset Processing**

In [1]:
# IMPORTS
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA

In [2]:
# LOAD DATASET
df = pd.read_csv("drug_consumption.csv")

# Lookup from column name to the CSV file describing that variable's values.
# Files are named "<column position>-<column name>.csv".
feature_file_dict = {
    column: f"additional-variable-information/{position}-{column}.csv"
    for position, column in enumerate(df.columns)
}

# Columns used as model inputs
input_features = [
    'age', 'gender', 'education', 'country', 'ethnicity',
    'n_score', 'e_score', 'o_score', 'a_score', 'c_score',
    'impulsive_bis11', 'sensation_seeking_impss',
]
# Columns used as model targets (one per substance)
target_features = [
    'alcohol', 'amphet', 'amyl', 'benzos', 'caff', 'cannabis',
    'choc', 'coke', 'crack', 'ecstasy', 'heroin', 'ketamine', 'legalh', 'lsd',
    'meth', 'mushrooms', 'nicotine', 'semer', 'vsa',
]

# Split into separate input and target frames, both indexed by recordID
record_ids = df['recordID']
df_input = df[input_features].set_index(record_ids)
df_target = df[target_features].set_index(record_ids)

# Scaler instance used by later cells to normalize data for NN training
scaler = StandardScaler()

#### **Target Features:** Drug/Substance Features

In [3]:
# GROUPING FEATURES INTO DRUG GROUPS

# Group membership follows The Drugs Wheel (UK & Ireland DrugWatch):
# https://www.thedrugswheel.com/

# Groups with several substances
stimulants = ['amphet', 'caff', 'coke', 'crack', 'nicotine']
depressants = ['alcohol', 'benzos']
# NOTE(review): 'meth' presumably denotes methadone in this dataset — confirm
# against the variable description before relying on the opioid grouping.
opioids = ['heroin', 'meth']
psychedelics = ['lsd', 'mushrooms']
# Group created by the team to hold VSA & Amyl Nitrate
inhalants = ['amyl', 'vsa']

# Groups with a single substance
cannabinoids = ['cannabis']
dissociatives = ['ketamine']
empathogens = ['ecstasy']

# NOTE: The Chocolate & Legal Highs variables are deliberately excluded on the
#       basis of relevance, as the team goal is to predict substance use for
#       harm reduction planning/diagnostics.
# NOTE(review): 'semer' is also not assigned to any group — confirm this is intended.

In [4]:
# MAP VALUES (EMPHASIZING RECENCY) TO DATASET

# Usage classes 'CL0'..'CL6' are mapped to exponentially growing weights
# (1, 2, 4, ..., 64). The NON-LINEAR scale emphasizes recency, which is more
# relevant to harm reduction than past/experimental use.
recency_map = {f'CL{level}': 2 ** level for level in range(7)}

# Every drug that belongs to one of the groups, concatenated in group order
drugs = (
    dissociatives + depressants + opioids + cannabinoids
    + stimulants + empathogens + psychedelics + inhalants
)

# Weighted 'use' value per drug, built in one step; this frame feeds the PCA.
# Any value without an entry in recency_map becomes NaN.
weighted_consumption = pd.DataFrame(
    {drug: df_target[drug].map(recency_map) for drug in drugs}
)

In [5]:
# PRINCIPAL COMPONENT ANALYSIS

# Use PCA to reduce dimensionality by 'collapsing' drugs in the same group into a single feature
pca_groups = { # groups with more than one feature
    'depressants': depressants,
    'opioids': opioids,
    'stimulants': stimulants,
    'psychedelics': psychedelics,
    'inhalants': inhalants
}
for name, group in pca_groups.items():
    pca = PCA(n_components=1)
    # fit_transform returns an (n_samples, 1) array; flatten it to a plain 1-D column
    scores = pca.fit_transform(weighted_consumption[group]).ravel()
    # The sign of a principal component is arbitrary. Orient the score so that
    # higher weighted consumption yields a higher value, consistent with the
    # single-drug columns copied below.
    if pca.components_[0].sum() < 0:
        scores = -scores
    df_target[f'{name}_pca'] = scores
# Copy over the values from groups with only ONE feature
df_target['dissociatives'] = weighted_consumption['ketamine']
df_target['cannabinoids'] = weighted_consumption['cannabis']
df_target['empathogens'] = weighted_consumption['ecstasy']
# Standardize values across drug groups, including both PCA and single-drug ones
new_features = ['dissociatives', 'cannabinoids', 'empathogens', 'depressants_pca', 'opioids_pca', 'stimulants_pca', 'psychedelics_pca', 'inhalants_pca']
# New drug features are CENTERED and SCALED (mean=0, std=1), to train Neural Networks.
# A dedicated scaler is used so the fitted target statistics remain available
# (e.g. to inverse-transform predictions); the shared `scaler` is refit on the
# input features in a later cell.
target_scaler = StandardScaler()
df_target[new_features] = target_scaler.fit_transform(df_target[new_features])
# Remove pre-grouped drug features; errors='ignore' keeps the cell re-runnable
# once those columns have already been dropped
df_target = df_target.drop(columns=target_features, errors='ignore')

#### **Input Features:** Demographic & Personality Features

In [6]:
# GROUP FEATURES INTO INPUT VARIABLE TYPES

# Categorical demographics are split by whether their categories have a
# natural order; psychometric score columns are kept as their own group.
nominal_inputs = ['gender', 'country', 'ethnicity']  # unordered categories
ordinal_inputs = ['age', 'education']                # ordered categories
# Score columns: n_score, e_score, o_score, a_score, c_score
scored_inputs = [f'{trait}_score' for trait in ('n', 'e', 'o', 'a', 'c')]

In [7]:
# CONVERT INPUT FEATURE VALUES TO MEANINGS (FOR READABILITY)

def _load_lookup(feature, target_col):
    """Return a dict from the 'Value' column to `target_col` for one feature.

    The table is read from the feature's file listed in feature_file_dict.
    """
    table = pd.read_csv(feature_file_dict[feature], usecols=['Value', target_col])
    return table.set_index('Value')[target_col].to_dict()

# Demographic features: replace each stored value with its 'Meaning' label.
# Values with no entry in the lookup become NaN.
for feature in ordinal_inputs + nominal_inputs:
    df_input[feature] = df_input[feature].map(_load_lookup(feature, 'Meaning'))

# Score features: replace each stored value with the entry from the score
# column of its file, whose name is built from the feature's first letter
# (e.g. 'n_score' -> 'Nscore').
for feature in scored_inputs:
    score_column = f"{feature[0].capitalize()}score"
    df_input[feature] = df_input[feature].map(_load_lookup(feature, score_column))

In [8]:
# ENCODE & SCALE INPUT FEATURES

# Use sklearn's OrdinalEncoder for ordinal input features.
# One category list per column of ordinal_inputs, in the same order (age, education).
# The labels must match the strings present in those columns exactly.
ordinal_categories = [
    ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'],  # Age order
    ['Left school before 16 years', 'Left school at 16 years', 'Left school at 17 years',
     'Left school at 18 years', 'Some college/university without certificate/degree',
     'Professional certificate/diploma', 'University degree', 'Masters degree', 'Doctorate degree']  # Education order
]
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
df_input[ordinal_inputs] = ordinal_encoder.fit_transform(df_input[ordinal_inputs])
# Use one-hot encoding for nominal input features.
# dtype=int pins the indicator columns to 0/1: the default dtype differs between
# pandas versions (bool since 2.0), which would otherwise make the saved output
# depend on the installed pandas.
df_input = pd.get_dummies(df_input, columns=nominal_inputs, dtype=int)
# Standardize psychometric scores by CENTERING and SCALING (mean=0, std=1)
numeric_inputs = scored_inputs + ['impulsive_bis11', 'sensation_seeking_impss']
df_input[numeric_inputs] = scaler.fit_transform(df_input[numeric_inputs])

#### **Save Processed Data**

In [9]:
# RECOMBINE INPUTS & TARGETS

# Place input and target columns side by side, aligned on their index
df_processed = pd.concat([df_input, df_target], axis="columns")

# index=False: the index (set from recordID) is not written to the output file
output_path = "processed_drug_consumption.csv"
df_processed.to_csv(output_path, index=False)