In [25]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [22]:
# Load the drug-consumption dataset from the working directory; later cells add
# engineered feature columns to this same `df`.
df = pd.read_csv(filepath_or_buffer="drug_consumption.csv")

# Drug classes follow The Drugs Wheel (UK & Ireland DrugWatch): https://www.thedrugswheel.com/
# Each list holds the names of the `df` columns that belong to that class.
dissociatives = ["ketamine"]
depressants = ["alcohol", "benzos"]
# NOTE(review): 'meth' is presumably methadone (an opioid) in this dataset rather
# than methamphetamine -- confirm against the data dictionary.
opioids = ["heroin", "meth"]
cannabinoids = ["cannabis"]
stimulants = [
    "amphet",
    "caff",
    "coke",
    "crack",
    "nicotine",
]
empathogens = ["ecstasy"]
psychedelics = ["lsd", "mushrooms"]

# Kept as a separate class so that VSA & amyl nitrite are included
inhalants = ["amyl", "vsa"]

# Chocolate & legal highs are deliberately EXCLUDED: the goal is to predict for
# potential harm reduction planning/diagnostics

In [23]:
# Map each usage label CL0..CL6 to an exponential weight 2**i (1, 2, 4, ..., 64).
# A NON-LINEAR (exponential) scale is used to emphasize recency of use,
# as it's more relevant for immediate harm reduction or diagnostics
# (rather than focusing on past/experimental use).
recency_map = {f'CL{i}': 2**i for i in range(7)}

# Every drug column that feeds the engineered features, in class order.
drugs = dissociatives + depressants + opioids + cannabinoids + stimulants + empathogens + psychedelics + inhalants

# Build the weighted frame in one pass instead of growing an empty DataFrame
# column by column; the result keeps `df`'s index and the `drugs` column order.
weighted_consumption = df[drugs].apply(lambda usage: usage.map(recency_map))

# Series.map silently yields NaN for any label missing from recency_map (typos,
# stray whitespace, missing values). Fail here with the offending columns rather
# than letting NaN reach the PCA / scaling cell.
unmapped = weighted_consumption.columns[weighted_consumption.isna().any()].tolist()
if unmapped:
    raise ValueError(
        f"Usage labels outside CL0-CL6 (or missing values) found in columns: {unmapped}"
    )

In [None]:
# Principal Component Analysis (PCA) to create new features for the drug classes:
# each multi-drug class is collapsed into its first principal component.
pca_groups = {
    'depressants': depressants,
    'opioids': opioids,
    'stimulants': stimulants,
    'psychedelics': psychedelics,
    'inhalants': inhalants
}

# Classes containing a single drug need no PCA; their weighted column is used as-is.
single_drug_features = {
    'dissociatives': 'ketamine',
    'cannabinoids': 'cannabis',
    'empathogens': 'ecstasy',
}

# NOTE(review): the sign of a principal component is arbitrary, so confirm that
# higher `<class>_pca` scores mean heavier/more recent use before interpreting them.
for name, group in pca_groups.items():
    group_features = weighted_consumption[group]
    # svd_solver='full' computes the exact SVD. The default 'auto' policy can pick
    # the randomized solver (depending on scikit-learn version and data shape),
    # which is stochastic unless random_state is set.
    pca = PCA(n_components=1, svd_solver='full')
    # fit_transform returns shape (n_samples, 1); take column 0 so that a plain
    # 1-D array is stored instead of relying on pandas to squeeze it.
    df[f'{name}_pca'] = pca.fit_transform(group_features)[:, 0]

for feature, drug in single_drug_features.items():
    df[feature] = weighted_consumption[drug]

# Standardize values across drug groups, including both PCA and single-drug ones.
# The list is derived from the two mappings above so it cannot drift out of sync.
new_features = list(single_drug_features) + [f'{name}_pca' for name in pca_groups]
scaler = StandardScaler()
# New drug features are CENTERED and SCALED (mean=0, std=1), to train Neural Networks
df[new_features] = scaler.fit_transform(df[new_features])
df

Unnamed: 0,recordID,age,gender,education,country,ethnicity,n_score,e_score,o_score,a_score,...,semer,vsa,depressants_pca,opioids_pca,stimulants_pca,psychedelics_pca,inhalants_pca,dissociatives,cannabinoids,empathogens
0,1,0.49788,0.48246,-0.05921,0.96082,0.12600,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,-0.067435,-0.342814,-0.729728,-0.516843,-0.289419,-0.310983,-0.819128,-0.485586
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL0,CL0,-0.063631,0.181870,-0.291702,-0.229896,-0.120854,0.225342,-0.232570,1.154790
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.62090,...,CL0,CL0,1.472684,-0.342814,-0.848886,-0.456192,-0.289419,-0.310983,-0.545401,-0.485586
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,-0.840666,-0.342814,-0.908173,-0.516843,-0.289419,0.225342,-0.701816,-0.485586
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.63340,-0.45174,-0.30172,...,CL0,CL0,-0.831788,-0.342814,-0.739714,-0.334889,-0.233231,-0.310983,-0.545401,-0.376228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,1884,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,-1.19430,1.74091,1.88511,0.76096,...,CL0,CL5,-0.063631,-0.342814,-1.107785,0.152699,5.644934,-0.310983,0.393093,-0.485586
1881,1885,-0.95197,-0.48246,-0.61113,-0.57009,-0.31685,-0.24649,1.74091,0.58331,0.76096,...,CL0,CL0,-0.063631,0.781508,0.055031,3.358038,-0.289419,-0.310983,-0.545401,-0.157511
1882,1886,-0.07854,0.48246,0.45468,-0.57009,-0.31685,1.13281,-1.37639,-1.27553,-1.77200,...,CL0,CL0,-0.871103,-0.342814,1.699095,-0.047943,1.452417,0.225342,1.644417,1.154790
1883,1887,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.91093,-1.92173,0.29338,-1.62090,...,CL0,CL0,-0.063631,-0.342814,-0.327991,0.577257,-0.289419,-0.310983,1.644417,0.279923
