# Pre-Processing

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load and create dataframe to pre-process

In [2]:
# Read the drug-consumption CSV; the file's first column is used as the row index.
df = pd.read_csv(
    'drug_consumption_pp.csv',
    index_col=0,
)

In [3]:
# Build the working frame without the 'ID' column. drop() returns a new
# frame, so `df` still holds the data as loaded.
data = df.drop('ID', axis=1)

### Create dummy variables for categorical features

In [4]:
# Positions of the first ('Alcohol') and last ('VSA') substance columns;
# every column between them, inclusive, goes into the `drugs` list.
idx_alcohol = data.columns.get_loc('Alcohol')
idx_vsa = data.columns.get_loc('VSA')

# Slice the column index directly; +1 because a positional slice excludes
# its upper bound.
drugs = data.columns[idx_alcohol:idx_vsa + 1].tolist()
print(drugs)

['Alcohol', 'Amphetamines', 'Amyl', 'Benzos', 'Caffeine', 'Cannabis', 'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs', 'LSD', 'Methadone', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']


In [5]:
# Encode the two-level categoricals as integers.
# Each mapping dict spells out its label -> code pairs, and the explicit
# astype('int64') pins the resulting dtype instead of depending on replace()
# to downcast the object columns implicitly (pandas has deprecated that
# downcast). The cast also raises if any label outside the mapping is
# present, rather than leaving a stray string in the column. Using replace()
# keeps the cell safe to re-run: already-encoded 0/1 values pass through.
gender_codes = {'Female': 0, 'Male': 1}
usage_codes = {'Non-user': 0, 'User': 1}

data['Gender'] = data['Gender'].replace(gender_codes).astype('int64')
data[drugs] = data[drugs].replace(usage_codes).astype('int64')

# Show every column when a frame is displayed (global pandas display option).
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,Age,Gender,Education,Country,Neuroticism,Extraversion,Openness_to_Experience,Agreeableness,Conscientiousness,Impulsiveness,Sensation-Seeking,Alcohol,Amphetamines,Amyl,Benzos,Caffeine,Cannabis,Chocolate,Cocaine,Crack,Ecstasy,Heroin,Ketamine,Legal Highs,LSD,Methadone,Mushrooms,Nicotine,Semer,VSA
0,35-44,0,Professional certificate/diploma,UK,39.0,36.0,42.0,37.0,42.0,-0.21712,-1.18084,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,25-34,1,Doctorate,UK,29.0,52.0,55.0,48.0,41.0,-0.71126,-0.21575,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,1,0,0
2,35-44,1,Professional certificate/diploma,UK,31.0,45.0,40.0,32.0,34.0,-1.37983,0.40148,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,18-24,0,Masters,UK,34.0,34.0,46.0,47.0,46.0,-1.37983,-1.18084,1,0,0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0
4,35-44,0,Doctorate,UK,43.0,28.0,43.0,41.0,50.0,-0.21712,-0.21575,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0


In [6]:
# One-hot encode every column that is still object-dtype.
dfo = data.select_dtypes(include='object')

# drop_first=True omits the first level of each feature (k levels -> k-1
# columns). dtype=int keeps the indicators as 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce boolean columns.
dummy_cols = pd.get_dummies(dfo, drop_first=True, dtype=int)

# Remove the original categorical columns by label and append the indicator
# columns on the right.
data = pd.concat([data.drop(columns=dfo.columns), dummy_cols], axis=1)
data.head()

Unnamed: 0,Gender,Neuroticism,Extraversion,Openness_to_Experience,Agreeableness,Conscientiousness,Impulsiveness,Sensation-Seeking,Alcohol,Amphetamines,Amyl,Benzos,Caffeine,Cannabis,Chocolate,Cocaine,Crack,Ecstasy,Heroin,Ketamine,Legal Highs,LSD,Methadone,Mushrooms,Nicotine,Semer,VSA,Age_25-34,Age_35-44,Age_45-54,Age_55-64,Age_65+,Education_Doctorate,Education_Left school at 16,Education_Left school at 17,Education_Left school at 18,Education_Left school before 16,Education_Masters,Education_Professional certificate/diploma,Education_Some college/university,Country_Canada,Country_New Zealand,Country_Other,Country_Republic of Ireland,Country_UK,Country_USA
0,0,39.0,36.0,42.0,37.0,42.0,-0.21712,-1.18084,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,1,29.0,52.0,55.0,48.0,41.0,-0.71126,-0.21575,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,31.0,45.0,40.0,32.0,34.0,-1.37983,0.40148,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,0,34.0,34.0,46.0,47.0,46.0,-1.37983,-1.18084,1,0,0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,0,43.0,28.0,43.0,41.0,50.0,-0.21712,-0.21575,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


### Standardize and split data into test/training sets

In [7]:
# Features are every column except the target; the target is the 0/1
# 'Nicotine' column.
X = data.drop(columns=['Nicotine'])

# to_numpy() gives the 1-D ndarray directly. Series.ravel() is deprecated in
# pandas 2.2 and was only serving as this conversion.
y = data['Nicotine'].to_numpy()

# Standardize each feature column to zero mean and unit variance.
# NOTE(review): this scaler is fit on all rows, so its mean/std include rows
# that a later train/test split holds out.
SS = StandardScaler()
X_scaled = SS.fit_transform(X)

In [8]:
# Split the unscaled features first, then fit the scaler on the training rows
# only, so the held-out rows contribute nothing to the mean/std used for
# scaling (fitting on all rows before splitting leaks test-set statistics).
# random_state fixes the shuffle, which keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Rebind SS to the training-fit scaler so that any later SS.transform(...)
# call is consistent with X_train / X_test.
SS = StandardScaler()
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)