In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Sklearn functions and models
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

## Notebook that creates a dataset with effects instead of substances

In this notebook we assign a score to each drug based on the effect it produces on the body. We then multiply each substance column of the dataframe by its score and compute the sum over each group of substances. In this way we create a new column for each effect, with a score for every sample in our dataframe.

We identify 3 effects: depressants ('alcohol', 'benzo', 'cannabis', 'ketamine', 'heroine'), hallucinogens ('cannabis', 'ketamine', 'lsd', 'mushrooms'), and stimulants ('anphet', 'cocaine', 'ecstasy', 'meth').
([source](https://www.health.gov.au/health-topics/drugs/about-drugs/types-of-drugs))


In [3]:
# Load the pre-processed dataset and discard the two identifier columns
# (the saved index column and the participant id), which are not features.
data = (
    pd.read_csv('../data_processed/data_dummy_variable_etnicity_country_gender.csv')
    .drop(columns=['Unnamed: 0', 'id_number'])
)
display(data)

Unnamed: 0,age,education,n_score,e_score,o_score,a_score,c_score,impulsiveness,ss,alcohol,...,white,other2,australia,canada,new_zealand,ireland,uk,usa,male,female
0,0.49788,-0.05921,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,5,...,0,0,0,0,0,0,1,0,0,1
1,-0.07854,1.98437,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,5,...,1,0,0,0,0,0,1,0,1,0
2,0.49788,-0.05921,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,-1.37983,0.40148,6,...,1,0,0,0,0,0,1,0,1,0
3,-0.95197,1.16365,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,4,...,1,0,0,0,0,0,1,0,0,1
4,0.49788,1.98437,0.73545,-1.63340,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,4,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,-0.95197,-0.61113,-1.19430,1.74091,1.88511,0.76096,-1.13788,0.88113,1.92173,5,...,1,0,0,0,0,0,0,1,0,1
1881,-0.95197,-0.61113,-0.24649,1.74091,0.58331,0.76096,-1.51840,0.88113,0.76540,5,...,1,0,0,0,0,0,0,1,1,0
1882,-0.07854,0.45468,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,0.52975,-0.52593,4,...,1,0,0,0,0,0,0,1,0,1
1883,-0.95197,-0.61113,0.91093,-1.92173,0.29338,-1.62090,-2.57309,1.29221,1.22470,5,...,1,0,0,0,0,0,0,1,0,1


# Strategy
Create a function that converts `name_substances` into a column to append to the dataframe.

In [13]:
# Substance groups, one per effect. Each group is a list of
# (substance column name, weight) tuples.

# Depressants, alcohol included.
depressants = [
    ('alcohol', 0.08),
    ('benzo', 0.24),
    ('cannabis', 0.08),
    ('ketamine', 0.18),
    ('heroine', 0.42),
]

# Depressants without alcohol, with their own set of weights.
depressants_no_wine = [
    ('benzo', 0.26),
    ('cannabis', 0.06),
    ('ketamine', 0.21),
    ('heroine', 0.47),
]

# NOTE(review): unlike the other groups, whose weights sum to 1.0, these
# weights sum to 1.36 -- confirm this is intended.
hallucinogens = [
    ('cannabis', 0.09),
    ('ketamine', 0.3),
    ('lsd', 0.36),
    ('mushrooms', 0.61),
]

stimulants = [
    ('anphet', 0.2),
    ('cocaine', 0.3),
    ('ecstasy', 0.2),
    ('meth', 0.3),
]



In [7]:
# Sanity check: the weights used for depressants_no_wine add up to 1.
sum([0.26, 0.06,0.21, 0.47])
# leftover scratch calculation:
#0.41/0.87

1.0

In [9]:
# Helper that converts a group of (substance, score) pairs into a single
# weighted-sum column that can be appended to the dataset.

def column_group(name_substances, data):
    '''
    Compute the weighted sum of a group of substance columns.

    input
    name_substances : iterable of tuples. Each tuple is made of 2 elements: the first one is the
                    name of the substance (a column of data) and the second is its score.
    data : pandas dataframe that has the substances as columns.

    output
    column : pandas Series with the same length and index as data. Each element is the sum over
             the substance columns, each multiplied by the score assigned to that substance.
             An empty name_substances yields a column of zeros.

    raises
    KeyError : if one or more substance names are not columns of data.
    '''
    # Materialise once so a one-shot iterator (e.g. a bare zip) can be both
    # validated and summed.
    pairs = list(name_substances)

    missing = [name for name, _ in pairs if name not in data]
    if missing:
        raise KeyError(f"substances not found in data columns: {missing}")

    # Start from an explicit zero column aligned to data, so that an empty
    # group returns zeros instead of failing on the first element.
    column = pd.Series(0.0, index=data.index)
    for name, score in pairs:
        column = column + data[name] * score

    return column
    
# Build one weighted-score column per effect group, in a single pass.
(column_depressant,
 column_stimuants,
 column_hallucinogens,
 column_depressants_no_wine) = (
    column_group(group, data)
    for group in (depressants, stimulants, hallucinogens, depressants_no_wine)
)

# Quick look at one of the resulting columns before assembling the dataframe.
print(column_depressants_no_wine)

0       0.52
1       0.66
2       0.18
3       1.32
4       0.18
        ... 
1880    0.30
1881    0.18
1882    2.08
1883    0.36
1884    0.96
Length: 1885, dtype: float64


In [10]:
# Create the dataframe to export.

# Gather the per-effect score columns into a single frame.
dictionary_dataframe = {'depressant': column_depressant,
                        'depressant_no_wine': column_depressants_no_wine,
                        'stimulants': column_stimuants,
                        'hallucinogens': column_hallucinogens}
df_to_append = pd.DataFrame(data=dictionary_dataframe)
display(df_to_append)

# Drop the raw substance columns; `drop` returns a new frame, so `data`
# itself is left untouched.
substances = ['alcohol', 'anphet', 'amyl', 'benzo', 'caffeine', 'cannabis',
              'chocolate', 'cocaine', 'crack', 'ecstasy', 'heroine', 'ketamine',
              'legal_h', 'lsd', 'meth', 'mushrooms', 'nicotine', 'semer', 'vsa']
new_df = data.drop(columns=substances)
display(new_df)

# Put the effect scores next to the remaining features.
final_df = pd.concat([new_df, df_to_append], axis=1)
display(final_df)

# Export the dataframe.
# The path is built with os.path.join instead of a backslash literal: '\d' is
# an invalid escape sequence, and backslash separators only work on Windows
# (elsewhere they end up as part of the file name).
# NOTE(review): the input was read from '../data_processed' -- confirm that
# './data_processed' is really the intended output location.
output_dir = os.path.join('.', 'data_processed')
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
# The index is written as an unnamed first column (to_csv default).
final_df.to_csv(os.path.join(output_dir, 'data_group_by_different_effects.csv'))

Unnamed: 0,depressant,depressant_no_wine,stimulants,hallucinogens
0,0.88,0.52,0.4,0.00
1,1.08,0.66,3.0,1.68
2,0.72,0.18,0.0,0.88
3,1.56,1.32,0.6,0.78
4,0.56,0.18,0.4,1.49
...,...,...,...,...
1880,0.80,0.30,0.0,1.53
1881,0.64,0.18,1.6,4.51
1882,2.36,2.08,3.2,3.08
1883,0.88,0.36,0.6,3.45


Unnamed: 0,age,education,n_score,e_score,o_score,a_score,c_score,impulsiveness,ss,asian,...,white,other2,australia,canada,new_zealand,ireland,uk,usa,male,female
0,0.49788,-0.05921,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,0,...,0,0,0,0,0,0,1,0,0,1
1,-0.07854,1.98437,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,0,...,1,0,0,0,0,0,1,0,1,0
2,0.49788,-0.05921,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,-1.37983,0.40148,0,...,1,0,0,0,0,0,1,0,1,0
3,-0.95197,1.16365,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,0,...,1,0,0,0,0,0,1,0,0,1
4,0.49788,1.98437,0.73545,-1.63340,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,0,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,-0.95197,-0.61113,-1.19430,1.74091,1.88511,0.76096,-1.13788,0.88113,1.92173,0,...,1,0,0,0,0,0,0,1,0,1
1881,-0.95197,-0.61113,-0.24649,1.74091,0.58331,0.76096,-1.51840,0.88113,0.76540,0,...,1,0,0,0,0,0,0,1,1,0
1882,-0.07854,0.45468,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,0.52975,-0.52593,0,...,1,0,0,0,0,0,0,1,0,1
1883,-0.95197,-0.61113,0.91093,-1.92173,0.29338,-1.62090,-2.57309,1.29221,1.22470,0,...,1,0,0,0,0,0,0,1,0,1


Unnamed: 0,age,education,n_score,e_score,o_score,a_score,c_score,impulsiveness,ss,asian,...,new_zealand,ireland,uk,usa,male,female,depressant,depressant_no_wine,stimulants,hallucinogens
0,0.49788,-0.05921,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,0,...,0,0,1,0,0,1,0.88,0.52,0.4,0.00
1,-0.07854,1.98437,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,0,...,0,0,1,0,1,0,1.08,0.66,3.0,1.68
2,0.49788,-0.05921,-0.46725,0.80523,-0.84732,-1.62090,-1.01450,-1.37983,0.40148,0,...,0,0,1,0,1,0,0.72,0.18,0.0,0.88
3,-0.95197,1.16365,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,0,...,0,0,1,0,0,1,1.56,1.32,0.6,0.78
4,0.49788,1.98437,0.73545,-1.63340,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,0,...,0,0,1,0,0,1,0.56,0.18,0.4,1.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,-0.95197,-0.61113,-1.19430,1.74091,1.88511,0.76096,-1.13788,0.88113,1.92173,0,...,0,0,0,1,0,1,0.80,0.30,0.0,1.53
1881,-0.95197,-0.61113,-0.24649,1.74091,0.58331,0.76096,-1.51840,0.88113,0.76540,0,...,0,0,0,1,1,0,0.64,0.18,1.6,4.51
1882,-0.07854,0.45468,1.13281,-1.37639,-1.27553,-1.77200,-1.38502,0.52975,-0.52593,0,...,0,0,0,1,0,1,2.36,2.08,3.2,3.08
1883,-0.95197,-0.61113,0.91093,-1.92173,0.29338,-1.62090,-2.57309,1.29221,1.22470,0,...,0,0,0,1,0,1,0.88,0.36,0.6,3.45


In [11]:
# Contribution of alcohol to the depressant score: the score computed with
# alcohol minus the score computed without it.
with_alcohol = df_to_append['depressant']
without_alcohol = df_to_append['depressant_no_wine']
df_to_append['difference'] = with_alcohol.sub(without_alcohol)
df_to_append

Unnamed: 0,depressant,depressant_no_wine,stimulants,hallucinogens,difference
0,0.88,0.52,0.4,0.00,0.36
1,1.08,0.66,3.0,1.68,0.42
2,0.72,0.18,0.0,0.88,0.54
3,1.56,1.32,0.6,0.78,0.24
4,0.56,0.18,0.4,1.49,0.38
...,...,...,...,...,...
1880,0.80,0.30,0.0,1.53,0.50
1881,0.64,0.18,1.6,4.51,0.46
1882,2.36,2.08,3.2,3.08,0.28
1883,0.88,0.36,0.6,3.45,0.52
