# Cleaning

In [21]:
import numpy as np
import pandas as pd
# Load the input table; the path is relative to the notebook's working directory.
data = pd.read_csv("import.csv")

In [22]:
# Drop the "Unnamed: 0" column before analysis. errors="ignore" keeps this cell
# re-runnable: `data` is rebound here, so a second execution would otherwise
# raise KeyError on the already-removed column.
data = data.drop(columns="Unnamed: 0", errors="ignore")
# Rows flagged adult_depression == 1.0 form the adult subsample.
adult = data.loc[data['adult_depression'] == 1.0]

In [23]:
# cleaning adult df
# Exclude rows answering 97 on think/plan, or 97/94 on attempt.
# NOTE(review): 94 is not screened for adult_think / adult_plan — confirm this
# is intended (presumably those codes do not occur there; verify in the codebook).
# .copy() makes adult_clean an independent frame, so the recoding below does not
# write into a slice of `adult` (which raises SettingWithCopyWarning).
adult_clean = adult.loc[
    adult['adult_think'].ne(97)
    & adult['adult_plan'].ne(97)
    & ~adult['adult_attempt'].isin([97, 94])
].copy()
# Recode answers to binary flags: 2 -> 0 everywhere, and 99 -> 0 for plan/attempt.
adult_clean['adult_think'] = adult_clean['adult_think'].replace({2: 0})
adult_clean['adult_plan'] = adult_clean['adult_plan'].replace({2: 0, 99: 0})
adult_clean['adult_attempt'] = adult_clean['adult_attempt'].replace({2: 0, 99: 0})
# Sanity check: only 0 and 1 should remain.
adult_clean['adult_think'].value_counts()

0    1992
1    1939
Name: adult_think, dtype: int64

In [24]:
# Distribution of the recoded adult_plan flag.
adult_clean['adult_plan'].value_counts()

0    3173
1     758
Name: adult_plan, dtype: int64

In [25]:
# Distribution of the recoded adult_attempt flag.
adult_clean['adult_attempt'].value_counts()

0    3456
1     475
Name: adult_attempt, dtype: int64

In [26]:
# Remove the youth-specific and depression-flag columns, then give the adult
# outcome columns the shared names think / plan / attempt.
adult_clean = (
    adult_clean
    .drop(columns=['youth_think', 'youth_plan', 'youth_attempt',
                   'adult_depression', 'youth_depression'])
    .rename(columns={"adult_think": "think",
                     "adult_plan": "plan",
                     "adult_attempt": "attempt"})
)

In [27]:
# cleaning adolescent df (answer 97 or 94)
# Rows flagged youth_depression == 1.0 form the adolescent subsample.
adolescent = data.loc[data['youth_depression'] == 1.0]
# Exclude rows answering 97/94 on think or plan, or 97 on attempt.
# NOTE(review): youth_attempt is screened for 97 only, not 94 — confirm intended.
# .copy() yields an independent frame so later in-place recoding does not write
# into a slice of `adolescent` (SettingWithCopyWarning).
adolescent_clean = adolescent.loc[
    ~adolescent['youth_think'].isin([97, 94])
    & ~adolescent['youth_plan'].isin([97, 94])
    & adolescent['youth_attempt'].ne(97)
].copy()

In [28]:
# Recode on a fresh frame (assign returns a new DataFrame) instead of writing
# into a filtered slice, which raises SettingWithCopyWarning.
#   marital_status: 99 -> 4
#   youth_think:     2 -> 0
#   youth_plan:      2 -> 0, 99 -> 0
#   youth_attempt:   2 -> 0, 99 -> 0
adolescent_clean = adolescent_clean.assign(
    marital_status=adolescent_clean['marital_status'].replace({99: 4}),
    youth_think=adolescent_clean['youth_think'].replace({2: 0}),
    youth_plan=adolescent_clean['youth_plan'].replace({2: 0, 99: 0}),
    youth_attempt=adolescent_clean['youth_attempt'].replace({2: 0, 99: 0}),
)
# Sanity check: only 0 and 1 should remain.
adolescent_clean['youth_think'].value_counts()

1    1193
0     631
Name: youth_think, dtype: int64

In [29]:
# Distribution of the recoded youth_plan flag.
adolescent_clean['youth_plan'].value_counts()

0    1214
1     610
Name: youth_plan, dtype: int64

In [30]:
# Distribution of the recoded youth_attempt flag.
adolescent_clean['youth_attempt'].value_counts()

0    1401
1     423
Name: youth_attempt, dtype: int64

In [31]:
# Remove the adult-specific and depression-flag columns, then give the youth
# outcome columns the shared names think / plan / attempt.
adolescent_clean = (
    adolescent_clean
    .drop(columns=['adult_think', 'adult_plan', 'adult_attempt',
                   'adult_depression', 'youth_depression'])
    .rename(columns={"youth_think": "think",
                     "youth_plan": "plan",
                     "youth_attempt": "attempt"})
)

In [32]:
# Stack the adolescent and adult frames (adolescents first), recode gender
# 2 -> 0, and write the combined table to disk.
df = pd.concat([adolescent_clean, adult_clean])
df['gender'] = df['gender'].replace({2: 0})
# NOTE(review): the row index is written too, so re-reading this file yields an
# extra unnamed first column — confirm downstream readers expect that.
df.to_csv("CleanData.csv")

# Demographics

In [75]:
# Share of rows per age_group. Divide by the actual row count instead of a
# hard-coded 5755 so the proportions stay correct if the cleaning changes.
df["age_group"].value_counts() / len(df)

2    0.318679
1    0.316942
3    0.146829
4    0.141964
5    0.075586
Name: age_group, dtype: float64

In [76]:
# Share of rows per gender code, using the actual row count rather than a
# hard-coded 5755.
df["gender"].value_counts() / len(df)

0    0.674891
1    0.325109
Name: gender, dtype: float64

In [77]:
# Share of rows per education code, using the actual row count rather than a
# hard-coded 5755.
df["education"].value_counts() / len(df)

5    0.316942
3    0.288097
2    0.169418
4    0.152911
1    0.072632
Name: education, dtype: float64

In [78]:
# Share of rows per total_fam_income code, using the actual row count rather
# than a hard-coded 5755.
df["total_fam_income"].value_counts() / len(df)

7    0.324414
6    0.153953
2    0.122502
3    0.110165
4    0.100608
5    0.098002
1    0.090356
Name: total_fam_income, dtype: float64

In [79]:
# Share of rows per marital_status code, using the actual row count rather
# than a hard-coded 5755.
df["marital_status"].value_counts() / len(df)

4    0.732233
1    0.169939
3    0.083927
2    0.013901
Name: marital_status, dtype: float64

# Substance use disorder

In [84]:
# Column totals for nicotine..psychotherapeutic divided by the number of rows
# in df (previously a hard-coded 5755, which silently goes stale if the
# cleaning steps change).
df.loc[:, 'nicotine':'psychotherapeutic'].sum() / len(df)

nicotine             0.119896
alcohol              0.082884
marijuana            0.052997
cocaine              0.008862
heroin               0.006255
hallucinogen         0.003301
inhalant             0.001216
methamphetamine      0.013206
pain_reliever        0.018245
tranquilizer         0.010773
stimulant            0.011816
psychotherapeutic    0.033884
dtype: float64

In [19]:
# Per-age-group column totals; df_group is reused by later cells.
df_group = df.groupby('age_group').sum()
# Keep only the substance columns (label slice, both ends inclusive).
df_group_drug = df_group.loc[:, slice('nicotine', 'psychotherapeutic')]
df_group_drug

Unnamed: 0_level_0,nicotine,alcohol,marijuana,cocaine,heroin,hallucinogen,inhalant,methamphetamine,pain_reliever,tranquilizer,stimulant,psychotherapeutic
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,35,47,60,1,1,3,3,1,23,10,16,37
2,197,199,172,23,15,7,2,24,32,27,29,73
3,176,102,49,13,13,4,1,21,22,10,11,40
4,196,93,22,10,6,4,0,23,17,10,9,32
5,86,36,2,4,1,1,1,7,11,5,3,13


In [30]:
# drug use - adolescents: substance columns only; this cell shows column totals
substance_cols = slice('nicotine', 'psychotherapeutic')
adolescent_drug = adolescent_clean.loc[:, substance_cols]
del substance_cols  # helper only; keep the notebook namespace unchanged
adolescent_drug.sum()

nicotine             35
alcohol              47
marijuana            60
cocaine               1
heroin                1
hallucinogen          3
inhalant              3
methamphetamine       1
pain_reliever        23
tranquilizer         10
stimulant            16
psychotherapeutic    37
dtype: int64

In [34]:
# Per-column mean over non-missing rows, i.e. sum() / count().
adolescent_drug.mean()

nicotine             0.019189
alcohol              0.025768
marijuana            0.032895
cocaine              0.000548
heroin               0.000548
hallucinogen         0.001645
inhalant             0.001645
methamphetamine      0.000548
pain_reliever        0.012610
tranquilizer         0.005482
stimulant            0.008772
psychotherapeutic    0.020285
dtype: float64

In [36]:
# Number of non-missing values per substance column (the denominator used for
# the adolescent proportions).
adolescent_drug.count()

nicotine             1824
alcohol              1824
marijuana            1824
cocaine              1824
heroin               1824
hallucinogen         1824
inhalant             1824
methamphetamine      1824
pain_reliever        1824
tranquilizer         1824
stimulant            1824
psychotherapeutic    1824
dtype: int64

In [21]:
# drug use percentage - adults
adult_drug = adult_clean.loc[:, slice('nicotine', 'psychotherapeutic')]
# Per-column mean over non-missing rows, i.e. sum() / count().
adult_drug.mean()

nicotine             0.166624
alcohol              0.109387
marijuana            0.062325
cocaine              0.012719
heroin               0.008904
hallucinogen         0.004070
inhalant             0.001018
methamphetamine      0.019079
pain_reliever        0.020860
tranquilizer         0.013228
stimulant            0.013228
psychotherapeutic    0.040193
dtype: float64

In [23]:
# drug use percentage - adults (18-25)

# get_group selects the one age group directly instead of materialising every
# group into a dict just to index a single key; a missing key still raises KeyError.
age_group_2 = adult_clean.groupby('age_group').get_group(2)
age_group_2_drug = age_group_2.loc[:, 'nicotine':'psychotherapeutic']
# Per-column mean over non-missing rows, i.e. sum() / count().
age_group_2_drug.mean()

nicotine             0.107415
alcohol              0.108506
marijuana            0.093784
cocaine              0.012541
heroin               0.008179
hallucinogen         0.003817
inhalant             0.001091
methamphetamine      0.013086
pain_reliever        0.017448
tranquilizer         0.014722
stimulant            0.015812
psychotherapeutic    0.039804
dtype: float64

In [24]:
# drug use percentage - adults (26-34)

# get_group selects the one age group directly instead of materialising every
# group into a dict just to index a single key; a missing key still raises KeyError.
age_group_3 = adult_clean.groupby('age_group').get_group(3)
age_group_3_drug = age_group_3.loc[:, 'nicotine':'psychotherapeutic']
# Per-column mean over non-missing rows, i.e. sum() / count().
age_group_3_drug.mean()

nicotine             0.208284
alcohol              0.120710
marijuana            0.057988
cocaine              0.015385
heroin               0.015385
hallucinogen         0.004734
inhalant             0.001183
methamphetamine      0.024852
pain_reliever        0.026036
tranquilizer         0.011834
stimulant            0.013018
psychotherapeutic    0.047337
dtype: float64

In [25]:
# drug use percentage - adults (35-49)

# get_group selects the one age group directly instead of materialising every
# group into a dict just to index a single key; a missing key still raises KeyError.
age_group_4 = adult_clean.groupby('age_group').get_group(4)
age_group_4_drug = age_group_4.loc[:, 'nicotine':'psychotherapeutic']
# Per-column mean over non-missing rows, i.e. sum() / count().
age_group_4_drug.mean()

nicotine             0.239902
alcohol              0.113831
marijuana            0.026928
cocaine              0.012240
heroin               0.007344
hallucinogen         0.004896
inhalant             0.000000
methamphetamine      0.028152
pain_reliever        0.020808
tranquilizer         0.012240
stimulant            0.011016
psychotherapeutic    0.039168
dtype: float64

In [26]:
# drug use percentage - adults (50+)

# get_group selects the one age group directly instead of materialising every
# group into a dict just to index a single key; a missing key still raises KeyError.
age_group_5 = adult_clean.groupby('age_group').get_group(5)
age_group_5_drug = age_group_5.loc[:, 'nicotine':'psychotherapeutic']
# Per-column mean over non-missing rows, i.e. sum() / count().
age_group_5_drug.mean()

nicotine             0.197701
alcohol              0.082759
marijuana            0.004598
cocaine              0.009195
heroin               0.002299
hallucinogen         0.002299
inhalant             0.002299
methamphetamine      0.016092
pain_reliever        0.025287
tranquilizer         0.011494
stimulant            0.006897
psychotherapeutic    0.029885
dtype: float64

# Suicidal behavior

In [45]:
# Per-age-group totals for the think..attempt columns (label slice, inclusive).
df_group_suicide = df_group.loc[:, slice('think', 'attempt')]
df_group_suicide

Unnamed: 0_level_0,think,plan,attempt
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1193,610,423
2,1040,440,275
3,397,142,94
4,345,120,74
5,157,56,32


In [46]:
# Adolescent think..attempt columns; this cell shows the column totals.
adolescent_suicide = adolescent_clean.loc[:, slice('think', 'attempt')]
adolescent_suicide.sum()

think      1193
plan        610
attempt     423
dtype: int64

In [47]:
# Per-column mean over non-missing rows, i.e. sum() / count().
adolescent_suicide.mean()

think      0.654057
plan       0.334430
attempt    0.231908
dtype: float64

In [85]:
# Adult think..attempt columns and their per-column mean over non-missing rows
# (equivalent to sum() / count()).
adult_suicide = adult_clean.loc[:, slice('think', 'attempt')]
adult_suicide.mean()

think      0.493259
plan       0.192826
attempt    0.120834
dtype: float64

In [60]:
# think..attempt columns for age group 2 and their per-column mean over
# non-missing rows (equivalent to sum() / count()).
age_group_2_suicide = age_group_2.loc[:, slice('think', 'attempt')]
age_group_2_suicide.mean()

think      0.567067
plan       0.239913
attempt    0.149945
dtype: float64

In [59]:
# think..attempt columns for age group 3 and their per-column mean over
# non-missing rows (equivalent to sum() / count()).
age_group_3_suicide = age_group_3.loc[:, slice('think', 'attempt')]
age_group_3_suicide.mean()

think      0.469822
plan       0.168047
attempt    0.111243
dtype: float64

In [58]:
# think..attempt columns for age group 4 and their per-column mean over
# non-missing rows (equivalent to sum() / count()).
age_group_4_suicide = age_group_4.loc[:, slice('think', 'attempt')]
age_group_4_suicide.mean()

think      0.422277
plan       0.146879
attempt    0.090575
dtype: float64

In [61]:
# think..attempt columns for age group 5 and their per-column mean over
# non-missing rows (equivalent to sum() / count()).
age_group_5_suicide = age_group_5.loc[:, slice('think', 'attempt')]
age_group_5_suicide.mean()

think      0.360920
plan       0.128736
attempt    0.073563
dtype: float64