## Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
filename="marijuana_data.csv"
df = pd.read_csv(filename, index_col=0)
df.head()

Unnamed: 0,ID,GENDER,AGE,MARRIAGE,PREGNANCY,INCOME,INCOME_RATIO,HOUSEHOLD_NUM,OVER_60_NUM,EVER_USE,FIRST_USAGE,FREQUENCY,PER_MONTH,PER_YEAR,LAST_ONCE_MONTH,AMOUNT,LAST_USAGE,PAST_MONTH_USAGE,CHILDREN_NUM,EDUCATION
0,83732.0,1.0,62.0,1.0,,10.0,4.39,2.0,1.0,,,,,,,,,,1.0795209999999999e-78,5.0
1,83733.0,1.0,53.0,3.0,,4.0,1.32,1.0,5.397605e-79,2.0,,,,,,,,,1.0795209999999999e-78,3.0
2,83735.0,2.0,56.0,6.0,,10.0,5.0,1.0,5.397605e-79,2.0,,,,,,,,,1.0795209999999999e-78,5.0
3,83736.0,2.0,42.0,3.0,1.0,7.0,1.23,5.0,5.397605e-79,1.0,25.0,5.0,1.0,25.0,5.397605e-79,4.0,1.0,30.0,2.0,4.0
4,83741.0,1.0,22.0,5.0,,7.0,2.08,3.0,5.397605e-79,1.0,15.0,4.0,1.0,16.0,1.0,2.0,1.0,25.0,1.0795209999999999e-78,4.0


In [3]:
# sanity check
df.isnull().sum()

ID                     0
GENDER                 0
AGE                    0
MARRIAGE             261
PREGNANCY           3603
INCOME               168
INCOME_RATIO         466
HOUSEHOLD_NUM          0
OVER_60_NUM            0
EVER_USE            1415
FIRST_USAGE         3128
FREQUENCY           3973
PER_MONTH           3128
PER_YEAR            3973
LAST_ONCE_MONTH     3979
AMOUNT              3973
LAST_USAGE          3130
PAST_MONTH_USAGE    4297
CHILDREN_NUM           0
EDUCATION              0
dtype: int64

In [4]:
#functions for converting from numerical to categorical data
def num_to_category(num_val, data_dict):
    if num_val in data_dict.keys():
        return data_dict[num_val]
    else:
        return np.nan

# preserves range values and assigns 
def num_range_to_category(num_val, start, stop, data_dict):
    if num_val in range(start,stop):
        return str(num_val);
    else:
        return num_to_category(num_val, data_dict)
        

# GENDER
gender_dict = {1:'male',2:'female'}
# MARRIAGE
marital_status_dict = {1: 'married', 2:'widowed', 3: 'divorced', 4: 'seperated', 5: 'never_married', 6: 'with_partner', 77:'refused', 99: "unknown"}
# PREGNANCY
pregnancy_dict = {1: 'yes', 2: 'no', 3: 'cannot_ascertain'}
# EDUCATION
education_dict = {1:'<9th_grade', 2: '9-11th_grade', 3: 'HighSchool/GED', 4: 'AA', 5: 'college_graduate',7: 'refused', 9: "unknown"}
# DUQ200, DUQ211
DUQ_dict = {1: 'yes', 2: 'no', 7: 'refused', 9: "unknown"}
# DUQ210, DUQ213
age_partial_dict = {6: '<6_years', 777: 'refused', 999: "unknown"}
# DUQ217
frequency_dict = {1: 'once_month', 2:'2-3_month', 3: '4-8_month', 4:'9-24_month', 5: '25-30_month', 7: 'refused', 9: 'unknown'}
# DUQ219
amount_dict = {1: '1_per_day', 2: '2_per_day', 3: '3-5_per_day', 4: '>6_per_day', 7: 'refused', 9: 'unknown'}
# DUQ215Q, DUQ220Q, DUQ230,
time_partial_dict = {7777: 'refused', 9999: 'unknown'}

In [5]:
df['GENDER']=df['GENDER'].apply(num_to_category, data_dict=gender_dict)
#df['GENDER']

In [6]:
df['MARRIAGE']=df['MARRIAGE'].apply(num_to_category, data_dict=marital_status_dict)
#df['MARRIAGE']

In [7]:
df['EDUCATION']=df['EDUCATION'].apply(num_to_category, data_dict=education_dict)
#df['EDUCATION']

In [8]:
df['PREGNANCY']=df['PREGNANCY'].apply(num_to_category, data_dict=pregnancy_dict)

In [9]:
# DUQ200
df['EVER_USE']=df['EVER_USE'].apply(num_to_category, data_dict=DUQ_dict)
#df['EVER_USE']

In [10]:
# DUQ211
df['PER_MONTH']=df['PER_MONTH'].apply(num_to_category, data_dict=DUQ_dict)
#df['PER_MONTH']

In [11]:
# DUQ210
df['FIRST_USAGE']=df['FIRST_USAGE'].apply(num_range_to_category,start=7,stop=50,data_dict=age_partial_dict)
#df['FIRST_USAGE']

In [12]:
# DUQ213
df['PER_YEAR']=df['PER_YEAR'].apply(num_range_to_category,start=7,stop=50,data_dict=age_partial_dict)
#df['PER_YEAR']

In [13]:
# DUQ210
df['FIRST_USAGE']=df['FIRST_USAGE'].apply(num_range_to_category,start=7,stop=50,data_dict=age_partial_dict)
#df['FIRST_USAGE']

In [14]:
# DUQ217
df['FREQUENCY']=df['FREQUENCY'].apply(num_to_category, data_dict = frequency_dict)
#df['FREQUENCY']

In [15]:
# DUQ215Q
df['LAST_ONCE_MONTH']=df['LAST_ONCE_MONTH'].apply(num_range_to_category,start=0,stop=365,data_dict=time_partial_dict)
#df['LAST_ONCE_MONTH']

In [16]:
# DUQ219
df['AMOUNT']=df['AMOUNT'].apply(num_to_category, data_dict = amount_dict)
#df['AMOUNT']

In [17]:
# DUQ220
df['LAST_USAGE']=df['LAST_USAGE'].apply(num_range_to_category,start=0,stop=369,data_dict=time_partial_dict)
#df['LAST_USAGE']

In [18]:
# DUQ230
df['PAST_MONTH_USAGE']=df['PAST_MONTH_USAGE'].apply(num_range_to_category,start=0,stop=369,data_dict=time_partial_dict)
#df['PAST_MONTH_USAGE']

In [19]:
df

Unnamed: 0,ID,GENDER,AGE,MARRIAGE,PREGNANCY,INCOME,INCOME_RATIO,HOUSEHOLD_NUM,OVER_60_NUM,EVER_USE,FIRST_USAGE,FREQUENCY,PER_MONTH,PER_YEAR,LAST_ONCE_MONTH,AMOUNT,LAST_USAGE,PAST_MONTH_USAGE,CHILDREN_NUM,EDUCATION
0,83732.0,male,62.0,married,,10.0,4.390000e+00,2.0,1.000000e+00,,,,,,,,,,1.079521e-78,college_graduate
1,83733.0,male,53.0,divorced,,4.0,1.320000e+00,1.0,5.397605e-79,no,,,,,,,,,1.079521e-78,HighSchool/GED
2,83735.0,female,56.0,with_partner,,10.0,5.000000e+00,1.0,5.397605e-79,no,,,,,,,,,1.079521e-78,college_graduate
3,83736.0,female,42.0,divorced,yes,7.0,1.230000e+00,5.0,5.397605e-79,yes,,25-30_month,yes,25.0,,>6_per_day,1.0,30.0,2.000000e+00,AA
4,83741.0,male,22.0,never_married,,7.0,2.080000e+00,3.0,5.397605e-79,yes,,9-24_month,yes,16.0,1.0,2_per_day,1.0,25.0,1.079521e-78,AA
5,83742.0,female,32.0,married,no,6.0,1.030000e+00,4.0,5.397605e-79,yes,,,no,,,,13.0,,2.000000e+00,AA
6,83743.0,male,18.0,,,15.0,5.000000e+00,3.0,5.397605e-79,no,,,,,,,,,1.079521e-78,
7,83744.0,male,56.0,divorced,,3.0,1.190000e+00,1.0,5.397605e-79,yes,,,no,,,,20.0,,1.079521e-78,HighSchool/GED
8,83747.0,male,46.0,with_partner,,3.0,7.500000e-01,2.0,5.397605e-79,no,,,,,,,,,1.079521e-78,college_graduate
9,83750.0,male,45.0,never_married,,10.0,1.360000e+00,5.0,5.397605e-79,yes,,,no,,,,20.0,1.0,2.000000e+00,9-11th_grade


In [20]:
df.isnull().sum()

ID                     0
GENDER                 0
AGE                    0
MARRIAGE             261
PREGNANCY           3603
INCOME               168
INCOME_RATIO         466
HOUSEHOLD_NUM          0
OVER_60_NUM            0
EVER_USE            1415
FIRST_USAGE         4843
FREQUENCY           3973
PER_MONTH           3128
PER_YEAR            3974
LAST_ONCE_MONTH     4015
AMOUNT              3973
LAST_USAGE          3154
PAST_MONTH_USAGE    4299
CHILDREN_NUM           0
EDUCATION            259
dtype: int64