In [140]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression



In [141]:
cols = ['Age', 
        'Gender',
        'Country',
        'self_employed',
        'family_history',
        'treatment',
        'work_interfere', 
        'no_employees',
        'remote_work',
        'tech_company',
        'benefits',
        'care_options',
        'wellness_program',
        'seek_help',
        'leave',
        'coworkers',
        'mental_vs_physical', 
        'mental_health_consequence',
        'obs_consequence']

unprocessed_data = pd.read_csv('survey.csv', usecols=cols)

In [142]:
unprocessed_data.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,leave,mental_health_consequence,coworkers,mental_vs_physical,obs_consequence
0,37,Female,United States,,No,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Somewhat easy,No,Some of them,Yes,No
1,44,M,United States,,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Maybe,No,Don't know,No
2,32,Male,Canada,,No,Rarely,6-25,No,Yes,No,No,No,No,Somewhat difficult,No,Yes,No,No
3,31,Male,United Kingdom,,Yes,Often,26-100,No,Yes,No,Yes,No,No,Somewhat difficult,Yes,Some of them,No,Yes
4,31,Male,United States,,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,No,Some of them,Don't know,No


In [143]:
len(unprocessed_data)

1259

In [144]:
unprocessed_mat = unprocessed_data.values.tolist()
age = []
male_female = []
trans = []

in_states = []

self_employed_yes_no = []
self_employed_na = []

family_history = []
treatment_yes_no =[]

work_interfere_often = []
work_interfere_rarely = []
work_interfere_na = []
work_interfere_sometimes = []
work_interfere_never = []

no_employees = []

# remote work features
remote_work_yes_no = []
# tech company features
tech_company_yes_no = []
# benefits features
benefits_yes_no = []
benefits_dont_know = []
# care options features
care_options_yes_no = []
care_options_not_sure = []
# wellness program features
wellness_yes_no = []
wellness_dont_know = []
# seek help features
seek_help_yes_no = []
seek_help_dont_know = []
# leave features
leave_somewhat_easy = []
leave_somewhat_difficult = []
leave_very_easy = []
leave_very_difficult = []
leave_dont_know = []
# coworker features
coworkers_yes_no = []
coworkers_some_of_them = []
# mental vs. physical features
mental_vs_physical_yes_no = []
mental_vs_physical_dont_know = []

# Output
mental_health_consequence_yes_no = []
mental_health_consequence_maybe = []

obs_consequence = []

for row in range(0, len(unprocessed_mat)):
    # age
    if unprocessed_mat[row][0] < 0 or unprocessed_mat[row][0] > 80:
        age.append(0)
    else:
        age.append(unprocessed_mat[row][0])
    
    # Gender 
    male_labels = ['m','male','male-ish', 'cis man', 'cis male', 'mal', 'male (cis)', 'man', 'malr', 'man', 'msle']
    female_labels = ['f', 'female', 'woman', 'cis female', 'cis-female/femme', 'femail', 'femake']
    if unprocessed_mat[row][1].lower() in male_labels:
        male_female.append(0)
        trans.append(0)
    elif unprocessed_mat[row][1].lower() in female_labels:
        male_female.append(1)
        trans.append(0)
    else:
        male_female.append(0)
        trans.append(1)
    
    # Location
    if unprocessed_mat[row][2] == 'United States':
        in_states.append(1)
    else:
        in_states.append(0)
    
    # self_employed
    if unprocessed_mat[row][3] == 'No':
        self_employed_yes_no.append(0)
        self_employed_na.append(0)
    elif unprocessed_mat[row][3] == 'Yes':
        self_employed_yes_no.append(1)
        self_employed_na.append(0)
    else:
        self_employed_yes_no.append(0)
        self_employed_na.append(1)
        
    if unprocessed_mat[row][4].lower() == 'yes':
        family_history.append(1)
    else:
        family_history.append(0)
        
    if unprocessed_mat[row][5].lower() == "yes":
        treatment_yes_no.append(1)
    else:
        treatment_yes_no.append(0)
    
    # work interfere
    if unprocessed_mat[row][6] == 'Often':
        work_interfere_often.append(1)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif unprocessed_mat[row][6] == 'Rarely':
        work_interfere_often.append(0)
        work_interfere_rarely.append(1)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif unprocessed_mat[row][6] == 'Sometimes':
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(1)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif unprocessed_mat[row][6] == 'Never':
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(1)
        work_interfere_na.append(0)
    else:
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(1)
        
    # number of employees
    if unprocessed_mat[row][7] == 'More than 1000':
        no_employees.append(1000)
    else:
        arr = unprocessed_mat[row][7].split('-')
        no_employees.append(arr[0])
        
    remote_work = unprocessed_mat[row][8].lower()
    if remote_work == "yes":
        remote_work_yes_no.append(1)
    else:
        remote_work_yes_no.append(0)
        
    tech_company = unprocessed_mat[row][9].lower()
    if tech_company == "yes":
        tech_company_yes_no.append(1)
    else:
        tech_company_yes_no.append(0)
    
    benefits = unprocessed_mat[row][10].lower()
    if benefits == "yes":
        benefits_yes_no.append(1)
        benefits_dont_know.append(0)
    elif benefits == "no":
        benefits_yes_no.append(0)
        benefits_dont_know.append(0)
    else:
        benefits_yes_no.append(0)
        benefits_dont_know.append(1)
        
    care_options = unprocessed_mat[row][11].lower()
    if care_options == "yes":
        care_options_yes_no.append(1)
        care_options_not_sure.append(0)
    elif care_options == "no":
        care_options_yes_no.append(0)
        care_options_not_sure.append(0)
    else:
        care_options_yes_no.append(0)
        care_options_not_sure.append(1)
    
    wellness = unprocessed_mat[row][12].lower()
    if wellness == "yes":
        wellness_yes_no.append(1)
        wellness_dont_know.append(0)
    elif wellness == "no":
        wellness_yes_no.append(0)
        wellness_dont_know.append(0)
    else:
        wellness_yes_no.append(0)
        wellness_dont_know.append(1)
        
    seek_help = unprocessed_mat[row][13].lower()
    if seek_help == "yes":
        seek_help_yes_no.append(1)
        seek_help_dont_know.append(0)
    elif seek_help == "no":
        seek_help_yes_no.append(0)
        seek_help_dont_know.append(0)
    else:
        seek_help_yes_no.append(0)
        seek_help_dont_know.append(1)
        
    leave = unprocessed_mat[row][14].lower()
    if leave == "somewhat easy":
        leave_somewhat_easy.append(1)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "somewhat difficult":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(1)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "very easy":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(1)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "very difficult":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(1)
        leave_dont_know.append(0)
    else:
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(1)
        
    coworkers = unprocessed_mat[row][16].lower()
    if coworkers == "yes":
        coworkers_yes_no.append(1)
        coworkers_some_of_them.append(0)
    elif coworkers == "no":
        coworkers_yes_no.append(0)
        coworkers_some_of_them.append(0)
    else:
        coworkers_yes_no.append(0)
        coworkers_some_of_them.append(1)
        
    mental_vs_physical = unprocessed_mat[row][17].lower()
    if mental_vs_physical == "yes":
        mental_vs_physical_yes_no.append(1)
        mental_vs_physical_dont_know.append(0)
    elif mental_vs_physical == "no":
        mental_vs_physical_yes_no.append(0)
        mental_vs_physical_dont_know.append(0)
    else:
        mental_vs_physical_yes_no.append(0)
        mental_vs_physical_dont_know.append(1)
        
    mental_health_consequence = unprocessed_mat[row][15].lower()
    if mental_health_consequence == "yes":
        mental_health_consequence_yes_no.append(1)
        mental_health_consequence_maybe.append(0)
    elif mental_health_consequence == "no":
        mental_health_consequence_yes_no.append(0)
        mental_health_consequence_maybe.append(0)
    else:
        mental_health_consequence_yes_no.append(0)
        mental_health_consequence_maybe.append(1)
    
    obs = mental_health_consequence = unprocessed_mat[row][18].lower()
    
    if obs == "yes":
        obs_consequence.append(1)
    else:
        obs_consequence.append(0)
        

In [145]:
training_mat = np.column_stack((#age,
                                #male_female,
                                trans,
#                                 in_states,
#                                 self_employed_yes_no,
#                                 self_employed_na,
#                                 family_history,
#                                 treatment_yes_no,
#                                 work_interfere_often,
#                                 work_interfere_rarely,
                                 work_interfere_na,
#                                 work_interfere_sometimes,
#                                 work_interfere_never,
#                                 no_employees,
#                                 remote_work_yes_no,
#                                 tech_company_yes_no,
#                                 benefits_yes_no,
#                                 benefits_dont_know,
#                                 care_options_yes_no,
#                                 care_options_not_sure,
#                                 wellness_yes_no,
                                wellness_dont_know,
#                                 seek_help_yes_no,
#                                 seek_help_dont_know,
#                                 leave_somewhat_easy,
                                leave_somewhat_difficult,
#                                 leave_very_easy,
                                leave_very_difficult,
#                                 leave_dont_know,
                                coworkers_yes_no,
                                coworkers_some_of_them,
                                mental_vs_physical_yes_no,
                                mental_vs_physical_dont_know,
                                obs_consequence))

In [146]:
training_mat

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [147]:
len(training_mat[0])

10

In [162]:
output_mat = np.column_stack((mental_health_consequence_yes_no,
                              mental_health_consequence_maybe))
output_mat

len(output_mat)



1259

In [168]:

X_train, X_test, y_train, y_test = train_test_split(training_mat, output_mat, test_size=0.1, random_state=42)


# mdl = LogisticRegression()

# rfe = RFE(mdl, 10)
# rfe = rfe.fit(training_mat, mental_health_consequence_yes_no)

# print rfe.support_
# print rfe.ranking_

# for i in range(0, len(rfe.ranking_)):
#     if rfe.ranking_[i] == 1:
#         print i
        
# print "_____"
# rfe = rfe.fit(training_mat, mental_health_consequence_maybe)
# for i in range(0, len(rfe.ranking_)):
#     if rfe.ranking_[i] == 1:
#         print i

In [169]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [170]:
predicted = rf.predict(X_test)

rf.feature_importances_

array([ 0.05151241,  0.07882251,  0.07540037,  0.07632267,  0.10327742,
        0.14414929,  0.13977055,  0.13893071,  0.09586299,  0.09595107])

In [171]:
count = 0
for i in range(len(predicted)):
    if predicted[i][0] == y_test[i][0] and predicted[i][1] == y_test[i][1]:
        count+=1

count

74

In [172]:
precision = 1.0*count/len(y_test)
precision

0.5873015873015873

In [173]:
from sklearn.tree import export_graphviz
import os
export_graphviz(rf.estimators_[0],
                out_file="tree.dot")
os.system('dot -Tpng tree.dot -o tree.png')

0