In [210]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression



In [211]:
cols = ['Age', 
        'Gender',
        'Country',
        'self_employed',
        'family_history',
        'treatment',
        'work_interfere', 
        'no_employees',
        'remote_work',
        'tech_company',
        'benefits',
        'care_options',
        'wellness_program',
        'seek_help',
        'leave',
        'coworkers',
        'mental_vs_physical', 
        'mental_health_consequence',
        'obs_consequence']

unprocessed_data = pd.read_csv('survey.csv', usecols=cols)

In [212]:
unprocessed_data.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,leave,mental_health_consequence,coworkers,mental_vs_physical,obs_consequence
0,37,Female,United States,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Somewhat easy,No,Some of them,Yes,No
1,44,M,United States,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Maybe,No,Don't know,No
2,32,Male,Canada,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Somewhat difficult,No,Yes,No,No
3,31,Male,United Kingdom,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,Somewhat difficult,Yes,Some of them,No,Yes
4,31,Male,United States,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,No,Some of them,Don't know,No


In [213]:
len(unprocessed_data)


1259

In [214]:
unprocessed_mat = unprocessed_data.values.tolist()
#Age feature
age = []

# Gender features
male_female = []
trans = []

# Location feature
in_states = []

# Employment type features
self_employed_yes_no = []
self_employed_na = []

# Family History feature
family_history = []

# Treatment feature
treatment_yes_no =[]

# Work interference features
work_interfere_often = []
work_interfere_rarely = []
work_interfere_na = []
work_interfere_sometimes = []
work_interfere_never = []

# Company Size feature
no_employees = []

# Remote work features
remote_work_yes_no = []

# Tech company features
tech_company_yes_no = []

# Benefits features
benefits_yes_no = []
benefits_dont_know = []

# Care options features
care_options_yes_no = []
care_options_not_sure = []

# Wellness program features
wellness_yes_no = []
wellness_dont_know = []

# Seek help features
seek_help_yes_no = []
seek_help_dont_know = []

# Leave features
leave_somewhat_easy = []
leave_somewhat_difficult = []
leave_very_easy = []
leave_very_difficult = []
leave_dont_know = []

# Coworker features
coworkers_yes_no = []
coworkers_some_of_them = []

# Mental vs. physical features
mental_vs_physical_yes_no = []
mental_vs_physical_dont_know = []

# Observed coworker behaviour feature
obs_consequence = []

# Output
mental_health_consequence_yes_no = []


for row in range(0, len(unprocessed_mat)):
    # Age
    age_val = unprocessed_mat[row][0]
    if age_val < 0 or age_val > 80:
        age.append(0)
    else:
        age.append(unprocessed_mat[row][0])
    
    # Gender 
    male_labels = ['m','male','male-ish', 'cis man', 'cis male', 'mal', 'male (cis)', 'man', 'malr', 'man', 'msle']
    female_labels = ['f', 'female', 'woman', 'cis female', 'cis-female/femme', 'femail', 'femake']
    
    gender_val = unprocessed_mat[row][1].lower()
    if gender_val in male_labels:
        male_female.append(0)
        trans.append(0)
    elif gender_val in female_labels:
        male_female.append(1)
        trans.append(0)
    else:
        male_female.append(0)
        trans.append(1)
    
    # Location
    location_val = unprocessed_mat[row][2].lower()
    if location_val == 'united states':
        in_states.append(1)
    else:
        in_states.append(0)
    
    # Self_employed
    self_employed_val = unprocessed_mat[row][3]
    if self_employed_val == 'No':
        self_employed_yes_no.append(0)
        self_employed_na.append(0)
    elif self_employed_val == 'Yes':
        self_employed_yes_no.append(1)
        self_employed_na.append(0)
    else:
        self_employed_yes_no.append(0)
        self_employed_na.append(1)
    
    # Family History
    family_history_val = unprocessed_mat[row][4].lower()
    if family_history_val == 'yes':
        family_history.append(1)
    else:
        family_history.append(0)
    
    # Treatment 
    treatment_val = unprocessed_mat[row][5].lower()
    if treatment_val == "yes":
        treatment_yes_no.append(1)
    else:
        treatment_yes_no.append(0)
    
    # Work interference
    work_interfere_val = unprocessed_mat[row][6]
    if work_interfere_val == 'Often':
        work_interfere_often.append(1)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif work_interfere_val == 'Rarely':
        work_interfere_often.append(0)
        work_interfere_rarely.append(1)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif work_interfere_val == 'Sometimes':
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(1)
        work_interfere_never.append(0)
        work_interfere_na.append(0)
    elif work_interfere_val == 'Never':
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(1)
        work_interfere_na.append(0)
    else:
        work_interfere_often.append(0)
        work_interfere_rarely.append(0)
        work_interfere_sometimes.append(0)
        work_interfere_never.append(0)
        work_interfere_na.append(1)
        
    # Company size
    no_employees_val = unprocessed_mat[row][7].lower()
    if no_employees_val == 'more than 1000':
        no_employees.append(1000)
    else:
        arr = no_employees_val.split('-')
        no_employees.append(arr[0])
    
    # Remote work
    remote_work = unprocessed_mat[row][8].lower()
    if remote_work == "yes":
        remote_work_yes_no.append(1)
    else:
        remote_work_yes_no.append(0)
    
    # Company type
    tech_company = unprocessed_mat[row][9].lower()
    if tech_company == "yes":
        tech_company_yes_no.append(1)
    else:
        tech_company_yes_no.append(0)
    
    # Benefits
    benefits = unprocessed_mat[row][10].lower()
    if benefits == "yes":
        benefits_yes_no.append(1)
        benefits_dont_know.append(0)
    elif benefits == "no":
        benefits_yes_no.append(0)
        benefits_dont_know.append(0)
    else:
        benefits_yes_no.append(0)
        benefits_dont_know.append(1)
    
    # Care options
    care_options = unprocessed_mat[row][11].lower()
    if care_options == "yes":
        care_options_yes_no.append(1)
        care_options_not_sure.append(0)
    elif care_options == "no":
        care_options_yes_no.append(0)
        care_options_not_sure.append(0)
    else:
        care_options_yes_no.append(0)
        care_options_not_sure.append(1)
    
    # Wellness
    wellness = unprocessed_mat[row][12].lower()
    if wellness == "yes":
        wellness_yes_no.append(1)
        wellness_dont_know.append(0)
    elif wellness == "no":
        wellness_yes_no.append(0)
        wellness_dont_know.append(0)
    else:
        wellness_yes_no.append(0)
        wellness_dont_know.append(1)
    
    # Seeked help
    seek_help = unprocessed_mat[row][13].lower()
    if seek_help == "yes":
        seek_help_yes_no.append(1)
        seek_help_dont_know.append(0)
    elif seek_help == "no":
        seek_help_yes_no.append(0)
        seek_help_dont_know.append(0)
    else:
        seek_help_yes_no.append(0)
        seek_help_dont_know.append(1)
    
    # Leave
    leave = unprocessed_mat[row][14].lower()
    if leave == "somewhat easy":
        leave_somewhat_easy.append(1)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "somewhat difficult":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(1)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "very easy":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(1)
        leave_very_difficult.append(0)
        leave_dont_know.append(0)
    elif leave == "very difficult":
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(1)
        leave_dont_know.append(0)
    else:
        leave_somewhat_easy.append(0)
        leave_somewhat_difficult.append(0)
        leave_very_easy.append(0)
        leave_very_difficult.append(0)
        leave_dont_know.append(1)
        
    # Output
    mental_health_consequence = unprocessed_mat[row][15].lower()
    if mental_health_consequence == "yes":
        mental_health_consequence_yes_no.append(1)
    else:
        mental_health_consequence_yes_no.append(0)
    
    # Coworkers
    coworkers = unprocessed_mat[row][16].lower()
    if coworkers == "yes":
        coworkers_yes_no.append(1)
        coworkers_some_of_them.append(0)
    elif coworkers == "no":
        coworkers_yes_no.append(0)
        coworkers_some_of_them.append(0)
    else:
        coworkers_yes_no.append(0)
        coworkers_some_of_them.append(1)
    
    # Mental vs physical 
    mental_vs_physical = unprocessed_mat[row][17].lower()
    if mental_vs_physical == "yes":
        mental_vs_physical_yes_no.append(1)
        mental_vs_physical_dont_know.append(0)
    elif mental_vs_physical == "no":
        mental_vs_physical_yes_no.append(0)
        mental_vs_physical_dont_know.append(0)
    else:
        mental_vs_physical_yes_no.append(0)
        mental_vs_physical_dont_know.append(1)
        
    # Observed behaviour
    obs = unprocessed_mat[row][18].lower()
    
    if obs == "yes":
        obs_consequence.append(1)
    else:
        obs_consequence.append(0)
        

In [215]:
training_mat = np.column_stack((age,
                                male_female,
                                trans,
                                in_states,
                                self_employed_yes_no,
                                self_employed_na,
                                family_history,
                                treatment_yes_no,
                                work_interfere_often,
                                work_interfere_rarely,
                                work_interfere_na,
                                work_interfere_sometimes,
                                work_interfere_never,
                                no_employees,
                                remote_work_yes_no,
                                tech_company_yes_no,
                                benefits_yes_no,
                                benefits_dont_know,
                                care_options_yes_no,
                                care_options_not_sure,
                                wellness_yes_no,
                                wellness_dont_know,
                                seek_help_yes_no,
                                seek_help_dont_know,
                                leave_somewhat_easy,
                                leave_somewhat_difficult,
                                leave_very_easy,
                                leave_very_difficult,
                                leave_dont_know,
                                coworkers_yes_no,
                                coworkers_some_of_them,
                                mental_vs_physical_yes_no,
                                mental_vs_physical_dont_know,
                                obs_consequence))

features = [
    "age",
    "male_female",
    "trans",
    "in_states",
    "self_employed_yes_no",
    "self_employed_na",
    "family_history",
    "treatment",
    "work_interfere_often",
    "work_interfere_rarely",
    "work_interfere_na",
    "work_interfere_sometimes",
    "work_interfere_never",
    "no_employees",
    "remote_work_yes_no",
    "tech_company_yes_no",
    "benefits_yes_no",
    "benefits_dont_know",
    "care_options_yes_no",
    "care_options_not_sure",
    "wellness_yes_no",
    "wellness_dont_know",
    "seek_help_yes_no",
    "seek_help_dont_know",
    "leave_somewhat_easy",
    "leave_somewhat_difficult",
    "leave_very_easy",
    "leave_very_difficult",
    "leave_dont_know",
    "coworkers_yes_no",
    "coworkers_some_of_them",
    "mental_vs_physical_yes_no",
    "mental_vs_physical_dont_know",
    "obs_consequence"
]

In [216]:
training_mat

array([['37', '1', '0', ..., '1', '0', '0'],
       ['44', '0', '0', ..., '0', '1', '0'],
       ['32', '0', '0', ..., '0', '0', '0'],
       ..., 
       ['34', '0', '0', ..., '0', '0', '0'],
       ['46', '1', '0', ..., '0', '0', '0'],
       ['25', '0', '0', ..., '0', '1', '0']], 
      dtype='|S21')

In [217]:
len(training_mat[0])

34

In [218]:
output_mat = mental_health_consequence_yes_no

output_mat

len(output_mat)



1259

In [219]:
mdl = LogisticRegression()

rfe = RFE(mdl, 10)
rfe = rfe.fit(training_mat, output_mat)

print rfe.support_
print rfe.ranking_

training_mat_imp = np.zeros((1259, 10))
extracted_features = []

counter = 0;
for i in range(0, len(rfe.ranking_)):
    if rfe.ranking_[i] == 1:
        training_mat_imp[:,counter] = training_mat[:,i]
        extracted_features.append(features[i])
        counter += 1
        
print extracted_features

X_train, X_test, y_train, y_test = train_test_split(training_mat_imp, output_mat, test_size=0.1, random_state=42)


[False False  True False False False False False False False  True False
 False False False False False False False False False  True False False
 False  True False  True False  True  True  True  True  True]
[24 13  1 20  7  2 22 21  5 18  1  6 23 25  3 15 16 14 10  9  8  1 11 17  4
  1 12  1 19  1  1  1  1  1]
['trans', 'work_interfere_na', 'wellness_dont_know', 'leave_somewhat_difficult', 'leave_very_difficult', 'coworkers_yes_no', 'coworkers_some_of_them', 'mental_vs_physical_yes_no', 'mental_vs_physical_dont_know', 'obs_consequence']


In [220]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [221]:
predicted = rf.predict(X_test)

rf.feature_importances_

array([ 0.04996238,  0.05806639,  0.05233837,  0.06993602,  0.13066783,
        0.1504737 ,  0.16140777,  0.12639489,  0.10364756,  0.09710508])

In [222]:
count = 0
for i in range(len(predicted)):
    if predicted[i] == y_test[i]:
        count+=1

count

107

In [226]:
precision = 1.0*count/len(y_test)
precision
len(rf.estimators_)

500

from sklearn.tree import export_graphviz
import os
export_graphviz(rf.estimators_[499],
                feature_names=extracted_features,
                out_file="tree.dot")
os.system('dot -Tpng tree.dot -o tree.png')