# Graduate Project - CSCI E-29

## Using Python 3 and Machine Learning Functionality to build A model to Predict Mental Health Status of US Veterans of the Vietnam War
### Danielle Crumley





https://github.com/daniellecrumley/CSCI-E-29-Graduate-Project

### Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict


## Reading in raw data from text file into usable format
### See data dictionary for description

In [2]:
with open ('08446-0001-Data.txt', 'r') as f:
    raw_data = f.readlines()
raw_data = np.array(raw_data)
raw_data

array(['01   1 1 2 2       9      35       9      55   3  15   1 1 3 2 2 2 2 2 2        \n',
       '02   1      6971 2 2 1 5   2 2 2 2 2 2       1 1 1 1 6 545 1 3 1 1 2 2 0 4 5    \n',
       '03   1 1 1 0     0 2                                           0   0     0 0    \n',
       ...,
       '07 757  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0     \n',
       '08 757  0  0  0  0  0  0  0  0  3  1  1  0  1  0  0  3999 19  0  0  0           \n',
       '09 757  0  0  3  0  0  0  1  1  1  1  9  1  1                                   \n'],
      dtype='<U81')

In [3]:
raw_data.shape

(11241,)

In [4]:
raw_data_by_resp = raw_data.reshape(int(11241/9), -1)
raw_data_by_resp = pd.DataFrame(raw_data_by_resp)

# this is the data from the first respondent
raw_data_by_resp.iloc[0, :]

0    01   1 1 2 2       9      35       9      55  ...
1    02   1      6971 2 2 1 5   2 2 2 2 2 2       1...
2    03   1 1 1 0     0 2                          ...
3    04   1 0 0 0                      1 0         ...
4    05   1 9 2 3 2 1 2 2 2 1 1  0  0  0  0  0  0  ...
5    06   1  0  0  0  0  0  0  0  0  0  1  0  0  0 ...
6    07   1  0  0  0  0  0  0  0  0  0  0  0  0  0 ...
7    08   1  0  0  0  0  0  0  0  0  2  1  2  0  1 ...
8    09   1  0  0  2  0  0  0  1  2  2  1  2  1  1 ...
Name: 0, dtype: object

### Variable info from Data Dictionary

In [5]:
# Deck 1
deck1_varnames = ['deck', 'ID','card', 'time_zone', 'vet_type',
                                           'hr_begun', 'min_begun', 'hr_ended', 'min_ended',
                                           'month', 'day', 'completed_attempt_no',
                                           'Q1', 'Q2', 'Q2a', 'Q2b', 'Q2c', 'Q2d', 'Q2e', 'Q2f']
deck1_varlens = [2,4,2,2,2,8,8,8,8,4,4,4,2,2,2,2,2,2,2,2]

# Deck 2
deck2_varnames = ['deck', 'ID','Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12',
                 'Q12a', 'Q12b', 'Q12c', 'Q12d', 'Q12e', 'Q12f', 'Q12g', 'Q12h', 'Q12i',
                 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24',
                 'Q25', 'Q26','Q27', 'Q28']
deck2_varlens = [2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

# Deck 3
deck3_varnames = ['deck', 'ID','Q29', 'Q30', 'Q31', 'Q32', 'Q33', 'Q34', 'Q35', 'Q36a', 'Q36b', 'Q36c',
                 'Q36d', 'Q36e', 'Q37', 'Q38a', 'Q38b', 'Q38c', 'Q38d', 'Q39', 'Q40a',
                 'Q40b', 'Q40c', 'Q40d', 'Q40e', 'Q40f', 'Q40g', 'Q40h', 'Q40i', 'Q41', 'Q42', 'Q43', 'Q44',
                 'Q45', 'Q46','Q47', 'Q48']
deck3_varlens = [2,4]
for i in range(len(deck3_varnames)-2):
    deck3_varlens.append(2)
    
# Deck 4
deck4_varnames = ['deck', 'ID','Q49', 'Q50', 'Q51', 'Q52', 'Q53', 'Q54a1', 'Q54b1', 'Q54c1', 'Q54d1', 'Q54e1',
                 'Q54f1', 'Q54g1', 'Q54h1', 'Q55', 'Q56', 'Q57', 'Q58', 'Q59', 'Q60',
                 'Q61', 'Q62', 'Q63', 'Q64', 'Q65', 'Q66', 'Q54a2', 'Q54b2', 'Q54c2', 'Q54d2', 'Q54e2',
                 'Q54f2', 'Q54g2', 'Q54h2','Q67']
deck4_varlens = [2,4, 2, 2, 2, 2, 3]
for i in range(len(deck4_varnames)-7):
    deck4_varlens.append(2)

# Deck 5
deck5_varnames = ['deck', 'ID','Q68','Q69', 'Q70', 'Q71', 'Q72', 'Q73', 'Q74', 'Q75', 'Q76', 'Q79',
                 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
                 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16']
deck5_varlens = [2,4]
for i in range(10):
    deck5_varlens.append(2)
for i in range(16):
    deck5_varlens.append(3)

# Deck 6
deck6_varnames = ['deck', 'ID',
                 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
                 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35',
                 'V36', 'V37', 'V38', 'V39']
deck6_varlens = [2,4]
for i in range(len(deck6_varnames)-2):
    deck6_varlens.append(3)
    
# Deck 7 
deck7_varnames = ['deck', 'ID']
for i in range(40, 63):
    deck7_varnames.append('V'+ str(i))
    
deck7_varlens = [2,4]
for i in range(len(deck7_varnames)-2):
    deck7_varlens.append(3)
    
# DECK 8 
deck8_varnames = ['deck', 'ID',
                 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'Combat1', 'Age',
                 'New11', 'New11a', 'New26', 'Combat', 'New47', 'New53', 'Years2', 'Ageent', 'Ageent2',
                 'V70', 'V71', 'V72']
deck8_varlens = [2,4]
for i in range(len(deck8_varnames)-2):
    deck8_varlens.append(3)
    
# DECK 9
deck9_varnames = ['deck', 'ID',
                 'V73', 'V74', 'Temp4']
for i in range(1,11):
    deck9_varnames.append('Stub'+ str(i))
    
deck9_varlens = [2,4]
for i in range(len(deck9_varnames)-2):
    deck9_varlens.append(3)

In [6]:
deckdata = pd.DataFrame()
for i in range(9):
    deckdata['deck' + str(i+1) + 'data'] = raw_data_by_resp.apply(lambda row:row[i], axis=1)

deckdata.shape
deckdata.head()

Unnamed: 0,deck1data,deck2data,deck3data,deck4data,deck5data,deck6data,deck7data,deck8data,deck9data
0,01 1 1 2 2 9 35 9 55 ...,02 1 6971 2 2 1 5 2 2 2 2 2 2 1...,03 1 1 1 0 0 2 ...,04 1 0 0 0 1 0 ...,05 1 9 2 3 2 1 2 2 2 1 1 0 0 0 0 0 0 ...,06 1 0 0 0 0 0 0 0 0 0 1 0 0 0 ...,07 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,08 1 0 0 0 0 0 0 0 0 2 1 2 0 1 ...,09 1 0 0 2 0 0 0 1 2 2 1 2 1 1 ...
1,01 2 1 3 2 10 30 11 0 ...,02 2 7276 2 2 4 2 2 2 2 2 2 3...,03 2 1 2 2 0 1 ...,04 2 0 0 0 2 ...,05 2 9 2 3 3 2 1 1 2 1 1 0 0 0 0 0 0 ...,06 2 0 0 0 0 0 0 0 0 0 1 0 0 0 ...,07 2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,08 2 0 0 0 0 0 0 0 0 3 1 1 1 1 ...,09 2 0 0 4 0 0 0 2 1 1 9 9 1 1 ...
2,01 3 1 3 2 10 25 10 45 ...,02 3 6971 1 2 5 2 2 2 2 2 2 2...,03 3 1 1 0 0 2 ...,04 3 0 0 0 1 0 ...,05 3 2 2 3 4 2 1 9 1 1 1 0 0 0 0 0 0 ...,06 3 0 0 0 0 0 0 0 0 0 1 0 0 0 ...,07 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,08 3 0 0 0 0 0 0 0 0 3 1 2 0 1 ...,09 3 0 0 2 0 0 0 1 2 2 9 9 1 1 ...
3,01 4 1 3 2 11 5 11 20 ...,02 4 6569 1 2 2 5 2 2 2 2 2 2 2...,03 4 1 1 0 0 1 ...,04 4 0 0 0 9 0 ...,05 4 1 1 2 1 1 2 1 2 1 1 0 0 0 0 0 0 ...,06 4 0 0 0 0 0 0 0 0 0 1 0 0 0 ...,07 4 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,08 4 0 0 0 0 0 0 0 0 2 1 2 0 1 ...,09 4 0 0 4 0 0 0 2 2 2 1 2 1 1 ...
4,01 5 1 4 2 11 5 11 15 ...,02 5 6366 1 1 1 5 2 2 2 2 2 2 2...,03 5 1 1 0 0 1 ...,04 5 0 0 0 2 ...,05 5 1 1 3 1 1 2 2 2 1 2 0 0 0 0 0 0 ...,06 5 0 0 0 0 0 0 0 0 0 1 0 0 0 ...,07 5 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,08 5 0 0 0 0 0 0 0 0 1 1 2 0 1 ...,09 5 0 0 3 0 0 0 1 2 2 2 3 1 1 ...


In [7]:
for i, deck_varlens in enumerate([deck1_varlens, deck2_varlens, deck3_varlens, deck4_varlens, deck5_varlens,
             deck6_varlens, deck7_varlens, deck8_varlens, deck9_varlens]):
    
    var_indices=[0]
    next_index=0
    for varlen in deck_varlens:
        next_index +=varlen
        var_indices.append(next_index)
    
    func = lambda str_data: [str_data[var_indices[i]:var_indices[i+1]] for i in range(len(var_indices)-1)]
    deckdata.loc[:]['deck' + str(i+1) + 'data'] = deckdata.loc[:]['deck' + str(i+1) + 'data'].apply(func)
#display(deckdata.head())
#deckdata.shape

### Finally a recognizable DataFrame!

In [8]:
all_varnames = [deck1_varnames, deck2_varnames, deck3_varnames, deck4_varnames, deck5_varnames,
             deck6_varnames, deck7_varnames, deck8_varnames, deck9_varnames]
data_df = pd.DataFrame(columns=['ID'])

for col, varnames in zip(deckdata, all_varnames):
    df = pd.DataFrame()
    df = pd.DataFrame(deckdata[col].values.tolist(), columns=varnames)
    df = df.applymap(lambda val: val.replace(' ', ''))
    df = df.applymap(lambda x: np.nan if isinstance(x, str) and x=='' else x)
    df = df.fillna('9999')
    df = df.applymap(lambda val: int(val))
    df.drop(columns='deck', inplace=True)
    data_df.drop_duplicates(subset='ID', inplace=True)
    df.drop_duplicates(subset='ID', inplace=True)
    data_df = pd.merge(data_df, df, on='ID', how='outer')

data_df.drop(columns='ID', inplace=True)
    
display(data_df.head())
data_df.shape

Unnamed: 0,card,time_zone,vet_type,hr_begun,min_begun,hr_ended,min_ended,month,day,completed_attempt_no,...,Stub1,Stub2,Stub3,Stub4,Stub5,Stub6,Stub7,Stub8,Stub9,Stub10
0,1,2,2,9,35,9,55,3,15,1,...,0,0,0,1,2,2,1,2,1,1
1,1,3,2,10,30,11,0,3,15,1,...,0,0,0,2,1,1,9,9,1,1
2,1,3,2,10,25,10,45,3,15,1,...,0,0,0,1,2,2,9,9,1,1
3,1,3,2,11,5,11,20,3,15,1,...,0,0,0,2,2,2,1,2,1,1
4,1,4,2,11,5,11,15,3,15,1,...,0,0,0,1,2,2,2,3,1,1


(1247, 227)

### Rename columns in data_df:


In [9]:
d = {
    'Q2a':'Q2a_Vietnam',
    'Q76':'Q76_Sex',
    'Q9':'Q9_EmployedPremilitary',
    'Q18':'Q18_PreMilEducation',
    'Q19':'Q19_YearBorn',
    'Q20':'Q20_SocialClass',
    'Q23':'Q23_MaritalStatus',
    'Q27':'Q27_Race', 
    'Q28':'Q28_Income',
    'Q5':'Q5_MonthsInVN',
    'Q8':'Q8_EnlistOrDraft',
    'Q10':'Q10_MilBranch',
    'Q11':'Q11_MilitaryRank',
    'Q36a':'Q36a_VNBigEffect',
    'Q41':'PresentVNKilled',
    'Q43':'Q43_PresentWhenAmKilled',
    'Q45':'Q45_Killed',
    'Q46':'Q46_Wounded',
    'Q38a':'Q38a_JustifiedBomb',
    'Q38b':'Q38b_JustifiedNapalm',
    'Q38c': 'Q38c_JustifiedChemicals',
    'Q38d':'Q38d_JustfiedDestruc',
    'Q39':'Q39_KnowWhyWar',
    'Q61':'Q61_WorthwhileCause',
    'Q62':'Q62_TrustUSThen',
    'Q54a1':'Q54a1_ProbJobThen',
    'Q54b1':'Q54b1_ProbHealthThen',
    'Q54c1':'Q54c1_ProbMoneyThen',
    'Q54d1':'Q54d1_ProbLonelinessThen',
    'Q54e1':'Q54e1_ProbDrinkingThen',
    'Q54f1':'Q54f1_ProbSocialThen',
    'Q54g1':'Q54g1_ProbEmotionThen',
    'Q54h1':'Q54h1_ProbDrugsThen',
    'Q36b':'Q36b_HappyVetStatus',
    'Q36c':'Q36c_ThinkAboutDeath',
    'Q36e':'Q36e_DreamsInVN'}

data_df = data_df[list(d.keys())]
data_df.rename(columns=d, inplace=True)
display(data_df.head())
data_df.shape

Unnamed: 0,Q2a_Vietnam,Q76_Sex,Q9_EmployedPremilitary,Q18_PreMilEducation,Q19_YearBorn,Q20_SocialClass,Q23_MaritalStatus,Q27_Race,Q28_Income,Q5_MonthsInVN,...,Q54b1_ProbHealthThen,Q54c1_ProbMoneyThen,Q54d1_ProbLonelinessThen,Q54e1_ProbDrinkingThen,Q54f1_ProbSocialThen,Q54g1_ProbEmotionThen,Q54h1_ProbDrugsThen,Q36b_HappyVetStatus,Q36c_ThinkAboutDeath,Q36e_DreamsInVN
0,2,1,2,5,45,1,1,4,5,9999,...,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999
1,2,1,2,3,52,2,5,4,2,9999,...,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999
2,2,1,2,4,49,1,5,4,4,9999,...,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999
3,2,1,2,3,46,2,1,4,5,9999,...,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999
4,2,1,1,3,44,1,1,4,6,9999,...,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999


(1247, 36)

In [10]:
# Only include Vietnam War Veterans
data_df = data_df[data_df.Q2a_Vietnam == 1]
display(data_df.head())
display(data_df.shape)

Unnamed: 0,Q2a_Vietnam,Q76_Sex,Q9_EmployedPremilitary,Q18_PreMilEducation,Q19_YearBorn,Q20_SocialClass,Q23_MaritalStatus,Q27_Race,Q28_Income,Q5_MonthsInVN,...,Q54b1_ProbHealthThen,Q54c1_ProbMoneyThen,Q54d1_ProbLonelinessThen,Q54e1_ProbDrinkingThen,Q54f1_ProbSocialThen,Q54g1_ProbEmotionThen,Q54h1_ProbDrugsThen,Q36b_HappyVetStatus,Q36c_ThinkAboutDeath,Q36e_DreamsInVN
7,1,1,4,3,48,1,1,4,6,21,...,2,1,1,1,1,1,2,1,1,1
8,1,1,2,4,44,2,1,4,4,99,...,2,2,2,2,2,2,2,2,2,2
9,1,1,3,2,51,2,1,4,3,13,...,2,1,1,1,2,1,1,2,2,2
11,1,1,2,3,47,2,1,1,4,14,...,2,1,1,1,1,1,2,1,1,1
12,1,1,2,4,47,1,1,4,5,12,...,2,2,2,1,2,2,2,2,2,2


(737, 36)

In [11]:
X_demographics = [
 'Q2a_Vietnam',
 'Q76_Sex',
 'Q9_EmployedPremilitary',
 'Q18_PreMilEducation',
 'Q19_YearBorn',
 'Q20_SocialClass',
 'Q23_MaritalStatus',
 'Q27_Race',
 'Q28_Income']

X_military = [
 'Q5_MonthsInVN',
 'Q8_EnlistOrDraft',
 'Q10_MilBranch',
 'Q11_MilitaryRank',
 'Q36a_VNBigEffect',
 'PresentVNKilled',
 'Q43_PresentWhenAmKilled',
 'Q45_Killed',
 'Q46_Wounded']

X_opinions=[
 'Q38a_JustifiedBomb',
 'Q38b_JustifiedNapalm',
 'Q38c_JustifiedChemicals',
 'Q38d_JustfiedDestruc',
 'Q39_KnowWhyWar',
 'Q61_WorthwhileCause',
 'Q62_TrustUSThen']

X_problems=['Q54a1_ProbJobThen', 'Q54b1_ProbHealthThen', 'Q54c1_ProbMoneyThen']

y_cols = ['Q54d1_ProbLonelinessThen',
 'Q54e1_ProbDrinkingThen',
 'Q54f1_ProbSocialThen',
 'Q54g1_ProbEmotionThen',
 'Q54h1_ProbDrugsThen',
 'Q36b_HappyVetStatus',
 'Q36c_ThinkAboutDeath',
 'Q36e_DreamsInVN']

# Y variable feature selection

In [12]:
y_df = data_df.loc[:,y_cols]
display(y_df.head())
y_df.shape

Unnamed: 0,Q54d1_ProbLonelinessThen,Q54e1_ProbDrinkingThen,Q54f1_ProbSocialThen,Q54g1_ProbEmotionThen,Q54h1_ProbDrugsThen,Q36b_HappyVetStatus,Q36c_ThinkAboutDeath,Q36e_DreamsInVN
7,1,1,1,1,2,1,1,1
8,2,2,2,2,2,2,2,2
9,1,1,2,1,1,2,2,2
11,1,1,1,1,2,1,1,1
12,2,1,2,2,2,2,2,2


(737, 8)

In [13]:
# FOR ALL OF THESE, NOW CODING SO THAT 1=PROBLEM (INDICATOR OF WORSE MENTAL HEALTH)
# originally 1 = problem, 2= no, (9, 0, 9999 are missing)
# 0 will be no, 1 will be yes, missing will be replaced with 0

for col in y_df:
    y_df[col].replace({2:0, 9:0, 9999:0}, inplace=True)
    #display(y_df[col].value_counts())


## Manually Engineering Mental Health Summary Score

In [14]:
# MentalHealthScore will be the sum of all the binary responses from the current y_df columns
# so higher score theoretically indicates worse mental health.

cols = y_df.columns.tolist()
y_df.loc[:, 'MentalHealthScore'] = y_df[cols[0]]
for col in cols[1:]:
    y_df['MentalHealthScore'] += y_df[col]

y_df.head()


Unnamed: 0,Q54d1_ProbLonelinessThen,Q54e1_ProbDrinkingThen,Q54f1_ProbSocialThen,Q54g1_ProbEmotionThen,Q54h1_ProbDrugsThen,Q36b_HappyVetStatus,Q36c_ThinkAboutDeath,Q36e_DreamsInVN,MentalHealthScore
7,1,1,1,1,0,1,1,1,7
8,0,0,0,0,0,0,0,0,0
9,1,1,0,1,1,0,0,0,4
11,1,1,1,1,0,1,1,1,7
12,0,1,0,0,0,0,0,0,1


In [15]:
display(y_df.MentalHealthScore.value_counts())
# Binarize- 2 options
y_df.loc[:,'MentalHealthScore2'] = y_df.MentalHealthScore.apply(lambda num:0 if num < 2 else 1)
y_df.loc[:,'MentalHealthScore'] = y_df.MentalHealthScore.apply(lambda num:0 if num < 3 else 1)
display(y_df.MentalHealthScore.value_counts())
display(y_df.MentalHealthScore2.value_counts())


0    155
1    150
2    116
3     86
5     78
4     71
6     51
7     22
8      8
Name: MentalHealthScore, dtype: int64

0    421
1    316
Name: MentalHealthScore, dtype: int64

1    432
0    305
Name: MentalHealthScore2, dtype: int64

## PCA to Combine Data into One Mental Health Outcome Score?

In [16]:
pca = PCA(n_components=1)

y_pcomponents = pca.fit_transform(y_df.iloc[:,:8])
#print(pca.explained_variance_ratio_)
principalDf = pd.DataFrame(data=y_pcomponents, columns=['y_principal_component'])
#display(principalDf)
#print(np.percentile(principalDf.y_principal_component,[50]))

principalDf.loc[:,'y_principal_component']=\
principalDf.loc[:,'y_principal_component'].apply(lambda num:0 if num < -.186 else 1)
principalDf.y_principal_component.value_counts()


0    383
1    354
Name: y_principal_component, dtype: int64

# X Variables: Demographics

In [17]:
X_demographics_df = data_df.loc[:,X_demographics]
display(X_demographics_df.head())
X_demographics_df.shape

Unnamed: 0,Q2a_Vietnam,Q76_Sex,Q9_EmployedPremilitary,Q18_PreMilEducation,Q19_YearBorn,Q20_SocialClass,Q23_MaritalStatus,Q27_Race,Q28_Income
7,1,1,4,3,48,1,1,4,6
8,1,1,2,4,44,2,1,4,4
9,1,1,3,2,51,2,1,4,3
11,1,1,2,3,47,2,1,1,4
12,1,1,2,4,47,1,1,4,5


(737, 9)

In [18]:
#X_demographics_df.Q2a_Vietnam.value_counts()
#X_demographics_df.Q76_Sex.value_counts()
X_demographics_df.drop(columns=['Q2a_Vietnam','Q76_Sex'], inplace=True)
X_demographics_df.shape

(737, 7)

In [19]:
# Q9_EmployedPremilitary
# Q9: 1=in school, 2=working fulltime, 3=unemployed, 4=parttime work, 5 or 9 considered missing
#display(X_demographics_df.Q9_EmployedPremilitary.value_counts())

# group part time and fulltime employment; fill missing with most common response (2, working)
X_demographics_df.Q9_EmployedPremilitary.replace({4:2, 5:2, 9:2}, inplace=True)
X_demographics_df.Q9_EmployedPremilitary.value_counts()
#1 = in school, 2=employed, 3=unemployed

2    453
1    186
3     98
Name: Q9_EmployedPremilitary, dtype: int64

In [20]:
# Q18 Education
#original coding 1=8th grade and less, 2=some HS, 3= HS grad, 4=some college, 5=college grad, 6=postgrad, (9,0 missing)
#will be 0: hs or less, 1: more than hs

X_demographics_df.Q18_PreMilEducation.replace({9999:0, 1:0, 2:0, 3:0, 4:1, 5:1, 6:1}, inplace=True)

In [21]:
# Q19 YearBorn
#2-digit year; (99,88 missing)

X_demographics_df.Q19_YearBorn.replace({88:np.nan}, inplace=True)
X_demographics_df.loc[:,'Age'] = X_demographics_df.loc[:,'Q19_YearBorn'].apply(lambda year: 85-year)
X_demographics_df.Age.fillna(X_demographics_df.Age.mean(), inplace=True)
X_demographics_df.drop(columns='Q19_YearBorn', inplace=True)


In [22]:
# Q20 social class
#original coding 1=middle class, 2=working class, (0,9,9999 Missing)
#will be 0=working class, 1=middle

X_demographics_df.Q20_SocialClass.replace({2:0, 9:0, 9999:0}, inplace=True)


In [23]:
# Q23 Marital status
#original coding 1=married, (2-5)=unmarried, (0,9999)=missing
#will be 0=not married, 1=married

X_demographics_df.Q23_MaritalStatus.replace({0:1, 9999:1}, inplace=True)
X_demographics_df.Q23_MaritalStatus.replace({2:0, 3:0,4:0,5:0}, inplace=True)

In [24]:
# Q27 Race
#original coding 4=white, (1,2,3,5,6)=other, 0=missing
#will be 0=white, 1=other

X_demographics_df.loc[:,'Q27_Race'] = \
X_demographics_df.loc[:,'Q27_Race'].apply(lambda num: 0 if num in [0,4] else 1)

In [25]:
# Q28 Income
#original coding , (1,2,3,4)=<30k, (5,6)=30k or more, 0=missing
#will be 0=less, 1=more

X_demographics_df.loc[:,'Q28_Income'] = \
X_demographics_df.loc[:,'Q28_Income'].apply(lambda num: 0 if num < 5 else 1)

# X Variables: Military Experiences

In [26]:
X_Military_df = data_df.loc[:,X_military]
display(X_Military_df.head())


Unnamed: 0,Q5_MonthsInVN,Q8_EnlistOrDraft,Q10_MilBranch,Q11_MilitaryRank,Q36a_VNBigEffect,PresentVNKilled,Q43_PresentWhenAmKilled,Q45_Killed,Q46_Wounded
7,21,1,3,5,1,1,1,1,1
8,99,2,2,7,2,2,9,2,9999
9,13,1,1,5,2,0,0,2,9999
11,14,2,1,5,1,1,1,1,1
12,12,1,4,4,2,2,2,2,2


In [27]:
# Q5 Months In Vietnam
#Q5_MonthsInVN = 99 means missing value according to codebook
X_Military_df["Q5_MonthsInVN"].replace({99:np.nan}, inplace=True)
X_Military_df['Q5_MonthsInVN'].fillna(X_Military_df['Q5_MonthsInVN'].mean(), inplace=True)

In [28]:
# Q8 Enlisted or Drafted
#Q8: 1 means enlisted, 2 means drafted, 3 means national guard-reserve, 0 is missing
# group national guard with enlisted; also fill the only missing(0) value with enlisted since it's the more common response
X_Military_df.Q8_EnlistOrDraft.replace({0:1, 3:1}, inplace=True)
X_Military_df.Q8_EnlistOrDraft.replace({2:0}, inplace=True)

In [29]:
#Q10 Military Branch
# Q10: 1=army, 2=navy, 3=marines, 4=air force, 5=coast guard, 6=other, 0,9999=missing
# group coast guard with navy; replace missing and other (6) with most common (1)
# will be 1=army, 0=other

X_Military_df.loc[:,'Q10_MilBranch'] = \
X_Military_df.loc[:,'Q10_MilBranch'].apply(lambda num: 0 if num != 1 else num)


In [30]:
# Military Rank
#0,9999 are missing
# will be 0=lower, 1=higher rank

X_Military_df.Q11_MilitaryRank.replace({0:4, 9999:4}, inplace=True)
X_Military_df.loc[:,'Q11_MilitaryRank']=\
X_Military_df.loc[:,'Q11_MilitaryRank'].apply(lambda num: 0 if num <5 else 1)


In [31]:
# Q36 Vietnam was biggest event of my life

#1 is yes, 2 is no (changing to 0), 9 missing
X_Military_df.loc[:,'Q36a_VNBigEffect']=\
X_Military_df.loc[:,'Q36a_VNBigEffect'].apply(lambda num: 0 if num > 1 else 1)


In [32]:
# Present when Vietnamese were Killed
# originally 1 is yes, 2 is no, (9,0 are missing)

X_Military_df.PresentVNKilled.replace({9:2,0:2}, inplace=True)
X_Military_df.PresentVNKilled.replace({2:0}, inplace=True)

In [33]:
# Present when americans were killed
# originally 1 is yes, 2 is no, (9,0,9999 are missing)
X_Military_df.Q43_PresentWhenAmKilled.replace({9:2,0:2,9999:2}, inplace=True)
X_Military_df.Q43_PresentWhenAmKilled.replace({2:0}, inplace=True)

In [34]:
# Q45 Killed
# 1=yes, 2=no, 9,0 missing

X_Military_df.Q45_Killed.replace({0:9}, inplace=True)
X_Military_df.loc[:,'Q45_Killed']=X_Military_df.loc[:,'Q45_Killed'].map({2:0,9:1,1:1})


In [35]:
# Q46 Wounded
# 1=yes, 2=no, 9,0 missing
X_Military_df.loc[:,'Q46_Wounded']=\
X_Military_df.loc[:,'Q46_Wounded'].apply(lambda num:1 if num==1 else 0)


In [36]:
for col in X_Military_df.iloc[:,1:]:
    display(X_Military_df[col].value_counts())

1    493
0    244
Name: Q8_EnlistOrDraft, dtype: int64

1    423
0    314
Name: Q10_MilBranch, dtype: int64

1    407
0    330
Name: Q11_MilitaryRank, dtype: int64

0    419
1    318
Name: Q36a_VNBigEffect, dtype: int64

1    437
0    300
Name: PresentVNKilled, dtype: int64

1    484
0    253
Name: Q43_PresentWhenAmKilled, dtype: int64

1    407
0    330
Name: Q45_Killed, dtype: int64

0    526
1    211
Name: Q46_Wounded, dtype: int64

# X Variables: Opinions About the War

In [37]:
XOpinionsdf = data_df.loc[:,X_opinions]
XOpinionsdf.shape


(737, 7)

In [38]:
# Justification Variables
#1=yes justified, 2=no, (0,9,9999 missing)
# will be 0=yes, 1 no, missing=0

cols = XOpinionsdf.columns.tolist()[:4]
for col in cols:
    #display(XOpinionsdf[col].value_counts())
    XOpinionsdf[col] = XOpinionsdf[col].map({2:1, 1:0, 0:0, 9:0, 9999:0})

# Manually Engineered Justification Score. 1 = believed the US was less justified in war behaviors
XOpinionsdf.loc[:, 'Justified_Score'] = XOpinionsdf[cols[0]]
for col in cols[1:]:
    XOpinionsdf['Justified_Score'] += XOpinionsdf[col]

XOpinionsdf['Justified_Score'] = XOpinionsdf['Justified_Score'].map({0:0, 1:0, 2:1, 3:1, 4:1})

display(XOpinionsdf.Justified_Score.value_counts())

0    476
1    261
Name: Justified_Score, dtype: int64

### PCA To Build Summary X-Var about Opinions of the US War Tactics (Justified or not?)

In [39]:
#Justified with PCA
    
pca = PCA(n_components=3)
j_pcomponents = pca.fit_transform(XOpinionsdf[cols])
    
#x_principalComponents = pca.fit_transform(X_df)
print(pca.explained_variance_ratio_)

principal_jDf = pd.DataFrame(data=j_pcomponents, columns=['j_pc_1', 'j_pc_2', 'j_pc_3'])
#display(principalDf)

print(np.percentile(principal_jDf.j_pc_1,[50]))
print(np.percentile(principal_jDf.j_pc_2,[50]))
print(np.percentile(principal_jDf.j_pc_3,[50]))


principal_jDf.loc[:,'j_pc_1']=principal_jDf.loc[:,'j_pc_1'].apply(lambda num:0 if num < -.1275 else 1)
principal_jDf.loc[:,'j_pc_2']=principal_jDf.loc[:,'j_pc_2'].apply(lambda num:0 if num < .043 else 1)
principal_jDf.loc[:,'j_pc_3']=principal_jDf.loc[:,'j_pc_3'].apply(lambda num:0 if num < .113 else 1)

principal_jDf

XOpinionsdf['j_pc_1'] = principal_jDf.j_pc_1.values
XOpinionsdf['j_pc_2'] = principal_jDf.j_pc_2.values
#XOpinionsdf['j_pc_3'] = principal_jDf.j_pc_3.values


[0.49533031 0.24787832 0.17230996]
[-0.12758645]
[0.04306894]
[0.11308515]


In [40]:
#KnowWhyWar - Cleaning
XOpinionsdf.loc[:,'Q39_KnowWhyWar'] = \
XOpinionsdf.loc[:,'Q39_KnowWhyWar'].apply(lambda num:0 if num==2 else 1)

In [41]:
# Q61 Worthwhile Cause

XOpinionsdf.loc[:,'Q61_WorthwhileCause'] = \
XOpinionsdf.loc[:,'Q61_WorthwhileCause'].apply(lambda num:0 if num==2 else 1)

In [42]:
# Q62 Trust US
# original 1=distrustful, 2=not
XOpinionsdf.loc[:,'Q62_TrustUSThen'] = \
XOpinionsdf.loc[:,'Q62_TrustUSThen'].apply(lambda num:1 if num==1 else 0)


# X Variables: Problems Upon Returning to Civilian Life

In [43]:
XProblemsdf = data_df.loc[:,X_problems]
display(XProblemsdf.head())
XProblemsdf.shape

Unnamed: 0,Q54a1_ProbJobThen,Q54b1_ProbHealthThen,Q54c1_ProbMoneyThen
7,1,2,1
8,2,2,2
9,2,2,1
11,1,2,1
12,1,2,2


(737, 3)

In [44]:
for col in XProblemsdf:
    #1 = problem, 2=not, 9999=missing; recode to 0=not, 1=problem
    XProblemsdf[col].replace({2:0, 9999:0}, inplace=True)


In [45]:
X_df = pd.DataFrame()
for df in [X_demographics_df, X_Military_df, XOpinionsdf, XProblemsdf]:
    print(df.shape)
    X_df = X_df.join(df, how='outer')
print(X_df.shape)

(737, 7)
(737, 9)
(737, 10)
(737, 3)
(737, 29)


In [46]:
X_df.head()
#for col in X_df:
    #display(X_df[col].value_counts())


Unnamed: 0,Q9_EmployedPremilitary,Q18_PreMilEducation,Q20_SocialClass,Q23_MaritalStatus,Q27_Race,Q28_Income,Age,Q5_MonthsInVN,Q8_EnlistOrDraft,Q10_MilBranch,...,Q38d_JustfiedDestruc,Q39_KnowWhyWar,Q61_WorthwhileCause,Q62_TrustUSThen,Justified_Score,j_pc_1,j_pc_2,Q54a1_ProbJobThen,Q54b1_ProbHealthThen,Q54c1_ProbMoneyThen
7,2,0,1,1,0,1,37.0,21.0,1,0,...,0,0,1,1,1,1,1,1,0,1
8,2,1,0,1,0,0,41.0,13.874317,0,0,...,0,1,0,1,0,0,1,0,0,0
9,3,0,0,1,0,0,34.0,13.0,1,1,...,0,1,1,0,0,1,1,0,0,1
11,2,0,0,1,1,0,38.0,14.0,0,1,...,1,1,0,1,1,1,0,1,0,1
12,2,1,1,1,0,1,38.0,12.0,1,0,...,1,1,1,1,1,1,0,1,0,0


In [47]:
#display(X_df.Age.describe())
X_df.loc[:, 'Age'] = X_df.loc[:, 'Age'].apply(lambda age:1 if age>=39 else 0)
display(X_df.Age.value_counts())


1    375
0    362
Name: Age, dtype: int64

In [48]:
X_df_dummies = pd.get_dummies(X_df, columns=['Q9_EmployedPremilitary', 'Q45_Killed'], drop_first=True)

df = X_df_dummies.join(y_df['MentalHealthScore']).join(y_df['MentalHealthScore2']).join(principalDf.y_principal_component)
df.shape

(737, 33)

In [49]:
# examine to get a rough estimate of which predictor variables aren't as useful
#df.corr().loc[:, 'y_principal_component']
#df.corr().loc[:, 'MentalHealthScore']
df.drop(columns=['Q39_KnowWhyWar', 'Q8_EnlistOrDraft', 'Q9_EmployedPremilitary_2', 'Q9_EmployedPremilitary_3', 
                 'Q27_Race', 'Q38b_JustifiedNapalm', 'Q38d_JustfiedDestruc', 'Q38a_JustifiedBomb', 'Justified_Score', 
                'Q38c_JustifiedChemicals', 'Q10_MilBranch', 'Q20_SocialClass', 'Q18_PreMilEducation',
                'Q23_MaritalStatus', 'Q11_MilitaryRank', 'Q5_MonthsInVN'], inplace=True)
corr = df.corr(method='spearman').loc[:, 'MentalHealthScore2']
corr.sort_values()
display(corr)

Q28_Income                -0.174143
Age                       -0.197338
Q36a_VNBigEffect           0.181337
PresentVNKilled            0.290741
Q43_PresentWhenAmKilled    0.280238
Q46_Wounded                0.215261
Q61_WorthwhileCause       -0.145949
Q62_TrustUSThen            0.177844
j_pc_1                     0.195413
j_pc_2                    -0.070091
Q54a1_ProbJobThen          0.272476
Q54b1_ProbHealthThen       0.203385
Q54c1_ProbMoneyThen        0.299186
Q45_Killed_1               0.235084
MentalHealthScore          0.727965
MentalHealthScore2         1.000000
y_principal_component      0.018294
Name: MentalHealthScore2, dtype: float64

# Testing some models

In [50]:
seed=6

models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier(max_leaf_nodes=25)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('Perceptron', Perceptron(max_iter=10)))

# Can try excluding or including other variables too
X = df.drop(columns=['MentalHealthScore', 'MentalHealthScore2','y_principal_component','j_pc_2'])

y = df.MentalHealthScore2
#y = y_df.MentalHealthScore
#y = principalDf.y_principal_component

display(X.head())

results = []
names = []
scoring = 'accuracy'

for name, model in models:
    stratkfold = model_selection.StratifiedKFold(n_splits=10, random_state=seed)
    y_pred = model_selection.cross_val_predict(model, X, y, cv=stratkfold)
    cm = confusion_matrix(y, y_pred)
    print('model:', name)
    print('accuracy:', accuracy_score(y ,y_pred), "\n")
    #print('classification report:\n', classification_report(y, y_pred))

    #cv_results = model_selection.cross_val_score(model, X, y)
    #results.append(cv_results)
    #names.append(name)
    #msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    #print(msg,"\n")


Unnamed: 0,Q28_Income,Age,Q36a_VNBigEffect,PresentVNKilled,Q43_PresentWhenAmKilled,Q46_Wounded,Q61_WorthwhileCause,Q62_TrustUSThen,j_pc_1,Q54a1_ProbJobThen,Q54b1_ProbHealthThen,Q54c1_ProbMoneyThen,Q45_Killed_1
7,1,0,1,1,1,1,1,1,1,1,0,1,1
8,0,1,0,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,1,0,1,0,0,1,0
11,0,0,1,1,1,1,0,1,1,1,0,1,1
12,1,0,0,0,0,0,1,1,1,1,0,0,0


model: Logistic Regression
accuracy: 0.7245590230664858 

model: CART
accuracy: 0.6770691994572592 

model: NB
accuracy: 0.689280868385346 

model: SVM
accuracy: 0.7272727272727273 

model: Perceptron
accuracy: 0.6445047489823609 



Sources

https://stackoverflow.com/questions/14676265/how-to-read-text-file-into-a-list-or-array-with-python

https://stackoverflow.com/questions/4978787/how-to-split-a-string-into-array-of-characters

https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas

https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python

https://stackoverflow.com/questions/35491274/pandas-split-column-of-lists-into-multiple-columns

https://stackoverflow.com/questions/38134012/pandas-dataframe-fillna-only-some-columns-in-place

https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict

https://stats.stackexchange.com/questions/82050/principal-component-analysis-and-regression-in-python

https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60