# Data Cleaning for SET Data

Date: 07/03/2019

## Unit and Data Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Import data set
cleaned_filepath = 'SET data archive 3416 cases recd thru end of 03 2019.csv'

df = pd.read_csv(cleaned_filepath)
df.head()

Unnamed: 0,MERGE_DATE,archive_upload_2229,FULL_GRYD_ID_FB,Processing_status,Manual_notes_ETO,no_intake_question,no_retest1_question,no_retest2_question,FEEDBACK_STATUS,date_feedback_uploaded_ETO,...,othgpnorms_4bins,othgpcohe_8bins,othgpID_8bins,othgpemo_8bins,LN_CRIME,LN_VIOLENCE,SETDirections_ETO,SiteName_ETO,PROGRAM_NAME_ETO,enroll_ETO_num
0,2/8/2019,,77th 2-13-1494-1,2.0,"Missing response(s) S11a text, s11b, x10 and x15",,,,2.0,2/8/2019,...,1,3,5,5,0.0,0.0,If individual is still receiving services in 6...,1.0,2.0,1
1,1/16/2015,,RPARSA-2-64-1,9.0,,,,,0.0,,...,1,4,5,3,0.0,1.6094379124341,,,,1
2,4/28/2014,,RPARSA-2-77-1,9.0,,,,,0.0,,...,9,9,9,9,1.79175946922805,0.693147180559945,,1.0,,1
3,3/18/2019,,Northeast-25-68-2,1.0,,,,,2.0,3/18/2019,...,9,9,9,9,1.09861228866811,1.38629436111989,If individual is still receiving services in 6...,1.0,14.0,1
4,3/18/2019,,Northeast-25-69-2,1.0,,,,,2.0,3/18/2019,...,9,9,9,9,0.693147180559945,0.693147180559945,If individual is still receiving services in 6...,1.0,14.0,1


In [8]:
df['AGE_1_FB'].value_counts()

18     586
17     580
16     463
19     327
15     304
20     211
21     161
23     136
14     134
22     134
25     105
24     102
26      42
13      27
27      22
28      15
30      14
32      11
29       7
12       6
31       6
33       6
35       4
36       3
999      3
34       2
42       2
41       2
43       1
Name: AGE_1_FB, dtype: int64

In [281]:
df['no_fam_num'].value_counts()

1    2020
1    1350
0      23
0      18
        5
Name: no_fam_num, dtype: int64

In [282]:
df.shape

(3416, 432)

## Investigating Data

### Drop the wrong version 

In [283]:
df = df[df['SET_version_num'] == 3]
df['SET_version_num'].value_counts()

3    2800
Name: SET_version_num, dtype: int64

In [284]:
df.head()

Unnamed: 0,MERGE_DATE,archive_upload_2229,FULL_GRYD_ID_FB,Processing_status,Manual_notes_ETO,no_intake_question,no_retest1_question,no_retest2_question,FEEDBACK_STATUS,date_feedback_uploaded_ETO,...,othgpnorms_4bins,othgpcohe_8bins,othgpID_8bins,othgpemo_8bins,LN_CRIME,LN_VIOLENCE,SETDirections_ETO,SiteName_ETO,PROGRAM_NAME_ETO,enroll_ETO_num
0,2/8/2019,,77th 2-13-1494-1,2.0,"Missing response(s) S11a text, s11b, x10 and x15",,,,2.0,2/8/2019,...,1,3,5,5,0.0,0.0,If individual is still receiving services in 6...,1,2,1
3,3/18/2019,,Northeast-25-68-2,1.0,,,,,2.0,3/18/2019,...,9,9,9,9,1.09861228866811,1.38629436111989,If individual is still receiving services in 6...,1,14,1
4,3/18/2019,,Northeast-25-69-2,1.0,,,,,2.0,3/18/2019,...,9,9,9,9,0.693147180559945,0.693147180559945,If individual is still receiving services in 6...,1,14,1
5,3/18/2019,,Northeast-25-70-2,1.0,,,,,2.0,3/18/2019,...,9,9,9,9,1.09861228866811,0.693147180559945,If individual is still receiving services in 6...,1,14,1
6,4/14/2017,1.0,Olympic-12-39-2,3.0,ETO continues to show duplicate GRYD IDs - the...,2.0,,,2.0,6/8/2018,...,1,5,2,3,1.38629436111989,1.6094379124341,If individual is still receiving services in 6...,1,17,2


### Creating Unique ID

In [285]:
#showing how many ETO_IDs are missing

df['ETO_ID_FB'] = pd.to_numeric(df['ETO_ID_FB'], downcast = 'integer', errors = 'coerce')
df['ETO_ID_FB'].isnull().value_counts()

#None of them are missing

False    2800
Name: ETO_ID_FB, dtype: int64

In [286]:
# Change the column name to UniqueID

In [287]:
df.rename(columns={'ETO_ID_FB':'UniqueID'}, inplace = True)

### Column Processing_status

In [288]:
#Drop incomplete processing status
df = df[df['Processing_status'] < 2]

In [289]:
df.head()
df['Processing_status'].value_counts()

1.5    1646
1.0    1123
Name: Processing_status, dtype: int64

## Questionnaire part

### Family Section (clean data for no family people )

In [300]:
#list the questions about family
f_que = df.loc[:,'FS90_younger':'FS90_older_adults'].columns.tolist()
f_que.extend(df.loc[:,'FS90_younger':'FS90_older_adults'].columns.tolist())

#list the index for those who don't have family
df['no_fam_num'] = pd.to_numeric(df['no_fam_num'], downcast = 'integer', errors = 'coerce')
NoFamppl = df[df['no_fam_num'] == 0].index.tolist()
NoFamppl.extend(df[df['no_fam_num'].isnull()].index.tolist())
#NoFamppl.extend(df[df['no_fam_num'] == 'System'].index.tolist())
print(NoFamppl)

[54, 97, 99, 243, 604, 606, 607, 626, 938, 1357, 1452, 1454, 1469, 1499, 1503, 1512, 1515, 1523, 1537, 1782, 1796, 2032, 2033, 2217, 2309, 2313, 2327, 2413, 2769, 2812, 2881, 2903, 2909, 2946, 3124, 3230, 3231, 3233, 3234, 3328, 3329, 225, 1147, 1183, 1619, 1945]


In [309]:
df['no_group_num'].value_counts()

1.0    1711
0.0    1534
Name: no_group_num, dtype: int64

In [228]:
for col in f_que:
    df[col] = pd.to_numeric(df[col], downcast = 'integer', errors = 'coerce')

In [229]:
#change their resposnes from f1 to f3 to 0
for index in NoFamppl:
    for col in f_que:
        df.loc[index, col] = 0

In [230]:
f_que.extend(df.loc[:,'F4':'FS98'].columns.tolist())
f_que.extend(df.loc[:,'FS100':'F17x'].columns.tolist())
f_que.extend(df.loc[:,'F18':'F31b'].columns.tolist())

In [231]:
for col in f_que:
    df[col] = pd.to_numeric(df[col], downcast = 'integer', errors = 'coerce')

In [232]:
#change their resposnes from f4 to f31b to nan
for index in NoFamppl:
    for col in f_que:
        df.loc[index, col] = np.nan

### Group Section (clean data for no group people )

In [310]:
#list the index for those who don't have group

df['no_group_num'] = pd.to_numeric(df['no_group_num'], downcast = 'integer', errors = 'coerce')
NoGrppl.extend(df[df['no_group_num'] == 0].index.tolist())
NoGrppl.extend(df[df['no_group_num'].isnull()].index.tolist())

#list the group questions
g_que = df.loc[:,'G2':'G9'].columns.tolist()
g_que.extend(df.loc[:,'G10x':'G23x'].columns.tolist())
g_que.extend(df.loc[:,'G24':'G37'].columns.tolist())

In [311]:
df['no_group_num'].value_counts()

1.0    1711
0.0    1534
Name: no_group_num, dtype: int64

In [312]:
for col in g_que:
    df[col] = pd.to_numeric(df[col], downcast = 'integer', errors = 'coerce')

In [315]:
#change their resposnes from G2 to G37 to nan
for index in NoGrppl:
    for col in g_que:
        df.loc[index, col] = np.nan

### Getting rid of Nah

In [319]:
# S section
que = df.loc[:,'S1':'S3'].columns.tolist()
que.extend(df.loc[:,'S4x':'S6ax'].columns.tolist())
que.extend(df.loc[:,'S9':'S10'].columns.tolist())
que.extend(df.loc[:,'S10x':'S11a'].columns.tolist())
que.extend(df.loc[:,'S11b':'S16'].columns.tolist())

# ME,P,C section
que.extend(df.loc[:,'ME17':'C5b'].columns.tolist())

# FS section
que.extend(df.loc[:,'FS89_foster_now':'FS90_older_adults'].columns.tolist())

# F section
que.extend(f_que)

# G section
que.extend(g_que)

# X section
que.extend(df.loc[:,'X18':'X22'].columns.tolist())

In [320]:
print(que)

['S1', 'S2', 'S3', 'S4x', 'S5x', 'S7x', 'S6ax', 'S9', 'S10', 'S10x', 'S10z', 'S11a', 'S11b', 'S12', 'S13', 'S14', 'S16', 'ME17', 'ME18', 'ME19', 'ME20a', 'ME20b', 'ME20x', 'ME22_ER', 'P22', 'P23', 'P24', 'P25', 'P26', 'C1a', 'C1b', 'C2a', 'C2b', 'C3a', 'C3b', 'C4a', 'C4b', 'C5a', 'C5b', 'FS89_foster_now', 'FS89_foster_past', 'FS89_group_now', 'FS89_group_past', 'FS89_adopted_now', 'FS89_adopted_past', 'FS89_parents_now', 'FS89_parents_past', 'FS89_relatives_now', 'FS89_relatives_past', 'FS89_spouse_now', 'FS89_spouse_past', 'FS89_boygirlfriend_now', 'FS89_boygirlfriend_past', 'FS89_alone_now', 'FS89_alone_past', 'FS90_younger', 'FS90_teenagers', 'FS90_20_29_adults', 'FS90_30_49_adults', 'FS90_older_adults', 'FS90_younger', 'FS90_teenagers', 'FS90_20_29_adults', 'FS90_30_49_adults', 'FS90_older_adults', 'FS90_younger', 'FS90_teenagers', 'FS90_20_29_adults', 'FS90_30_49_adults', 'FS90_older_adults', 'G2', 'G3a', 'G3b', 'EG1', 'EG2_most', 'EG2_youngest', 'EG2_oldest', 'EG3', 'EG4x', 'G5',

In [321]:
for col in que:
    df[col].replace('System',np.NaN)  
    df[col] = pd.to_numeric(df[col], downcast = 'integer', errors = 'coerce')
    df.loc[(df[col] == 999), col] = np.nan
    df.loc[(df[col] == 888), col] = np.nan
    df.loc[(df[col] == 777), col] = np.nan
    df.loc[(df[col] == 555), col] = np.nan

In [187]:
df.loc[[54]][f_que]

Unnamed: 0,F1,F2,F3,IF_NO_FAMILY_TEXT,no_fam_num,F4,F5,FS91,FS92,FS93,...,F22x,F23x,F24x,F25x,F26,F27,F28x,F29,F31x,F31b
54,,,,,,,,,,,...,,,,,,,,,,


In [128]:
df['FS90_teenagers'].value_counts()

1.0     1088
2.0      737
0.0      417
3.0      219
4.0       62
5.0       29
6.0       11
7.0        9
10.0       9
18.0       3
9.0        2
14.0       2
8.0        2
12.0       2
50.0       1
17.0       1
4.5        1
13.0       1
19.0       1
16.0       1
15.0       1
11.0       1
36.0       1
Name: FS90_teenagers, dtype: int64