In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [2]:
import sys
sys.getdefaultencoding()

'utf-8'

Open the questionnaire file (word document version of the questionnaire saved as html):

In [3]:
questionnaire_file = open("FF_f2_Questionnaire.htm", "r") 

In [4]:
soup=BeautifulSoup(questionnaire_file, "html.parser")

In [5]:
ps=[p for p in soup.find_all('p')]

Opening the excel file with all the questions for p6, using the pandas python library:

In [6]:
df=pd.read_excel("FFCf2.xlsx") #this is an excel file with all the questions for p6

In [7]:
df.head()

Unnamed: 0,position,name,varlab
0,863,f2a3,How much of the time does the child live with ...
1,864,f2a3b,Have you ever lived with child?
2,865,f2a3c,"Since child's birth, has he/she ever stayed ov..."
3,866,f2a3d,How many nights has child spent with you since...
4,867,f2a4,How many months ago did (he/she) stop living w...


In [8]:
df.shape

(796, 3)

In [9]:
reg = re.compile(r'(?P<section>[A-Za-z]*\d*)(?P<rest>.*)')

def format_q_num(name):
    """takes in the question numbers in the name column and outputs them in the format
    they appear in the text"""
    name=name.replace('f2','',1)
    name=reg.search(name)
    section=name.group('section').upper()
    rest=name.group('rest').upper()
    return section+rest

In [10]:
df['question_number']=df.name.apply(lambda x: format_q_num(x))

In [11]:
df['multiple_answers']=df.question_number.apply(lambda x: 1 if re.match('[A-Z]\d*[A-Z]\d*[A-Z]',x) else 0) 
#adds a column where 1 indicates that it is one of the questions with multiple entries in the data frame
#I think these are mostly (only?) the "select all that apply" questions

In [12]:
df[60:70]

Unnamed: 0,position,name,varlab,question_number,multiple_answers
60,923,f2b5b1,How much did child weigh on that day?-(Pounds),B5B1,0
61,924,f2b5b2,How much did child weigh on that day?-(Ounces),B5B2,0
62,925,f2b6,# of times since birth child been to health ca...,B6,0
63,926,f2b7,How many times since birth has child been to h...,B7,0
64,927,fx2b7,TCO-How many times has child beem seen by a he...,FX2B7,1
65,928,f2b7a,How many times since birth has child been to h...,B7A,0
66,929,f2b8,How many times since birth has child been to e...,B8,0
67,930,f2b8a,How many of the visits to emergency room for a...,B8A,0
68,931,f2b9,"Since leaving the hospital at birth, has child...",B9,0
69,932,f2b10,"Since child was born, how many times have they...",B10,0


In [13]:
def find_question(q_number):
    """finds where the question number appears in the text - only if it only appears once"""
    ps_text = []
    ps_idx = []
    ps_startswith = []
    for idx, p in enumerate(ps):
        if q_number+"." in p.get_text():
            if p.get_text().lstrip().startswith(q_number+"."):
                ps_text.append(p.get_text().lstrip())
                ps_idx.append(idx)
            else:
                start = p.get_text().find(q_number+'.')
                ps_text.append(p.get_text().lstrip()[start:])
                ps_idx.append(idx)
    if len(ps_idx)!=1:
        return np.nan, np.nan
    else:
        return ps_text[0].replace("\n"," ").strip(), ps_idx[0]

In [14]:
b6 = ps[541:550]
a3c = ps[149:153]
a3b = ps[136:140]

In [15]:
find_question('A3C')

('Since (CHILD’s) birth, has (he/she) ever stayed overnight with you?', 148)

In [16]:
find_question('B6')

('Since (CHILD) was born, approximately how many times has (he/she) been seen by a doctor, nurse, or other health care professional for a regular checkup or “well‑baby visit”?\xa0 Would you say . . .',
 541)

In [17]:
b6[0].get_text().lstrip()

'NMIHS\n\n\n\n\n\n\xa0\n\n\nB6.\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Since (CHILD)\nwas born, approximately how many times has (he/she) been seen by a doctor,\nnurse, or other health care professional for a regular checkup or “well‑baby visit”?\xa0 Would you say . . . '

In [18]:
a3b[0].get_text().lstrip().startswith('A3B')

True

In [19]:
def find_probe(q_number):
    probe = re.findall('[A-Z]{,1}\d{,2}', q_number)[0]
    return find_question(probe)[0]

In [20]:
find_probe('B6')

'Since (CHILD) was born, approximately how many times has (he/she) been seen by a doctor, nurse, or other health care professional for a regular checkup or “well‑baby visit”?\xa0 Would you say . . .'

In [21]:
df

Unnamed: 0,position,name,varlab,question_number,multiple_answers
0,863,f2a3,How much of the time does the child live with ...,A3,0
1,864,f2a3b,Have you ever lived with child?,A3B,0
2,865,f2a3c,"Since child's birth, has he/she ever stayed ov...",A3C,0
3,866,f2a3d,How many nights has child spent with you since...,A3D,0
4,867,f2a4,How many months ago did (he/she) stop living w...,A4,0
5,868,f2a4a,Who does child (usually) live with?,A4A,0
6,869,f2a4b,About how long has the child lived there? (mo...,A4B,0
7,870,f2a4c,Do you expect the child to live with you (agai...,A4C,0
8,871,f2a4d,How many days did you see the child in the pas...,A4D,0
9,872,f2a5,What was your relationship with mother when ch...,A5,0


In [22]:
for i in range(len(df.question_number)):
    q_number = df.loc[i,'question_number']
    if re.match('^[A-Z]{,1}\d{,2}', q_number):
        df.loc[i,'probe'] = find_probe(q_number)
    else:
        df.loc[i,'probe'] = "None"

In [23]:
df

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe
0,863,f2a3,How much of the time does the child live with ...,A3,0,A3. How much of the time does (CHILD...
1,864,f2a3b,Have you ever lived with child?,A3B,0,A3. How much of the time does (CHILD...
2,865,f2a3c,"Since child's birth, has he/she ever stayed ov...",A3C,0,A3. How much of the time does (CHILD...
3,866,f2a3d,How many nights has child spent with you since...,A3D,0,A3. How much of the time does (CHILD...
4,867,f2a4,How many months ago did (he/she) stop living w...,A4,0,A4. How many months ago did (he/she)...
5,868,f2a4a,Who does child (usually) live with?,A4A,0,A4. How many months ago did (he/she)...
6,869,f2a4b,About how long has the child lived there? (mo...,A4B,0,A4. How many months ago did (he/she)...
7,870,f2a4c,Do you expect the child to live with you (agai...,A4C,0,A4. How many months ago did (he/she)...
8,871,f2a4d,How many days did you see the child in the pas...,A4D,0,A4. How many months ago did (he/she)...
9,872,f2a5,What was your relationship with mother when ch...,A5,0,"A5. Next, I have a few questions abo..."


In [24]:
df['ps_text'], df['ps_idx'] = zip(*df['question_number'].map(find_question))

In [25]:
df

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe,ps_text,ps_idx
0,863,f2a3,How much of the time does the child live with ...,A3,0,A3. How much of the time does (CHILD...,A3. How much of the time does (CHILD...,123.0
1,864,f2a3b,Have you ever lived with child?,A3B,0,A3. How much of the time does (CHILD...,A3B. Have you ever lived with (CHILD)?,136.0
2,865,f2a3c,"Since child's birth, has he/she ever stayed ov...",A3C,0,A3. How much of the time does (CHILD...,"Since (CHILD’s) birth, has (he/she) ever staye...",148.0
3,866,f2a3d,How many nights has child spent with you since...,A3D,0,A3. How much of the time does (CHILD...,How many nights altogether has (CHILD) spent w...,156.0
4,867,f2a4,How many months ago did (he/she) stop living w...,A4,0,A4. How many months ago did (he/she)...,A4. How many months ago did (he/she)...,163.0
5,868,f2a4a,Who does child (usually) live with?,A4A,0,A4. How many months ago did (he/she)...,A4A. Who does (CHILD) (usually) live with?,174.0
6,869,f2a4b,About how long has the child lived there? (mo...,A4B,0,A4. How many months ago did (he/she)...,A4B. About how long has (CHILD) been li...,189.0
7,870,f2a4c,Do you expect the child to live with you (agai...,A4C,0,A4. How many months ago did (he/she)...,A4C. Do you expect (CHILD) to live with...,198.0
8,871,f2a4d,How many days did you see the child in the pas...,A4D,0,A4. How many months ago did (he/she)...,A4D. About how many days did you see (C...,204.0
9,872,f2a5,What was your relationship with mother when ch...,A5,0,"A5. Next, I have a few questions abo...","A5. Next, I have a few questions abo...",210.0


In [26]:
np.isnan(df.ps_idx).sum() #number of questions where the question number doesn't appear or appears more than once

341

In [27]:
#get index of <p> tag for the next question so we can split the questions (can't just shift because some questions have more than one answer or missing values)
all_idx=np.array(df.ps_idx)
all_idx=all_idx[~np.isnan(all_idx)]
all_idx=np.append(all_idx, [len(ps)])
df['ps_idx_next']=df.ps_idx.apply(lambda x: all_idx[all_idx > x].min() if ~np.isnan(x) else np.nan)

In [28]:
consequitivedots = re.compile(r'\.{3,}')

In [29]:
def clean_text(string):
    return string.replace("\n"," ").strip()

def get_pairs(p):
    if re.match('^[a-z]{1}', p.get_text().split('..')[0]):
        if re.match('[()]{1}', p.get_text().split('..')[-1]):
            pairs = [p.find_previous('p').get_text() + ' ' + p.get_text().split('..')[0], p.get_text().split("(")[0].split('..')[-1]]
        else:
            pairs = [p.find_previous('p').get_text() + ' ' + p.get_text().split('..')[0], p.get_text().split('..')[-1]]
    else:
        if re.match('[()]{1}', p.get_text().split('.')[-1]):
            pairs = [p.get_text().split('..')[0], p.get_text().split("(")[0].split('..')[-1] ]
        else:
            pairs = [p.get_text().split('..')[0], p.get_text().split('..')[-1]]
    pairs = [clean_text(text) for text in pairs]
    return pairs
        
def get_answers(start_idx,stop_idx):
    if np.isnan(start_idx) or np.isnan(stop_idx):
        return ([],0)
    else:
        answers = [p for p in ps[int(start_idx):int(stop_idx)] if len(p.find_all(string = consequitivedots))>0]
        answers = [get_pairs(a) for a in answers]
        return(answers, len(answers)) 
        

In [30]:
df['answers'], df['n_answers'] = zip(*map(get_answers, df['ps_idx'], df['ps_idx_next']))

In [31]:
(df['answers'][20])

[['YES', '. 1'], ['NO', '. 2 è GO TO A7C']]

In [32]:
x = 0
while x < len(df.answers):
    ans = df.answers[x]
    for i in range(len(ans)): #for each answer choice,
        s = ans[i][1]        # store number string
        a = re.sub('è',' --> ', s) # sub the arrow in for skip pattern
        t = (ans[i][0],a) # create a new tuple with arrow
        ans[i] = t
    for y in range(len(ans)):
        df.loc[x,'val{0}'.format(y)] = ans[y][1] #create new 'val(y)' column
        df.loc[x,'lab{0}'.format(y)] = ans[y][0] #create new 'lab(y)' column
        
    x += 1

In [33]:
def clean_answers(col):
    col.str.replace('.','',1)

In [34]:
values = []
for i in range(0,18):
    values = values + ['val{0}'.format(i)]
    i += 1
for v in values:
    df[v] = df[v].map(lambda x: str(x))
    df[v] = df[v].map(lambda x: x.lstrip('.').strip())

In [35]:
df['val0']

0       1  -->  GO TO A5
1       1  -->  GO TO A4
2                      1
3                    nan
4                    -10
5                      1
6                      0
7                      1
8                    nan
9                      1
10                     1
11                     1
12                     1
13     1  -->  GO TO A7A
14                     0
15                   nan
16                     1
17                     1
18                   nan
19                   nan
20                     1
21                   nan
22                   nan
23                     1
24                   nan
25                   nan
26                     1
27                     1
28                     1
29                   nan
             ...        
766                  nan
767                  nan
768                  nan
769                  nan
770                  nan
771                  nan
772                   -2
773                    1
774                    1


In [36]:
i = 0
while i < len(df.ps_text):
    if df.multiple_answers[i] == 1:
        df.loc[i,'val0'] = '1'
        df.loc[i,'lab0'] = 'YES'
        df.loc[i,'val1'] = '2'
        df.loc[i,'lab1'] = 'NO'
    i += 1

In [37]:
df.drop(['answers','n_answers','question_number','ps_idx','ps_idx_next'],axis=1,inplace=True)

In [38]:
df.head(20)

Unnamed: 0,position,name,varlab,multiple_answers,probe,ps_text,val0,lab0,val1,lab1,...,val90,lab90,val91,lab91,val92,lab92,val93,lab93,val94,lab94
0,863,f2a3,How much of the time does the child live with ...,0,A3. How much of the time does (CHILD...,A3. How much of the time does (CHILD...,1 --> GO TO A5,"All or most of the time,",2 --> GO TO A5,"About half of the time,",...,,,,,,,,,,
1,864,f2a3b,Have you ever lived with child?,0,A3. How much of the time does (CHILD...,A3B. Have you ever lived with (CHILD)?,1 --> GO TO A4,YES,2,NO,...,,,,,,,,,,
2,865,f2a3c,"Since child's birth, has he/she ever stayed ov...",0,A3. How much of the time does (CHILD...,"Since (CHILD’s) birth, has (he/she) ever staye...",1,YES,2 --> GO TO A4A,NO,...,,,,,,,,,,
3,866,f2a3d,How many nights has child spent with you since...,0,A3. How much of the time does (CHILD...,How many nights altogether has (CHILD) spent w...,,,,,...,,,,,,,,,,
4,867,f2a4,How many months ago did (he/she) stop living w...,0,A4. How many months ago did (he/she)...,A4. How many months ago did (he/she)...,-10,(CHILD) MOST OR ALL OF THE TIME,,,...,,,,,,,,,,
5,868,f2a4a,Who does child (usually) live with?,0,A4. How many months ago did (he/she)...,A4A. Who does (CHILD) (usually) live with?,1,BIOLOGICAL MOTHER,2,MATERNAL GRANDPARENT(S),...,,,,,,,,,,
6,869,f2a4b,About how long has the child lived there? (mo...,0,A4. How many months ago did (he/she)...,A4B. About how long has (CHILD) been li...,0,LESS THAN ONE MONTH,,,...,,,,,,,,,,
7,870,f2a4c,Do you expect the child to live with you (agai...,0,A4. How many months ago did (he/she)...,A4C. Do you expect (CHILD) to live with...,1,YES,2,NO,...,,,,,,,,,,
8,871,f2a4d,How many days did you see the child in the pas...,0,A4. How many months ago did (he/she)...,A4D. About how many days did you see (C...,,,,,...,,,,,,,,,,
9,872,f2a5,What was your relationship with mother when ch...,0,"A5. Next, I have a few questions abo...","A5. Next, I have a few questions abo...",1,"Married,",2,"Romantically involved,",...,,,,,,,,,,


In [39]:
df.to_csv(path_or_buf = "f2Qtext.csv")