In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [2]:
import sys
sys.getdefaultencoding()

'utf-8'

Open the questionnaire file (the Word-document version of the questionnaire, saved as HTML):

In [3]:
# Read the whole document inside a context manager so the file handle is closed
# as soon as the markup is in memory (the bare open() was never closed).
# `questionnaire_file` now holds the markup text; BeautifulSoup accepts a string
# as well as an open file handle, so the parsing cell works unchanged.
with open("FF_m3_Questionnaire.htm", "r") as fh:
    questionnaire_file = fh.read()

In [4]:
soup=BeautifulSoup(questionnaire_file, "html.parser")

In [5]:
# Every <p> element of the questionnaire, in document order, as a plain list
ps = list(soup.find_all('p'))

In [6]:
ps[108:111]

[<p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             MATERNAL GRANDPARENT(S)...................... 2
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             PATERNAL GRANDPARENT(S)...................... 3
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             OTHER RELATIVE(S)......................................... 4
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> 

Open the Excel file listing all the m3 questions (`FFCm3.xlsx`), using the pandas library:

In [7]:
df=pd.read_excel("FFCm3.xlsx") #Excel sheet with one row per survey variable (position, name, varlab). NOTE(review): the original note said "p6", but the file and the variable names are m3 - confirm

In [8]:
df.head()

Unnamed: 0,position,name,varlab
0,3916,m3a2,How much of the time does the child live with ...
1,3917,m3a3,How many months ago did he/she stop living wit...
2,3918,m3a3a,Who does child usually live with?
3,3919,m3a3b,Are child's foster parents related to you?
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...


In [9]:
df.shape

(1035, 3)

In [10]:
# Splits a question stem into a leading "letters then digits" section and whatever follows it.
reg = re.compile(r'(?P<section>[A-Za-z]*\d*)(?P<rest>.*)')

def format_q_num(name, prefix='m3'):
    """Convert a variable name from the ``name`` column into the question
    number as it is written in the questionnaire text.

    Parameters
    ----------
    name : str
        Variable name, e.g. ``"m3a3b1"`` or ``"m3b5_1"``.
    prefix : str, optional
        Literal wave prefix to remove from the start of ``name`` (default ``"m3"``).

    Returns
    -------
    str
        Upper-cased question number: the prefix is removed and anything from the
        first underscore onwards is dropped, e.g. ``"m3a3b1" -> "A3B1"``,
        ``"m3b5_1" -> "B5"``.
    """
    # Remove the prefix as a whole. str.lstrip('m3') would instead strip every
    # leading 'm' or '3' character, turning e.g. "m3m1" into "1".
    if prefix and name.startswith(prefix):
        name = name[len(prefix):]
    stem = name.split('_')[0]
    # Every group in `reg` is optional, so search() always returns a match object.
    match = reg.search(stem)
    section = match.group('section').upper()
    rest = match.group('rest').upper()
    return section + rest

In [11]:
# Question number as printed in the questionnaire, derived from each variable name
df['question_number'] = df['name'].map(format_q_num)

In [12]:
# Flag column: 1 when the variable name contains an underscore, i.e. the question
# has several entries in the data frame; 0 otherwise.
# I think these are mostly (only?) the "select all that apply" questions
df['multiple_answers'] = [1 if "_" in var_name else 0 for var_name in df['name']]

In [13]:
df.head(20)

Unnamed: 0,position,name,varlab,question_number,multiple_answers
0,3916,m3a2,How much of the time does the child live with ...,A2,0
1,3917,m3a3,How many months ago did he/she stop living wit...,A3,0
2,3918,m3a3a,Who does child usually live with?,A3A,0
3,3919,m3a3b,Are child's foster parents related to you?,A3B,0
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...,A3B1,0
5,3921,m3a3c,About how many months has child been living th...,A3C,0
6,3922,m3a3d,Do you expect child to live with you again dur...,A3D,0
7,3923,m3a3e,About how many days did you see child in past ...,A3E,0
8,3924,m3a3f,Who does child live with when he/she is not li...,A3F,0
9,3925,m3a3g,How many days did child live with you out of t...,A3G,0


In [14]:
def find_question(q_number):
    """Locate the one paragraph in ``ps`` whose text starts with ``q_number`` + ".".

    Leading whitespace of each paragraph is ignored for the comparison.

    Returns
    -------
    tuple
        ``(text, index)`` where ``text`` is the paragraph text with newlines
        replaced by spaces and surrounding whitespace removed, and ``index`` is
        its position in ``ps``. When no paragraph, or more than one, starts with
        the question number, ``(np.nan, np.nan)`` is returned.
    """
    prefix = q_number + "."
    hits = []
    for position, paragraph in enumerate(ps):
        body = paragraph.get_text().lstrip()
        if body.startswith(prefix):
            hits.append((body, position))
    if len(hits) == 1:
        text, position = hits[0]
        return text.replace("\n", " ").strip(), position
    # Ambiguous or missing question numbers are reported as NaN for both fields.
    return np.nan, np.nan

In [15]:
def find_probe(q_number):
    """Return the text of the parent ("probe") question for ``q_number``.

    The parent key is the leading run of at most one capital letter followed by
    at most two digits (e.g. ``"A3B1" -> "A3"``). It is looked up with
    ``find_question`` and only the text element of that result is returned, so
    the value is NaN whenever ``find_question`` cannot locate the key uniquely.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    # (a SyntaxWarning on Python 3.12+). Every part of the pattern is optional,
    # so match() always succeeds at position 0 - possibly with an empty string.
    probe = re.match(r'[A-Z]?\d{0,2}', q_number).group(0)
    return find_question(probe)[0]

In [16]:
# Fill the 'probe' column with the text of each question's parent question.
# The prefix pattern (optional capital letter + up to two digits) can match the
# empty string, so a bare re.match() test is always true and the "None"
# fallback could never run; test for a non-empty prefix instead.
_probe_prefix = re.compile(r'[A-Z]?\d{0,2}')
df['probe'] = [
    find_probe(q_number) if _probe_prefix.match(q_number).group(0) else "None"
    for q_number in df['question_number']
]

In [17]:
# Look up every question number once, then unzip the (text, index) pairs into two columns
located_questions = df['question_number'].map(find_question)
ps_text_column, ps_idx_column = zip(*located_questions)
df['ps_text'] = ps_text_column
df['ps_idx'] = ps_idx_column

In [18]:
df

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe,ps_text,ps_idx
0,3916,m3a2,How much of the time does the child live with ...,A2,0,A2. How much of the time does (CHILD...,A2. How much of the time does (CHILD...,79.0
1,3917,m3a3,How many months ago did he/she stop living wit...,A3,0,A3. How many months ago did (he/she)...,A3. How many months ago did (he/she)...,95.0
2,3918,m3a3a,Who does child usually live with?,A3A,0,A3. How many months ago did (he/she)...,A3A. Who does (CHILD) (usually) live with?,105.0
3,3919,m3a3b,Are child's foster parents related to you?,A3B,0,A3. How many months ago did (he/she)...,A3B. Are (CHILD's) foster parents relat...,119.0
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...,A3B1,0,A3. How many months ago did (he/she)...,A3B1. Does (PERSON IN A3A) receive any ki...,128.0
5,3921,m3a3c,About how many months has child been living th...,A3C,0,A3. How many months ago did (he/she)...,A3C. About how many months has (CHILD) ...,136.0
6,3922,m3a3d,Do you expect child to live with you again dur...,A3D,0,A3. How many months ago did (he/she)...,A3D. Do you expect (CHILD) to live with...,143.0
7,3923,m3a3e,About how many days did you see child in past ...,A3E,0,A3. How many months ago did (he/she)...,A3E. About how many days did you see (C...,149.0
8,3924,m3a3f,Who does child live with when he/she is not li...,A3F,0,A3. How many months ago did (he/she)...,A3F. Who does (CHILD) live with when (h...,156.0
9,3925,m3a3g,How many days did child live with you out of t...,A3G,0,A3. How many months ago did (he/she)...,A3G. How many days did (CHILD) live wit...,166.0


In [19]:
np.isnan(df.ps_idx).sum() #number of questions where the question number doesn't appear or appears more than once

338

In [20]:
print(df.ps_text[11])
print(df.ps_idx[11])

A4A.        Would you say you are romantically involved on a steady basis, or are you in an on-again - off-again relationship?
190.0


In [21]:
# For each located question, find the paragraph index at which the next located
# question starts, so the paragraphs in between can be treated as that question's
# block. A plain shift would not work: some questions occupy several rows and
# others were not located (NaN). len(ps) is appended as the end marker for the
# last question.
located_idx = np.asarray(df['ps_idx'])
all_idx = np.append(located_idx[np.logical_not(np.isnan(located_idx))], [len(ps)])
df['ps_idx_next'] = [
    np.nan if np.isnan(idx) else all_idx[all_idx > idx].min()
    for idx in df['ps_idx']
]

In [22]:
a3 = ps[107:115]

In [23]:
a3

[<p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             BIOLOGICAL FATHER....................................... 1<b>
 </b></span><span style="font-family:Wingdings;color:black;layout-grid-mode:
 line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3C</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             MATERNAL GRANDPARENT(S)...................... 2
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             PATERNAL GRANDPARENT(S)...................... 3
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:l

In [24]:
for p in a3:
    print(p)

<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             BIOLOGICAL FATHER....................................... 1<b>
</b></span><span style="font-family:Wingdings;color:black;layout-grid-mode:
line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3C</b></span></p>
<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             MATERNAL GRANDPARENT(S)...................... 2
</span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>
<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             PATERNAL GRANDPARENT(S)...................... 3
</span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO 

In [25]:
# Matches a run of three or more literal periods. The cells below use it to pick
# out the paragraphs whose text contains such a run (the answer-option lines).
consequitivedots = re.compile(r'\.{3,}')

In [26]:
# Print only the paragraphs of the sample in which some text node contains a dot run
for para in filter(lambda node: node.find_all(string = consequitivedots), a3):
    print(para)

<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             BIOLOGICAL FATHER....................................... 1<b>
</b></span><span style="font-family:Wingdings;color:black;layout-grid-mode:
line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3C</b></span></p>
<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             MATERNAL GRANDPARENT(S)...................... 2
</span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>
<p class="MsoNormal" style="line-height:150%"><span style="color:black;
layout-grid-mode:line">                             PATERNAL GRANDPARENT(S)...................... 3
</span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO 

In [27]:
answers = [p for p in a3 if len(p.find_all(string = consequitivedots))>0]

In [28]:
answers

[<p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             BIOLOGICAL FATHER....................................... 1<b>
 </b></span><span style="font-family:Wingdings;color:black;layout-grid-mode:
 line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3C</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             MATERNAL GRANDPARENT(S)...................... 2
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:line"> <b>GO TO A3B1</b></span></p>,
 <p class="MsoNormal" style="line-height:150%"><span style="color:black;
 layout-grid-mode:line">                             PATERNAL GRANDPARENT(S)...................... 3
 </span><span style="font-family:Wingdings;color:black;layout-grid-mode:line">è</span><span style="color:black;layout-grid-mode:l

In [29]:
print(answers[0].prettify())

<p class="MsoNormal" style="line-height:150%">
 <span style="color:black;
layout-grid-mode:line">
  BIOLOGICAL FATHER....................................... 1
  <b>
  </b>
 </span>
 <span style="font-family:Wingdings;color:black;layout-grid-mode:
line">
  è
 </span>
 <span style="color:black;layout-grid-mode:line">
  <b>
   GO TO A3C
  </b>
 </span>
</p>



In [30]:
# Keep the paragraphs that contain a dot run, then show the text before the first
# period and the text after the last period of each one
answers = [para for para in a3 if para.find_all(string = consequitivedots)]
for answer in answers:
    pieces = answer.get_text().split(".")
    print(pieces[0])
    print(pieces[-1])

                             BIOLOGICAL FATHER
 1
è GO TO A3C
                             MATERNAL GRANDPARENT(S)
 2
è GO TO A3B1
                             PATERNAL GRANDPARENT(S)
 3
è GO TO A3B1
                             OTHER RELATIVE(S)
 4
è GO TO A3B1
                             FRIEND
 5
è GO TO A3B1
                             FOSTER CARE
 6
                             ADOPTIVE
PARENT
 7 è THANK MOTHER AND


In [31]:
def clean_text(string):
    """Replace newlines with spaces and trim surrounding whitespace."""
    return string.replace("\n"," ").strip()

def get_answers(start_idx,stop_idx):
    """Collect the answer options found in ``ps[start_idx:stop_idx]``.

    A paragraph counts as an answer option when one of its text nodes contains a
    run of three or more periods (``consequitivedots``). The text before the
    first such run is the label; the text after the last run is the value,
    including any skip instruction that follows it.

    Parameters
    ----------
    start_idx, stop_idx : float
        Paragraph indices delimiting the question; either may be NaN.

    Returns
    -------
    tuple
        ``(answers, n)`` where ``answers`` is a list of ``(label, value)`` string
        tuples and ``n`` is its length. ``([], 0)`` when either index is NaN.
    """
    if np.isnan(start_idx) or np.isnan(stop_idx):
        return ([],0)
    answers = []
    for p in ps[int(start_idx):int(stop_idx)]:
        if not p.find_all(string = consequitivedots):
            continue
        # Split on the dot leader rather than on every '.', so that a period
        # inside the label or value ("e.g.", "1.5", a trailing full stop) no
        # longer truncates the result.
        pieces = consequitivedots.split(p.get_text())
        answers.append((clean_text(pieces[0]), clean_text(pieces[-1])))
    return (answers,len(answers))

In [32]:
# Extract the answer options between each question's paragraph and the next
# question's paragraph, then unzip into the list column and the count column
answer_pairs = [
    get_answers(start, stop)
    for start, stop in zip(df['ps_idx'], df['ps_idx_next'])
]
df['answers'], df['n_answers'] = zip(*answer_pairs)

In [33]:
# Rewrite the Wingdings arrow glyph in every answer value as ' --> ' (skip
# pattern), store the rewritten answers back, and spread them over paired
# val<k> / lab<k> columns (value first, label second). Rows with fewer than
# k+1 answers get NaN in those columns.
arrow_answers = [
    [(label, value.replace('è', ' --> ')) for label, value in row_answers]
    for row_answers in df['answers']
]
df['answers'] = arrow_answers

widest = max((len(row_answers) for row_answers in arrow_answers), default=0)
for k in range(widest):
    df['val{0}'.format(k)] = [
        row_answers[k][1] if k < len(row_answers) else np.nan
        for row_answers in arrow_answers
    ]
    df['lab{0}'.format(k)] = [
        row_answers[k][0] if k < len(row_answers) else np.nan
        for row_answers in arrow_answers
    ]

In [34]:
df.head(10)

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe,ps_text,ps_idx,ps_idx_next,answers,...,val122,lab122,val123,lab123,val124,lab124,val125,lab125,val126,lab126
0,3916,m3a2,How much of the time does the child live with ...,A2,0,A2. How much of the time does (CHILD...,A2. How much of the time does (CHILD...,79.0,95.0,"[(All or most of the time,, 1 --> GO TO A4),...",...,,,,,,,,,,
1,3917,m3a3,How many months ago did he/she stop living wit...,A3,0,A3. How many months ago did (he/she)...,A3. How many months ago did (he/she)...,95.0,105.0,"[(THE TIME, -10)]",...,,,,,,,,,,
2,3918,m3a3a,Who does child usually live with?,A3A,0,A3. How many months ago did (he/she)...,A3A. Who does (CHILD) (usually) live with?,105.0,119.0,"[(BIOLOGICAL FATHER, 1 --> GO TO A3C), (MATE...",...,,,,,,,,,,
3,3919,m3a3b,Are child's foster parents related to you?,A3B,0,A3. How many months ago did (he/she)...,A3B. Are (CHILD's) foster parents relat...,119.0,128.0,"[(YES, 1), (NO, 2)]",...,,,,,,,,,,
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...,A3B1,0,A3. How many months ago did (he/she)...,A3B1. Does (PERSON IN A3A) receive any ki...,128.0,136.0,"[(YES, 1), (NO, 2), (DON’T KNOW, -2), (REFUSED...",...,,,,,,,,,,
5,3921,m3a3c,About how many months has child been living th...,A3C,0,A3. How many months ago did (he/she)...,A3C. About how many months has (CHILD) ...,136.0,143.0,"[(LESS THAN ONE MONTH, 0)]",...,,,,,,,,,,
6,3922,m3a3d,Do you expect child to live with you again dur...,A3D,0,A3. How many months ago did (he/she)...,A3D. Do you expect (CHILD) to live with...,143.0,149.0,"[(YES, 1), (NO, 2)]",...,,,,,,,,,,
7,3923,m3a3e,About how many days did you see child in past ...,A3E,0,A3. How many months ago did (he/she)...,A3E. About how many days did you see (C...,149.0,156.0,"[(NONE, 0)]",...,,,,,,,,,,
8,3924,m3a3f,Who does child live with when he/she is not li...,A3F,0,A3. How many months ago did (he/she)...,A3F. Who does (CHILD) live with when (h...,156.0,166.0,"[(BIOLOGICAL FATHER, 1), (MATERNAL GRANDPARENT...",...,,,,,,,,,,
9,3925,m3a3g,How many days did child live with you out of t...,A3G,0,A3. How many months ago did (he/she)...,A3G. How many days did (CHILD) live wit...,166.0,175.0,[],...,,,,,,,,,,


In [35]:
df.probe

0       A2.           How much of the time does (CHILD...
1       A3.           How many months ago did (he/she)...
2       A3.           How many months ago did (he/she)...
3       A3.           How many months ago did (he/she)...
4       A3.           How many months ago did (he/she)...
5       A3.           How many months ago did (he/she)...
6       A3.           How many months ago did (he/she)...
7       A3.           How many months ago did (he/she)...
8       A3.           How many months ago did (he/she)...
9       A3.           How many months ago did (he/she)...
10      A4.           Next, I have a few questions abo...
11      A4.           Next, I have a few questions abo...
12      A4.           Next, I have a few questions abo...
13      A4.           Next, I have a few questions abo...
14      A4.           Next, I have a few questions abo...
15      A4.           Next, I have a few questions abo...
16      A4.           Next, I have a few questions abo...
17      A4.   

In [36]:
df.columns.get_loc('val0')

11

In [37]:
# For variables flagged as multiple-answer: move the question text into 'probe'
# and replace 'ps_text' with the label of the answer whose value equals the
# numeric suffix after the first underscore in the variable name (empty string
# when there is no such answer).
for row in range(len(df.ps_text)):
    if df.multiple_answers[row] != 1:
        continue
    df.loc[row,'probe'] = df.loc[row,'ps_text']
    # Everything from 'val0' onwards: val0, lab0, val1, lab1, ... so the label
    # sits directly after its value in this list.
    row_values = df.loc[row,'val0':].tolist()
    suffix = df.name[row].lstrip('m3').split('_')[1]
    label = ''
    if re.match('[0-9]{1,3}',suffix) and suffix in row_values:
        label = row_values[row_values.index(suffix) + 1]
    df.loc[row,'ps_text'] = label

In [38]:
print(df.ps_text[11])

A4A.        Would you say you are romantically involved on a steady basis, or are you in an on-again - off-again relationship?


In [39]:
df.answers[11]

[('STEADY', '1'), ('ON-AGAIN, OFF-AGAIN', '2')]

In [40]:
df.head(20)

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe,ps_text,ps_idx,ps_idx_next,answers,...,val122,lab122,val123,lab123,val124,lab124,val125,lab125,val126,lab126
0,3916,m3a2,How much of the time does the child live with ...,A2,0,A2. How much of the time does (CHILD...,A2. How much of the time does (CHILD...,79.0,95.0,"[(All or most of the time,, 1 --> GO TO A4),...",...,,,,,,,,,,
1,3917,m3a3,How many months ago did he/she stop living wit...,A3,0,A3. How many months ago did (he/she)...,A3. How many months ago did (he/she)...,95.0,105.0,"[(THE TIME, -10)]",...,,,,,,,,,,
2,3918,m3a3a,Who does child usually live with?,A3A,0,A3. How many months ago did (he/she)...,A3A. Who does (CHILD) (usually) live with?,105.0,119.0,"[(BIOLOGICAL FATHER, 1 --> GO TO A3C), (MATE...",...,,,,,,,,,,
3,3919,m3a3b,Are child's foster parents related to you?,A3B,0,A3. How many months ago did (he/she)...,A3B. Are (CHILD's) foster parents relat...,119.0,128.0,"[(YES, 1), (NO, 2)]",...,,,,,,,,,,
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...,A3B1,0,A3. How many months ago did (he/she)...,A3B1. Does (PERSON IN A3A) receive any ki...,128.0,136.0,"[(YES, 1), (NO, 2), (DON’T KNOW, -2), (REFUSED...",...,,,,,,,,,,
5,3921,m3a3c,About how many months has child been living th...,A3C,0,A3. How many months ago did (he/she)...,A3C. About how many months has (CHILD) ...,136.0,143.0,"[(LESS THAN ONE MONTH, 0)]",...,,,,,,,,,,
6,3922,m3a3d,Do you expect child to live with you again dur...,A3D,0,A3. How many months ago did (he/she)...,A3D. Do you expect (CHILD) to live with...,143.0,149.0,"[(YES, 1), (NO, 2)]",...,,,,,,,,,,
7,3923,m3a3e,About how many days did you see child in past ...,A3E,0,A3. How many months ago did (he/she)...,A3E. About how many days did you see (C...,149.0,156.0,"[(NONE, 0)]",...,,,,,,,,,,
8,3924,m3a3f,Who does child live with when he/she is not li...,A3F,0,A3. How many months ago did (he/she)...,A3F. Who does (CHILD) live with when (h...,156.0,166.0,"[(BIOLOGICAL FATHER, 1), (MATERNAL GRANDPARENT...",...,,,,,,,,,,
9,3925,m3a3g,How many days did child live with you out of t...,A3G,0,A3. How many months ago did (he/she)...,A3G. How many days did (CHILD) live wit...,166.0,175.0,[],...,,,,,,,,,,


In [41]:
# Remove the intermediate helper columns before export
helper_columns = ['answers','n_answers','multiple_answers','question_number','ps_idx','ps_idx_next']
df = df.drop(helper_columns, axis=1)

In [42]:
df.head(20)

Unnamed: 0,position,name,varlab,probe,ps_text,val0,lab0,val1,lab1,val2,...,val122,lab122,val123,lab123,val124,lab124,val125,lab125,val126,lab126
0,3916,m3a2,How much of the time does the child live with ...,A2. How much of the time does (CHILD...,A2. How much of the time does (CHILD...,1 --> GO TO A4,"All or most of the time,",2 --> GO TO A3F,"About half of the time,",3 --> GO TO A3,...,,,,,,,,,,
1,3917,m3a3,How many months ago did he/she stop living wit...,A3. How many months ago did (he/she)...,A3. How many months ago did (he/she)...,-10,THE TIME,,,,...,,,,,,,,,,
2,3918,m3a3a,Who does child usually live with?,A3. How many months ago did (he/she)...,A3A. Who does (CHILD) (usually) live with?,1 --> GO TO A3C,BIOLOGICAL FATHER,2 --> GO TO A3B1,MATERNAL GRANDPARENT(S),3 --> GO TO A3B1,...,,,,,,,,,,
3,3919,m3a3b,Are child's foster parents related to you?,A3. How many months ago did (he/she)...,A3B. Are (CHILD's) foster parents relat...,1,YES,2,NO,,...,,,,,,,,,,
4,3920,m3a3b1,Does (person in A3A) receive any payment for t...,A3. How many months ago did (he/she)...,A3B1. Does (PERSON IN A3A) receive any ki...,1,YES,2,NO,-2,...,,,,,,,,,,
5,3921,m3a3c,About how many months has child been living th...,A3. How many months ago did (he/she)...,A3C. About how many months has (CHILD) ...,0,LESS THAN ONE MONTH,,,,...,,,,,,,,,,
6,3922,m3a3d,Do you expect child to live with you again dur...,A3. How many months ago did (he/she)...,A3D. Do you expect (CHILD) to live with...,1,YES,2,NO,,...,,,,,,,,,,
7,3923,m3a3e,About how many days did you see child in past ...,A3. How many months ago did (he/she)...,A3E. About how many days did you see (C...,0,NONE,,,,...,,,,,,,,,,
8,3924,m3a3f,Who does child live with when he/she is not li...,A3. How many months ago did (he/she)...,A3F. Who does (CHILD) live with when (h...,1,BIOLOGICAL FATHER,2,MATERNAL GRANDPARENT(S),3,...,,,,,,,,,,
9,3925,m3a3g,How many days did child live with you out of t...,A3. How many months ago did (he/she)...,A3G. How many days did (CHILD) live wit...,,,,,,...,,,,,,,,,,


In [43]:
df.to_csv(path_or_buf = "m3Qtext.csv")