In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [2]:
import sys
sys.getdefaultencoding()

'utf-8'

Open the questionnaire file (word document version of the questionnaire saved as html):

In [3]:
questionnaire_file = open("FF_m2_Questionnaire.htm", "r") 

In [4]:
soup=BeautifulSoup(questionnaire_file, "html.parser")

In [5]:
ps=[p for p in soup.find_all('p')]

Opening the excel file with all the questions for p6, using the pandas python library:

In [6]:
df=pd.read_excel("FFCm2.xlsx") #this is an excel file with all the questions for p6

In [7]:
df.head()

Unnamed: 0,position,name,varlab
0,1842,m2a3,How much of the time does the child live with ...
1,1843,m2a4,How many months ago did (he/she) stop living w...
2,1844,m2a4a,Who does child (usually) live with?
3,1845,m2a4b,About how long has child been living there (mo...
4,1846,m2a4c,Do you expect the child to live with you (agai...


In [8]:
df.shape

(812, 3)

In [9]:
reg = re.compile(r'(?P<section>[A-Za-z]*\d*)(?P<rest>.*)')

def format_q_num(name):
    """takes in the question numbers in the name column and outputs them in the format
    they appear in the text"""
    name=name.lstrip('m2')
    name=reg.search(name)
    section=name.group('section').upper()
    rest=name.group('rest').upper()
    return section+rest

In [10]:
df['question_number']=df.name.apply(lambda x: format_q_num(x))

In [11]:
df['multiple_answers']=df.question_number.apply(lambda x: 1 if re.match('[A-Z]\d*[A-Z]\d*[A-Z]',x) else 0) 
#adds a column where 1 indicates that it is one of the questions with multiple entries in the data frame
#I think these are mostly (only?) the "select all that apply" questions

In [12]:
df[60:70]

Unnamed: 0,position,name,varlab,question_number,multiple_answers
60,1902,m2b5c1,How much did child weigh on that day?-(Pounds),B5C1,0
61,1903,m2b5c2,How much did child weigh on that day?-(Ounces),B5C2,0
62,1904,m2b6,How many times since birth has child been to h...,B6,0
63,1905,m2b7,How many times since birth has child been to h...,B7,0
64,1906,mx2b7,TCO-How many times has child been seen by heal...,X2B7,0
65,1907,m2b7a,How many times since birth has child been to h...,B7A,0
66,1908,m2b8,How many times since birth has child been to e...,B8,0
67,1909,m2b8a,How many visits to emergency room for accident...,B8A,0
68,1910,m2b9,"Since leaving the hospital at birth, has child...",B9,0
69,1911,m2b10,"Since child was born, how many times has he/sh...",B10,0


In [13]:
def find_question(q_number):
    """finds where the question number appears in the text - only if it only appears once"""
    ps_text = []
    ps_idx = []
    ps_startswith = []
    for idx, p in enumerate(ps):
        if q_number+"." in p.get_text():
            if p.get_text().lstrip().startswith(q_number+"."):
                ps_text.append(p.get_text().lstrip())
                ps_idx.append(idx)
            else:
                start = p.get_text().find(q_number+'.')
                ps_text.append(p.get_text().lstrip()[start:])
                ps_idx.append(idx)
    if len(ps_idx)!=1:
        return np.nan, np.nan
    else:
        return ps_text[0].replace("\n"," ").strip(), ps_idx[0]

In [34]:
b17a = ps[727:729]
b17a

[<p class="MsoNormal"><span style='font-size:11.0pt;font-family:"Arial",sans-serif'>B17A.</span></p>,
 <p class="MsoNormal"><span style='font-size:11.0pt;font-family:"Arial",sans-serif;
   color:windowtext'>(He/She) tends to be shy.................... </span></p>]

In [14]:
def find_probe(q_number):
    probe = re.findall('[A-Z]{,1}\d{,2}', q_number)[0]
    return find_question(probe)[0]

In [15]:
for i in range(len(df.question_number)):
    q_number = df.loc[i,'question_number']
    if re.match('^[A-Z]{,1}\d{,2}', q_number):
        df.loc[i,'probe'] = find_probe(q_number)
    else:
        df.loc[i,'probe'] = "None"

In [16]:
df['ps_text'], df['ps_idx'] = zip(*df['question_number'].map(find_question))

In [17]:
df.head(10)

Unnamed: 0,position,name,varlab,question_number,multiple_answers,probe,ps_text,ps_idx
0,1842,m2a3,How much of the time does the child live with ...,A3,0,,,
1,1843,m2a4,How many months ago did (he/she) stop living w...,A4,0,A4. How many months ago did (he/she)...,A4. How many months ago did (he/she)...,147.0
2,1844,m2a4a,Who does child (usually) live with?,A4A,0,A4. How many months ago did (he/she)...,A4A. Who does (CHILD) (usually) live with?,158.0
3,1845,m2a4b,About how long has child been living there (mo...,A4B,0,A4. How many months ago did (he/she)...,A4B. About how long has (CHILD) been li...,170.0
4,1846,m2a4c,Do you expect the child to live with you (agai...,A4C,0,A4. How many months ago did (he/she)...,A4C. Do you expect (CHILD) to live with...,179.0
5,1847,m2a4d,How many days did you see the child in the pas...,A4D,0,A4. How many months ago did (he/she)...,A4D. About how many days did you see (C...,185.0
6,1848,m2a6,What was your relationship with father when ch...,A6,0,"A6. Next, I have a few questions abo...","A6. Next, I have a few questions abo...",193.0
7,1849,m2a6b,What was your relationship with father when ch...,A6B,0,"A6. Next, I have a few questions abo...",A6B. What was your relationship with (F...,217.0
8,1850,m2a6c,"When child was born, were you and father livin...",A6C,0,"A6. Next, I have a few questions abo...","A6C. When (CHILD) was born, were you an...",229.0
9,1851,m2a7,What is your relationship with father now?,A7,0,A7. What is your relationship with (...,A7. What is your relationship with (...,238.0


In [18]:
np.isnan(df.ps_idx).sum() #number of questions where the question number doesn't appear or appears more than once

316

In [19]:
#get index of <p> tag for the next question so we can split the questions (can't just shift because some questions have more than one answer or missing values)
all_idx=np.array(df.ps_idx)
all_idx=all_idx[~np.isnan(all_idx)]
all_idx=np.append(all_idx, [len(ps)])
df['ps_idx_next']=df.ps_idx.apply(lambda x: all_idx[all_idx > x].min() if ~np.isnan(x) else np.nan)

In [20]:
consequitivedots = re.compile(r'\.{3,}')

In [21]:
def clean_text(string):
    return string.replace("\n"," ").strip()

def get_pairs(p):
    if re.match('^[a-z]{1}', p.get_text().split('..')[0]):
        if re.match('[()]{1}', p.get_text().split('..')[-1]):
            pairs = [p.find_previous('p').get_text() + ' ' + p.get_text().split('..')[0], p.get_text().split("(")[0].split('..')[-1]]
        else:
            pairs = [p.find_previous('p').get_text() + ' ' + p.get_text().split('..')[0], p.get_text().split('..')[-1]]
    else:
        if re.match('[()]{1}', p.get_text().split('.')[-1]):
            pairs = [p.get_text().split('..')[0], p.get_text().split("(")[0].split('..')[-1] ]
        else:
            pairs = [p.get_text().split('..')[0], p.get_text().split('..')[-1]]
    pairs = [clean_text(text) for text in pairs]
    return pairs
        
def get_answers(start_idx,stop_idx):
    if np.isnan(start_idx) or np.isnan(stop_idx):
        return ([],0)
    else:
        answers = [p for p in ps[int(start_idx):int(stop_idx)] if len(p.find_all(string = consequitivedots))>0]
        answers = [get_pairs(a) for a in answers]
        return(answers, len(answers)) 
        

In [22]:
df['answers'], df['n_answers'] = zip(*map(get_answers, df['ps_idx'], df['ps_idx_next']))

In [23]:
(df['answers'][20])

[]

In [24]:
x = 0
while x < len(df.answers):
    ans = df.answers[x]
    for i in range(len(ans)): #for each answer choice,
        s = ans[i][1]        # store number string
        a = re.sub('è',' --> ', s) # sub the arrow in for skip pattern
        t = (ans[i][0],a) # create a new tuple with arrow
        ans[i] = t
    for y in range(len(ans)):
        df.loc[x,'val{0}'.format(y)] = ans[y][1] #create new 'val(y)' column
        df.loc[x,'lab{0}'.format(y)] = ans[y][0] #create new 'lab(y)' column
        
    x += 1

In [25]:
values = []
for i in range(0,18):
    values = values + ['val{0}'.format(i)]
    i += 1
for v in values:
    df[v] = df[v].map(lambda x: str(x))
    df[v] = df[v].map(lambda x: x.lstrip('.').strip())

In [26]:
df['val0']

0                    nan
1                    -10
2                      1
3                      0
4                      1
5                    nan
6      1  -->  GO TO A6C
7                      1
8                      1
9                      1
10                     1
11     1  -->  GO TO A8A
12                     0
13                   nan
14                     1
15                     1
16                   nan
17                   nan
18                     1
19                   nan
20                   nan
21                     1
22                   nan
23                   nan
24                     1
25                     1
26                     1
27                   nan
28                   nan
29                   nan
             ...        
782                  nan
783                  nan
784                  nan
785                  nan
786                  nan
787                  nan
788                   -2
789                    1
790                    1


In [27]:
i = 0
while i < len(df.ps_text):
    if df.multiple_answers[i] == 1:
        df.loc[i,'val0'] = '1'
        df.loc[i,'lab0'] = 'YES'
        df.loc[i,'val1'] = '2'
        df.loc[i,'lab1'] = 'NO'
    i += 1

In [28]:
df.drop(['answers','n_answers','question_number','ps_idx','ps_idx_next'],axis=1,inplace=True)

In [29]:
df.head(20)

Unnamed: 0,position,name,varlab,multiple_answers,probe,ps_text,val0,lab0,val1,lab1,...,val104,lab104,val105,lab105,val106,lab106,val107,lab107,val108,lab108
0,1842,m2a3,How much of the time does the child live with ...,0,,,,,,,...,,,,,,,,,,
1,1843,m2a4,How many months ago did (he/she) stop living w...,0,A4. How many months ago did (he/she)...,A4. How many months ago did (he/she)...,-10,(CHILD) MOST OR ALL OF THE TIME,,,...,,,,,,,,,,
2,1844,m2a4a,Who does child (usually) live with?,0,A4. How many months ago did (he/she)...,A4A. Who does (CHILD) (usually) live with?,1,BIOLOGICAL FATHER,2,MATERNAL GRANDPARENT(S),...,,,,,,,,,,
3,1845,m2a4b,About how long has child been living there (mo...,0,A4. How many months ago did (he/she)...,A4B. About how long has (CHILD) been li...,0,LESS THAN ONE MONTH,,,...,,,,,,,,,,
4,1846,m2a4c,Do you expect the child to live with you (agai...,0,A4. How many months ago did (he/she)...,A4C. Do you expect (CHILD) to live with...,1,YES,2,NO,...,,,,,,,,,,
5,1847,m2a4d,How many days did you see the child in the pas...,0,A4. How many months ago did (he/she)...,A4D. About how many days did you see (C...,,,,,...,,,,,,,,,,
6,1848,m2a6,What was your relationship with father when ch...,0,"A6. Next, I have a few questions abo...","A6. Next, I have a few questions abo...",1 --> GO TO A6C,"Married,",2 --> GO TO A6C,"Romantically involved,",...,,,,,,,,,,
7,1849,m2a6b,What was your relationship with father when ch...,0,"A6. Next, I have a few questions abo...",A6B. What was your relationship with (F...,1,"Married,",2,"Romantically involved,",...,,,,,,,,,,
8,1850,m2a6c,"When child was born, were you and father livin...",0,"A6. Next, I have a few questions abo...","A6C. When (CHILD) was born, were you an...",1,"All or most of the time,",2,"Some of the time,",...,,,,,,,,,,
9,1851,m2a7,What is your relationship with father now?,0,A7. What is your relationship with (...,A7. What is your relationship with (...,1,"Married,",2,"Romantically involved,",...,,,,,,,,,,


In [30]:
df.to_csv(path_or_buf = "m2Qtext.csv")