In [1]:
import pandas as pd
import numpy as np
import textract
# Path to the scales-documentation PDF that the rest of the notebook parses.
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run anywhere else.
tpath = (
    '/Users/alex/Documents/alexdatasci/data_files/MIDUS_1/'
    'stata/ICPSR_02760/DS0001/02760-0001-'
    'Documentation-scales.pdf'
)

In [2]:
# Extract text
# textract returns the document as bytes; split it into lines first.
raw_lines = textract.process(tpath).splitlines()

# Decode each line, trim surrounding whitespace, discard lines that end up
# empty, and renumber what is left from 0.
vartext = (
    pd.Series(raw_lines)
    .str.decode('utf-8')
    .str.strip()
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)

In [3]:
# Drop preamble
# Skip the first 177 extracted lines and renumber the remainder from 0.
vartext = vartext.iloc[177:].reset_index(drop=True)

In [4]:
# Extract relevant information: headings, scale names, scaleprefix
#
# Four per-line flags are computed over `vartext`; a line is kept when at
# least one of them is set. The "line prior to ..." flags are built by
# shifting a list one position to the left. The original used
# Series.append for this, which no longer exists in pandas >= 2.0, so the
# shift is done with plain list slicing instead.

lines = vartext.tolist()

# Scale prefix: the line contains "]:", or it ends with "]" while neither
# containing "-" nor starting with ":".
scaleprefix = pd.Series([
    1 if ']:' in ln
    or (ln.endswith(']')
        and '-' not in ln
        and not ln.startswith(':'))
    else 0
    for ln in lines
])

# Scale names take a few forms:
#    Line prior to "[VARNAME]: (cont",
# Flag lines that start with "[" and contain "]: (cont", then shift the
# flags so that the *preceding* line is the one marked (last line gets 0).
is_cont_line = [
    1 if ln.startswith('[') and ']: (cont' in ln else 0
    for ln in lines
]
scalefilt1 = pd.Series(is_cont_line[1:] + [0])

#    Name prior to "[VARNAME]:BLANK": This is captured
#    in a different scaleprefix

#    Line prior to 'Items:' and Name prior to ':BLANK'
# A line qualifies when the next line's text before its first ":" is exactly
# "Items" and the line itself does not contain "]:".
is_items_line = [ln.split(':')[0] == 'Items' for ln in lines]
next_is_items = is_items_line[1:] + [False]
scalefilt2 = np.array(
    [
        # Row 0 is never flagged, matching the original loop, which started
        # at index 1.
        pos > 0 and followed_by_items and ']:' not in ln
        for pos, (ln, followed_by_items)
        in enumerate(zip(lines, next_is_items))
    ],
    dtype=bool,
)

# Scale headings: the text before the first " (" is upper-case and contains
# no "(", the line has no "[", is not the lone word "AND", and is shorter
# than 50 characters.
scaleheading = pd.Series([
    1 if ln.split(' (')[0].isupper()
    and '[' not in ln
    and ln != 'AND'
    and '(' not in ln.split(' (')[0]
    and len(ln) < 50
    else 0
    for ln in lines
])

# Filter out extraneous text
vartext = vartext[(scaleprefix + scalefilt1 + scalefilt2 + scaleheading) > 0]

In [5]:
# Scale names sometimes sit on their own line and sometimes precede the
# variable name on the same line.
# Step one: keep only the text before the first ":" on every line
# (i.e. discard whatever follows "[VAR]:"). The result gets a fresh index.
vartext = pd.Series([line.partition(':')[0] for line in vartext])

In [6]:
# Step two: break lines of the form "Scale_name [VARNAME]" at every "[" and
# flatten all the pieces into one sequence.
pieces = []
for line in vartext:
    pieces.extend(line.split('['))
vartext = pd.Series(pieces)
# Remove the empty strings produced by the split and renumber from 0.
vartext = vartext[vartext.ne('')].reset_index(drop=True)

In [7]:
# Build parallel heading / scale / varname columns, one entry per line.
n_rows = len(vartext)
heading = np.full(n_rows, None, dtype=object)
scale = np.full(n_rows, None, dtype=object)

# The first line is taken as the initial heading; its scale stays None.
heading[0] = vartext[0]

for pos in range(1, n_rows):
    text = vartext[pos]

    # A heading line: upper-case before the first "(" and no "]" anywhere.
    # It starts a new heading and clears the running scale.
    if text.split('(')[0].isupper() and ']' not in text:
        heading[pos] = text
        scale[pos] = None
        continue

    # Every other line inherits the heading of the line above it.
    heading[pos] = heading[pos - 1]

    if not text.isupper() and not text.islower():
        # Mixed-case text is treated as a scale name.
        scale[pos] = text
    elif text.isupper() and ']' in text:
        # Upper-case text containing "]" keeps the scale of the line above.
        scale[pos] = scale[pos - 1]
    # Anything else leaves the scale as None.

# A variable name is the part before "]" of an upper-case line containing "]".
varnames = []
for text in vartext:
    if text.isupper() and ']' in text:
        varnames.append(text.split(']')[0])
    else:
        varnames.append(None)

# Make df
scaledf = pd.DataFrame({
    'Heading': heading,
    'Scale': scale,
    'Varname': varnames,
})

In [8]:
# Not every entry in the Scale column is a genuine scale.
# The spurious ones show up as rows where Scale is set but Varname is None;
# rows 233-235 give an example.
scaledf.iloc[233:236]

Unnamed: 0,Heading,Scale,Varname
233,BODY INDICES,Body Mass Index,A1SBMI
234,BODY INDICES,"Scales.rtf, p. 97]",
235,SEEING DOCTORS & MENTAL HEALTH PROFESSIONALS,,


In [9]:
# Remove the spurious scales: any row whose Scale text contains "]".
# Rows with a missing Scale are kept.
tempfilt = ~scaledf['Scale'].str.contains(']', na=False)
scaledf = scaledf.loc[tempfilt]

In [10]:
# Clean up dataframe:

# Some headings/scales have no varnames, but we want to keep them.
# We can ID these as headings.value_counts()==2 and no varnames.

# Count the rows under each heading once and look every row up in that
# table. The original recomputed value_counts() for each row.
# assign() returns a new frame, so nothing is written into the filtered
# frame produced by the earlier boolean selection.
heading_counts = scaledf['Heading'].value_counts()
scaledf = scaledf.assign(tempvar=scaledf['Heading'].map(heading_counts))

In [11]:
# Drop those with varname=None except those that satisfy
# previous example

# Keep a row when it has a Varname, or when it is a scale-only entry: its
# heading occurs exactly twice (tempvar == 2) and Scale is set.
# Each comparison is parenthesised because `&` binds tighter than `==`;
# the original `tempvar==2 & ~isna` compared tempvar against `2 & mask`
# instead of and-ing two conditions, so the exception was not applied as
# described.
has_varname = scaledf['Varname'].notna()
scale_only = (scaledf['tempvar'] == 2) & scaledf['Scale'].notna()
dropfilt = has_varname | scale_only

# Apply the mask, renumber from 0, and discard the helper column.
scaledf = scaledf[dropfilt].reset_index(drop=True).drop(columns=['tempvar'])

In [12]:
# Show the first five rows of the cleaned table.
scaledf.head(n=5)

Unnamed: 0,Heading,Scale,Varname
0,DEPRESSION,Depressed Affect,A1PDEPAF
1,DEPRESSION,Depressed Affect,A1PDEPAD
2,DEPRESSION,Anhedonia,A1PANHED
3,DEPRESSION,Anhedonia,A1PANHDX
4,DEPRESSION,Depression,A1PDEPRE
