In [1]:
!pip install pypandoc
!pip install pandoc
import re
import pandas as pd
import os
import pypandoc



In [16]:
# Read RTF File and convert to plain text
def read_rtf(file_path):
    """Read RTF file and convert it to plain text using pypandoc."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            rtf_content = file.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin1') as file:
            rtf_content = file.read()

    text = pypandoc.convert_text(rtf_content, 'plain', format='rtf')
    #text = pypandoc.convert_file(file_path, 'plain')
    return text

In [17]:
def parse_rtf_content(text):
    """Parse RTF content to extract variable information."""
    # Extract the filename
    file_name_match = re.search(r'File Name\s*=\s*([^\n]+)', text)
    file_name =file_name_match.group(1).strip() if file_name_match else 'Unknown'
    
    # Debugging output for file name 
    print(f"Parsing file: {file_name} from {file_path}")
    
    # Extract Variables 
    data = []
    # Regex to find each variable block
    variable_blocks = re.finditer(r"Pos\. = (\d+).*?Variable = (.*?)\s+Variable label = (.*?)(?=\n+Pos\. = |\Z)", text, re.DOTALL)
    block_count = 0
    for block in variable_blocks:
        block_count += 1
        position = block.group(1).strip()
        variable = block.group(2).strip()
        variable_label = block.group(3).strip()

        # Find data type and measurement level
        data_type_match = re.search(r"This variable is\s+(\w+), the SPSS measurement level is (\w+)", block.group(0))
        data_type = f"{data_type_match.group(1)}, {data_type_match.group(2)}" if data_type_match else "Unknown"

        # Find value labels
        value_labels = []
        value_label_matches = re.finditer(r"Value = (.*?)\s+Label = (.*?)\n", block.group(0))
        for vl_match in value_label_matches:
            value = vl_match.group(1).strip()
            label = vl_match.group(2).strip()
            value_labels.append(f"{value} = {label}")

        value_label_options = "; ".join(value_labels)

        data.append({
            "file_name": file_name,
            "position": position,
            "variable": variable,
            "variable_label": variable_label,
            "data_type": data_type,
            "value_label_options": value_label_options
        })

    # Debugging output for number of blocks parsed
    print(f"Parsed {block_count} variable blocks from {file_name}")
        
    return data


In [20]:
# Paths to the RTF files
month_9_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-4683-tab-9m/mrdoc/ukda_data_dictionaries' 
file_paths = [os.path.join(month_9_dir, file) for file in os.listdir(month_9_dir) if
              os.path.isfile(os.path.join(month_9_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data9m.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs1_parent_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-4683-tab-9m/mrdoc/ukda_data_dictionaries/mcs1_parent_interview_ukda_data_dictionary.rtf
Parsed 665 variable blocks from mcs1_parent_interview
Parsing file: mcs1_parent_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-4683-tab-9m/mrdoc/ukda_data_dictionaries/mcs1_parent_derived_ukda_data_dictionary.rtf
Parsed 39 variable blocks from mcs1_parent_derived
Parsing file: mcs1_cm_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-4683-tab-9m/mrdoc/ukda_data_dictionaries/mcs1_cm_derived_ukda_data_dictionary.rtf
Parsed 15 variable blocks from mcs1_cm_derived
Parsing file: mcs1_geographically_linked_data from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-4683-tab-9m/mrdoc/ukda_data_dictionaries/mcs1_geographically_linked_da

In [21]:
df.groupby('file_name')['variable'].count()

file_name
mcs1_cm_derived                     15
mcs1_cm_interview                    4
mcs1_family_derived                 51
mcs1_family_interview               23
mcs1_geographically_linked_data     42
mcs1_hhgrid                         33
mcs1_parent_cm_interview           176
mcs1_parent_derived                 39
mcs1_parent_interview              665
mcs1_proxy_partner_interview        68
Name: variable, dtype: int64

In [13]:
df[df['file_name'] == 'mcs1_parent_interview']

Unnamed: 0,file_name,position,variable,variable_label,data_type,value_label_options
0,mcs1_parent_interview,1,MCSID,MCS Research ID - Anonymised\nFamily/Household...,Unknown,
1,mcs1_parent_interview,2,APNUM00,Person number within an MCS\nfamily (excl Coho...,"numeric, NOMINAL",
2,mcs1_parent_interview,3,AELIG00,Eligibility for survey:\nWhether resp eligible...,"numeric, NOMINAL",1.0 = Main Interview; 2.0 = Partner Interview;...
3,mcs1_parent_interview,4,ARESP00,Response in survey: Whether\nrespondent (of EL...,"numeric, NOMINAL",1.0 = Main Interview; 2.0 = Partner Interview;...
4,mcs1_parent_interview,5,ACBAGE00,Baby s age in months\nThis variable is numeric...,"numeric, NOMINAL",-9.0 = Refusal; -8.0 = Don't Know
...,...,...,...,...,...,...
660,mcs1_parent_interview,661,APPART0R,Party voted for recoded\nThis variable is nume...,"numeric, NOMINAL",-9.0 = Refusal; -8.0 = Dont Know; -1.0 = Not a...
661,mcs1_parent_interview,662,APPANI0R,Party voted for (NI)\nrecoded\nThis variable i...,"numeric, NOMINAL",-9.0 = Refusal; -8.0 = Dont Know; -1.0 = Not a...
662,mcs1_parent_interview,663,apnete00_r30,Non-res. parents\nEthnic Group (England) [coun...,"numeric, NOMINAL",-9.0 = Refusal; -8.0 = Don't Know; -1.0 = Not ...
663,mcs1_parent_interview,664,apnetw00_r30,Non-res. parents\nEthnic Group (Wales) [counts...,"numeric, NOMINAL",-9.0 = Refusal; -8.0 = Don't Know; -1.0 = Not ...


In [18]:
# Paths to the RTF files
years_3_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5350-tab-3y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_3_dir, file) for file in os.listdir(years_3_dir) if
              os.path.isfile(os.path.join(years_3_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data3y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs2_geographically_linked_data from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5350-tab-3y/mrdoc/ukda_data_dictionaries/mcs2_geographically_linked_data_ukda_data_dictionary.rtf
Parsed 43 variable blocks from mcs2_geographically_linked_data
Parsing file: mcs2_parent_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5350-tab-3y/mrdoc/ukda_data_dictionaries/mcs2_parent_derived_ukda_data_dictionary.rtf
Parsed 36 variable blocks from mcs2_parent_derived
Parsing file: mcs2_proxy_partner_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5350-tab-3y/mrdoc/ukda_data_dictionaries/mcs2_proxy_partner_interview_ukda_data_dictionary.rtf
Parsed 107 variable blocks from mcs2_proxy_partner_interview
Parsing file: mcs2_cm_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5350-tab-3y/mrdoc

In [19]:
df.groupby('file_name')['variable'].count()

file_name
mcs2_cm_cognitive_assessment         553
mcs2_cm_derived                       29
mcs2_cm_interview                     56
mcs2_cm_oral_fluid                    15
mcs2_family_derived                   58
mcs2_geographically_linked_data       43
mcs2_hhgrid                           40
mcs2_neighbourhood_observations       17
mcs2_older_siblings_questionnaire    471
mcs2_parent_cm_interview             455
mcs2_parent_derived                   36
mcs2_parent_interview                960
mcs2_proxy_partner_interview         107
Name: variable, dtype: int64

In [22]:
# Paths to the RTF files
years_5_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5795-tab-5y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_5_dir, file) for file in os.listdir(years_5_dir) if
              os.path.isfile(os.path.join(years_5_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data5y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs3_parent_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5795-tab-5y/mrdoc/ukda_data_dictionaries/mcs3_parent_derived_ukda_data_dictionary.rtf
Parsed 90 variable blocks from mcs3_parent_derived
Parsing file: mcs3_geographically_linked_data from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5795-tab-5y/mrdoc/ukda_data_dictionaries/mcs3_geographically_linked_data_ukda_data_dictionary.rtf
Parsed 43 variable blocks from mcs3_geographically_linked_data
Parsing file: mcs3_family_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5795-tab-5y/mrdoc/ukda_data_dictionaries/mcs3_family_interview_ukda_data_dictionary.rtf
Parsed 56 variable blocks from mcs3_family_interview
Parsing file: mcs3_hhgrid from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-5795-tab-5y/mrdoc/ukda_data_dictionaries/mc

In [23]:
df.groupby('file_name')['variable'].count() #13 files

file_name
mcs3_cm_cognitive_assessment          341
mcs3_cm_derived                        16
mcs3_cm_interview                     113
mcs3_cm_teacher_survey                143
mcs3_family_derived                    45
mcs3_family_interview                  56
mcs3_geographically_linked_data        43
mcs3_hhgrid                            59
mcs3_older_siblings_questionnaire     142
mcs3_parent_cm_interview              674
mcs3_parent_derived                    90
mcs3_parent_interview                1019
mcs3_proxy_partner_interview           63
Name: variable, dtype: int64

In [24]:
# Paths to the RTF files
years_7_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-6411-tab-7y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_7_dir, file) for file in os.listdir(years_7_dir) if
              os.path.isfile(os.path.join(years_7_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data7y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs4_cm_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-6411-tab-7y/mrdoc/ukda_data_dictionaries/mcs4_cm_derived_ukda_data_dictionary.rtf
Parsed 24 variable blocks from mcs4_cm_derived
Parsing file: mcs4_geographically_linked_data from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-6411-tab-7y/mrdoc/ukda_data_dictionaries/mcs4_geographically_linked_data_ukda_data_dictionary.rtf
Parsed 43 variable blocks from mcs4_geographically_linked_data
Parsing file: mcs4_proxy_partner_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-6411-tab-7y/mrdoc/ukda_data_dictionaries/mcs4_proxy_partner_interview_ukda_data_dictionary.rtf
Parsed 125 variable blocks from mcs4_proxy_partner_interview
Parsing file: mcs4_family_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-6411-tab-7y/mrdoc/ukda_

In [25]:
df.groupby('file_name')['variable'].count()

file_name
mcs4_cm_aspirations                  34
mcs4_cm_cognitive_assessment        502
mcs4_cm_derived                      24
mcs4_cm_interview                   184
mcs4_cm_teacher_survey              175
mcs4_family_derived                  49
mcs4_family_interview                36
mcs4_geographically_linked_data      43
mcs4_hhgrid                          60
mcs4_parent_cm_interview           1093
mcs4_parent_derived                  94
mcs4_parent_interview              1780
mcs4_proxy_partner_interview        125
Name: variable, dtype: int64

In [26]:
# Paths to the RTF files
years_11_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-7464-tab-11y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_11_dir, file) for file in os.listdir(years_11_dir) if
              os.path.isfile(os.path.join(years_11_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data11y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs5_family_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-7464-tab-11y/mrdoc/ukda_data_dictionaries/mcs5_family_interview_ukda_data_dictionary.rtf
Parsed 13 variable blocks from mcs5_family_interview
Parsing file: mcs5_geographically_linked_data from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-7464-tab-11y/mrdoc/ukda_data_dictionaries/mcs5_geographically_linked_data_ukda_data_dictionary.rtf
Parsed 42 variable blocks from mcs5_geographically_linked_data
Parsing file: mcs5_parent_cm_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-7464-tab-11y/mrdoc/ukda_data_dictionaries/mcs5_parent_cm_interview_ukda_data_dictionary.rtf
Parsed 862 variable blocks from mcs5_parent_cm_interview
Parsing file: mcs5_hhgrid from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-7464-tab-11y/mrdoc/ukda_

In [27]:
df.groupby('file_name')['variable'].count() #14 files

file_name
mcs5_cm_cognitive_assessment           204
mcs5_cm_derived                         42
mcs5_cm_interview                      289
mcs5_cm_teacher_survey                 214
mcs5_family_derived                     34
mcs5_family_interview                   13
mcs5_geographically_linked_data         42
mcs5_hhgrid                             60
mcs5_parent_cm_interview               862
mcs5_parent_derived                     28
mcs5_parent_income_brackets            455
mcs5_parent_interview                 1386
mcs5_proxy_partner_income_brackets      40
mcs5_proxy_partner_interview           151
Name: variable, dtype: int64

In [28]:
# Paths to the RTF files
years_14_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8156-tab-14y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_14_dir, file) for file in os.listdir(years_14_dir) if
              os.path.isfile(os.path.join(years_14_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data14y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs6_parent_income_brackets from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8156-tab-14y/mrdoc/ukda_data_dictionaries/mcs6_parent_income_brackets_ukda_data_dictionary.rtf
Parsed 1460 variable blocks from mcs6_parent_income_brackets
Parsing file: mcs6_proxy_partner_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8156-tab-14y/mrdoc/ukda_data_dictionaries/mcs6_proxy_partner_interview_ukda_data_dictionary.rtf
Parsed 308 variable blocks from mcs6_proxy_partner_interview
Parsing file: mcs_sweep6_imd_e_2004 from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8156-tab-14y/mrdoc/ukda_data_dictionaries/mcs_sweep6_imd_e_2004_ukda_data_dictionary.rtf
Parsed 11 variable blocks from mcs_sweep6_imd_e_2004
Parsing file: mcs6_parent_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8156-tab-14y

In [29]:
df.groupby('file_name')['variable'].count() # 20 files

file_name
mcs6_cm_accelerometer_derived                   13
mcs6_cm_cognitive_assessment                    75
mcs6_cm_derived                                 41
mcs6_cm_interview                              412
mcs6_cm_tud_harmonised                           9
mcs6_family_derived                             32
mcs6_hhgrid                                     54
mcs6_parent_assessment                          83
mcs6_parent_cm_interview                       406
mcs6_parent_derived                             37
mcs6_parent_income_brackets                   1460
mcs6_parent_interview                         1036
mcs6_proxy_partner_interview                   308
mcs6_tud_parsed_data_app_episode_format         21
mcs6_tud_parsed_data_paper_calendar_format      33
mcs6_tud_parsed_data_web_calendar_format        33
mcs_sweep6_imd_e_2004                           11
mcs_sweep6_imd_n_2004                           10
mcs_sweep6_imd_s_2004                           10
mcs_sweep6_imd_w_2004

In [30]:
# Paths to the RTF files
years_17_dir = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8682-tab-17y/mrdoc/ukda_data_dictionaries'
file_paths = [os.path.join(years_17_dir, file) for file in os.listdir(years_17_dir) if
              os.path.isfile(os.path.join(years_17_dir, file))]
all_data = []

for file_path in file_paths:
    content = read_rtf(file_path)
    all_data.extend(parse_rtf_content(content))

# Create a DataFrame and save to CSV
df = pd.DataFrame(all_data)
csv_path = '/Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/code/extracted_data17y.csv'
df.to_csv(csv_path, index=False)

Parsing file: mcs7_cm_derived from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8682-tab-17y/mrdoc/ukda_data_dictionaries/mcs7_cm_derived_ukda_data_dictionary.rtf
Parsed 31 variable blocks from mcs7_cm_derived
Parsing file: mcs7_parent_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8682-tab-17y/mrdoc/ukda_data_dictionaries/mcs7_parent_interview_ukda_data_dictionary.rtf
Parsed 279 variable blocks from mcs7_parent_interview
Parsing file: mcs7_cm_cognitive_assessment from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8682-tab-17y/mrdoc/ukda_data_dictionaries/mcs7_cm_cognitive_assessment_ukda_data_dictionary.rtf
Parsed 17 variable blocks from mcs7_cm_cognitive_assessment
Parsing file: mcs7_parent_cm_interview from /Users/faith/Desktop/Work/Boston Childrens Hospital/birth-cohort-db/UKMCS/data_dump/UKDA-8682-tab-17y/mrdoc/ukda_data_dictionaries/mcs7_

In [31]:
df.groupby('file_name')['variable'].count() # 10 files

file_name
mcs7_cm_cognitive_assessment     17
mcs7_cm_derived                  31
mcs7_cm_interview               793
mcs7_cm_qualifications          105
mcs7_family_derived              20
mcs7_family_interview           113
mcs7_hhgrid                      65
mcs7_parent_cm_interview         56
mcs7_parent_derived              15
mcs7_parent_interview           279
Name: variable, dtype: int64