# Notebook Setup

In [1]:
from ipumspy import readers, ddi

# Reading the raw data from IPUMS

In [2]:
# Parse the DDI codebook (the metadata file that accompanies the extract)
ddi_codebook = readers.read_ipums_ddi("ddt.xml")

# Load the data file, using the codebook parsed above to interpret it
ipums_df = readers.read_microdata(
    ddi_codebook,
    "highered_00003.dat",
)

See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.


# Basic EDA

In [3]:
# Preview the first rows of the loaded frame as a quick sanity check on the columns
ipums_df.head()

Unnamed: 0,PERSONID,REFID,YEAR,WEIGHT,SAMPLE,SURID,COHORT,SRVMODE,SEQNUM,AGE,...,NWILL,NWLAY,NWNOND,NWOCNA,NWOT,NWOTP,NWRET,NWSTU,HHLOSE,HHLOSENO
0,10000000000000007,100000007,1993,407.6917,101,1,100,,1,33,...,98,98,98,98,98,,98,98,,
1,10000000000000102,100000012,1993,69.2621,101,1,100,,2,38,...,0,1,0,0,0,,0,0,,
2,10000000000000103,100000013,1993,52.3937,101,1,100,,3,48,...,98,98,98,98,98,,98,98,,
3,10000000000000106,100000016,1993,52.3937,101,1,100,,4,48,...,98,98,98,98,98,,98,98,,
4,10000000000000108,100000018,1993,436.318,101,1,100,,5,28,...,98,98,98,98,98,,98,98,,


In [4]:
# List the distinct values of the YEAR column present in the loaded data
ipums_df['YEAR'].unique()

<IntegerArray>
[1993, 1995, 1997, 1999, 2003, 2006, 2008, 2010, 2013]
Length: 9, dtype: Int64

In [5]:
# (rows, columns) of the loaded frame; compare with the expected shape stated below
ipums_df.shape

(546713, 357)

The shape should be (546713, 357). If the output of the above cell is not this, please check that you have all of the raw IPUMS data.

# Writing the Data

In [6]:
# Persist the combined raw data as CSV, leaving out the DataFrame index column
ipums_df.to_csv(
    "raw_combined.csv",
    index=False,
)

# Get feature descriptions

In [12]:
from bs4 import BeautifulSoup  
import pandas as pd

# Read the codebook XML into memory. The context manager closes the file
# handle as soon as the read finishes, instead of leaving it open for the
# lifetime of the kernel.
with open("ddt.xml", 'r') as file:
    contents = file.read()

# Parse the XML text so its tags can be searched in the following cells
soup = BeautifulSoup(contents, 'xml')

In [13]:
# Collect every <var> element from the parsed codebook
attr_list = soup.find_all('var')

attr_extracted = []

# Build one [id, label, sub-category, description] record per <var> element
for var_tag in attr_list:
    var_id = var_tag['ID']
    label = var_tag.find('labl').get_text()

    # Keep only the text before the first '--', then drop the word
    # 'Variables', any apostrophes, and surrounding whitespace
    concept_text = var_tag.find('concept').get_text()
    sub_category = concept_text.split('--')[0].replace('Variables', '').replace("'", "").strip()

    # Remove double quotes and newlines from the description text
    description = var_tag.find('txt').get_text().replace('"', '').replace('\n', '').strip()

    attr_extracted.append([var_id, label, sub_category, description])

In [14]:
# Build the feature-description table: one row per extracted variable,
# with every column stored as a string
df = pd.DataFrame(
    attr_extracted,
    columns=['id', 'name', 'sub_category', 'description'],
    dtype=str,
)

# Maps each sub-category to its broader category
category_dict = {'Technical':'Technical',
                 'Demographic':'Demographic',
                 'Other':'Other',
                 'Bachelors Degree':'Education',
                 'Highest Degree':'Education',
                 'Most Recent Degree':'Education',
                 'Associates Degree':'Education',
                 'Community College':'Education',
                 'Current Enrollment':'Education',
                 'High School':'Education',
                 'Employment':'Employment',
                 'Occupation':'Employment',
                 'Employer Characteristics':'Employment',
                 'Work Activities':'Employment',
                 'Income':'Employment',
                 'Job Characteristics':'Employment',
                 'Job Satisfaction':'Employment',
                 'Job Training':'Employment',
                 'Work History':'Employment',
                 'International Research':'Employment',
                 'Career Path Jobs':'Employment',
                 'Employment - Other':'Employment'}

# Fail early, naming every offender, if a sub-category has no entry in the
# mapping. Series.map would otherwise silently produce NaN for those rows.
unmapped = sorted(set(df['sub_category']) - set(category_dict))
if unmapped:
    raise KeyError(f"No category mapping for sub-categories: {unmapped}")

# Vectorised lookup of the category for each row's sub-category
df['category'] = df['sub_category'].map(category_dict)

In [15]:
# Write the table as a '|'-separated file. Descriptions can contain commas,
# so using '|' as the separator avoids extra columns being created.
df.to_csv(
    'feature_info.csv',
    index=False,
    sep='|',
)