# Exercise 5-4: Get data from a Stata file

In [17]:
import pandas as pd
import sys
sys.path.append('..')
from utils import unzip_file

# Extract the archive. `files` is used below as a collection of extracted
# file paths (strings) — NOTE(review): confirm against utils.unzip_file.
files = unzip_file('gss_stata_with_codebook.zip')

# Pick the Stata data file from the extracted files. The extension is matched
# case-insensitively, and a missing file raises a descriptive error instead of
# the bare IndexError that indexing an empty list would produce.
stata_file = next((f for f in files if f.lower().endswith('.dta')), None)
if stata_file is None:
    raise FileNotFoundError(
        "No .dta file found in 'gss_stata_with_codebook.zip'"
    )

# convert_categoricals=False: do not convert coded columns to pandas
# Categoricals; keep the values as stored in the file.
df = pd.read_stata(stata_file, convert_categoricals=False)
df.head()

Unnamed: 0,year,id,wrkstat,hrs1,hrs2,evwork,occ,prestige,wrkslf,wrkgovt,...,neisafe,rlooks,rgroomed,rweight,rhlthend,wtss,wtssnr,wtssall,vstrat,vpsu
0,1972,1,1.0,,,,205.0,50.0,2.0,,...,,,,,,1.0,1.0,0.4446,,
1,1972,2,5.0,,,1.0,441.0,45.0,2.0,,...,,,,,,1.0,1.0,0.8893,,
2,1972,3,2.0,,,,270.0,44.0,2.0,,...,,,,,,1.0,1.0,0.8893,,
3,1972,4,1.0,,,,1.0,57.0,2.0,,...,,,,,,1.0,1.0,0.8893,,
4,1972,5,7.0,,,1.0,385.0,40.0,2.0,,...,,,,,,1.0,1.0,0.8893,,


## Build a DataFrame for the metadata

In [18]:
import pyreadstat

# Request only the file's metadata; the first returned value is discarded
# and the metadata container is kept for the cells below.
_, metadata = pyreadstat.read_dta(
    stata_file,
    metadataonly=True,
)


In [19]:
# Report a few attributes of the metadata container, one per line.
# Only the first 10 column names are shown to keep the output short.
print(
    f"Number of columns: {metadata.number_columns}",
    f"Number of rows: {metadata.number_rows}",
    f"Column names: {', '.join(metadata.column_names[:10])}",
    sep="\n",
)

Number of columns: 6110
Number of rows: 64814
Column names: year, id, wrkstat, hrs1, hrs2, evwork, occ, prestige, wrkslf, wrkgovt


In [20]:
# Build a one-column DataFrame of variable descriptions: the column labels
# from the metadata, indexed by column name.
# It gets its own name (metadata_df) so that `df`, which holds the data read
# from the Stata file, is not overwritten by this cell.
metadata_df = pd.DataFrame(
    metadata.column_labels,
    index=metadata.column_names,
    columns=['description'],
)
metadata_df.head()

Unnamed: 0,description
year,gss year for this respondent
id,respondent id number
wrkstat,labor force status
hrs1,number of hours worked last week
hrs2,number of hours usually work a week


## Read the data into a DataFrame

In [21]:

# Read only a handful of columns (at least five) from the Stata file
# rather than every column.
wanted_columns = ['year', 'id', 'age', 'sex', 'race']
df_subset = pd.read_stata(
    stata_file,
    columns=wanted_columns,
    convert_categoricals=False,
)
df_subset.head()

Unnamed: 0,year,id,age,sex,race
0,1972,1,23.0,2,1
1,1972,2,70.0,1,1
2,1972,3,48.0,2,1
3,1972,4,27.0,2,1
4,1972,5,61.0,2,1
