# Data Analysis
This is a starter notebook for loading the IPUMS data and performing analyses.

In [1]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## Load IPUMS Dataset

In [None]:
# Define Parameters
# The API key is read from the IPUMS_API_KEY environment variable so the secret
# never has to be typed into (or pushed to VCS with) this notebook. The "###"
# placeholder is only the fallback used when the variable is not set.
API_KEY = os.environ.get("IPUMS_API_KEY", "###")

# Directory handed to get_ipums_data as `download_dir`.
DOWNLOAD_DIR = Path(r"data")

# Whether get_ipums_data should also export a pickle, and where to write it.
# PKL_PATH is derived from DOWNLOAD_DIR so the two locations cannot drift apart.
PKL_EXPORT = True
PKL_PATH = DOWNLOAD_DIR / "ipums_extract.pkl"

# Extract definition: collection, free-text description, and sample IDs.
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

In [4]:
# Define Variables
# Each tuple below is one thematic group of IPUMS variable names. The
# comprehension flattens the groups, in order, into the single flat list that
# is passed to the extract request.
variables = [
    name
    for group in (
        # Technical household
        ("PERSONS",),
        # Group quarters
        ("GQ", "GQTYPE", "UNREL"),
        # Global geography
        ("URBAN", "POPDENSGEO1", "POPDENSGEO2"),
        # National geography
        ("GEO1_MZ", "GEO2_MZ"),
        # Household economic
        ("OWNERSHIP",),
        # Utilities
        ("ELECTRIC", "WATSUP", "PHONE"),
        # Appliances
        ("AUTOS", "MOTORCYCLE", "BIKE", "COMPUTER", "TV", "RADIO"),
        # Dwelling characteristics
        ("ROOMS", "BEDROOMS", "TOILET", "FLOOR", "WALL", "ROOF"),
        # Constructed household
        ("HHTYPE", "NFAMS", "NCOUPLES", "NMOTHERS", "NFATHERS"),
        # Constructed family
        ("FAMSIZE", "NCHILD", "NCHLT5", "ELDCH", "YNGCH"),
        # Demographic
        ("RELATE", "AGE", "SEX", "MARST", "CONSENS"),
        # Fertility and mortality
        (
            "CHBORN", "CHSURV", "CHBORNF", "CHBORNM", "CHSURVF", "CHSURVM",
            "BIRTHSLYR", "BIRTHSURV", "MORTMOT", "MORTFAT",
        ),
        # Nativity and birthplace
        ("NATIVITY", "CITIZEN", "NATION", "BPL1_MZ", "BPL2_MZ"),
        # Ethnicity and language
        ("RELIGION", "RACE", "SPEAKPORT", "LANGMZ", "MTONGMZ"),
        # Education
        ("SCHOOL", "LIT", "EDATTAIN"),
        # Work
        ("EMPSTAT", "LABFORCE", "EMPSECT"),
        # Occupation, industry
        ("OCCISCO", "INDGEN"),
        # Global migration - no other migration variables are requested, as
        # they would likely be too correlated and uninteresting (AH)
        ("MIGRATE1", "MIGRATE5"),
        # Disability
        (
            "DISABLED", "DISEMP", "DISBLND", "DISDEAF", "DISMUTE", "DISLOWR",
            "DISUPPR", "DISMNTL", "DISORIG",
        ),
    )
    for name in group
]

In [5]:
# Get IPUMS Data
# Request the extract described by the parameters and variable list defined in
# the earlier cells, and bind the result to `ipums_df`.
# Judging by the log output recorded for this cell, the helper submits an
# extract to IPUMS, waits for server-side processing, downloads the result into
# DOWNLOAD_DIR, builds a labelled DataFrame, and saves it to PKL_PATH.
# NOTE(review): each run appears to submit a new extract request — confirm
# whether re-runs should reuse the existing pickle instead.
ipums_df = get_ipums_data(
    collection=collection,
    description=description,
    samples=samples,
    variables=variables,
    api_key=API_KEY,
    download_dir=DOWNLOAD_DIR,
    pkl_export=PKL_EXPORT,
    pkl_path=PKL_PATH
)

Extract submitted to IPUMS. Extract ID: 17.
Waiting for extract to finish processing on IPUMS server...
Downloading extract to data ...


See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.


Extracting data from extract to DataFrame...
Shape of IPUMS Data Extract: (6283068, 93)
Updating DataFrame with labels...
Saving IPUMS DataFrame to data\ipums_extract.pkl ...
IPUMS dataset extraction complete.


In [6]:
# Load from PKL
# Load the previously exported data from PKL_PATH into `ipums_df_pkl`, which the
# later cells use as the source frame.
# NOTE(review): presumably this unpickles the file; only load pickle files you
# trust — confirm against load_ipums_from_pkl.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [24]:
# Tabulate the MIGRATE5 categories; dropna=False keeps missing values in the counts.
ipums_df_pkl['MIGRATE5'].value_counts(dropna=False)

MIGRATE5
Same major, same minor administrative unit         4454888
NIU (not in universe)                              1181741
Same major, different minor administrative unit     176081
Different major administrative unit                 127371
Unknown/missing                                     126758
Same major administrative unit                      115427
Abroad                                              100802
Name: count, dtype: int64

In [None]:
# Build one modelling frame per migration variable (MIGRATE1 and MIGRATE5).
# Both frames go through the same steps, so the steps live in a single helper
# instead of being copy-pasted per column.

# Labels that carry no usable migration information; rows with them are removed.
MIGRATION_EXCLUDED_LABELS = ['NIU (not in universe)', 'Unknown/missing']

# Original migration columns, dropped from both frames once the target is built.
MIGRATION_COLUMNS = ['MIGRATE1', 'MIGRATE5']

# Binarize: only 'Different major administrative unit' is coded 1; every other
# retained category, including 'Abroad', is coded 0.
# NOTE(review): 'Abroad' is grouped with the non-movers here — confirm that this
# is intended for a "provincial migration" target.
map_vals = {
    'Same major, same minor administrative unit': 0,
    'Same major administrative unit': 0,
    'Same major, different minor administrative unit': 0,
    'Different major administrative unit': 1,
    'Abroad': 0,
}


def _binarize_migration(frame, column):
    """Return a filtered copy of ``frame`` with a binary ``mig_provincial`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data containing both columns listed in ``MIGRATION_COLUMNS``.
    column : str
        The migration column ('MIGRATE1' or 'MIGRATE5') to filter on and binarize.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:
        * rows whose ``column`` value is in ``MIGRATION_EXCLUDED_LABELS`` are removed,
        * ``mig_provincial`` holds ``map_vals`` applied to ``column``,
        * both original migration columns are dropped.

    Notes
    -----
    Any label that has no entry in ``map_vals`` yields NaN in ``mig_provincial``
    rather than raising, so inspect the result with
    ``value_counts(dropna=False)``.
    """
    keep = ~frame[column].isin(MIGRATION_EXCLUDED_LABELS)
    return (
        frame.loc[keep]
        .assign(mig_provincial=lambda kept: kept[column].map(map_vals))
        .drop(columns=MIGRATION_COLUMNS)
    )


mig1_data = _binarize_migration(ipums_df_pkl, 'MIGRATE1')
mig5_data = _binarize_migration(ipums_df_pkl, 'MIGRATE5')

In [None]:
# Tabulate the binary migration indicator for each frame, keeping NaN in the
# counts so that any label not covered by map_vals stays visible.
# NOTE(review): the recorded output for this cell is labelled `mig_var`, not
# `mig_provincial`, so it appears to predate the current code — re-run to refresh.
for migration_frame in (mig1_data, mig5_data):
    print(migration_frame['mig_provincial'].value_counts(dropna=False))

mig_var
0    5879434
1      50095
Name: count, dtype: int64
mig_var
0.0    4826203
NaN     977126
1.0     126200
Name: count, dtype: int64
