# Data Analysis
This is a starter notebook for loading the IPUMS data and performing analyses.

In [6]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [7]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## Load IPUMS Dataset

In [None]:
# Define Parameters
# The API key is read from the IPUMS_API_KEY environment variable so that no
# secret has to be written into (and accidentally pushed with) the notebook.
# When the variable is unset, the original "###" placeholder is kept as the
# fallback value.
API_KEY = os.environ.get("IPUMS_API_KEY", "###")

# Directory handed to get_ipums_data as `download_dir`.
DOWNLOAD_DIR = Path("data")
# Forwarded to get_ipums_data as `pkl_export` / `pkl_path`; the pickle lives
# inside DOWNLOAD_DIR so the two locations cannot drift apart.
PKL_EXPORT = True
PKL_PATH = DOWNLOAD_DIR / "ipums_extract.pkl"

# Extract definition: IPUMS collection, free-text description, and samples.
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

In [9]:
# Define Variables
# The request list is assembled group by group; the order of the resulting
# flat list is the order in which the names are written below.
variables = (
    # Technical household
    ["PERSONS"]
    # Group quarters
    + ["GQ", "GQTYPE", "UNREL"]
    # Global geography
    + ["URBAN", "POPDENSGEO1", "POPDENSGEO2"]
    # National geography
    + ["GEO1_MZ", "GEO2_MZ"]
    # Household economic
    + ["OWNERSHIP"]
    # Utilities
    + ["ELECTRIC", "WATSUP", "PHONE"]
    # Appliances
    + ["AUTOS", "MOTORCYCLE", "BIKE", "COMPUTER", "TV", "RADIO"]
    # Dwelling characteristics
    + ["ROOMS", "BEDROOMS", "TOILET", "FLOOR", "WALL", "ROOF"]
    # Constructed household
    + ["HHTYPE", "NFAMS", "NCOUPLES", "NMOTHERS", "NFATHERS"]
    # Constructed family
    + ["FAMSIZE", "NCHILD", "NCHLT5", "ELDCH", "YNGCH"]
    # Demographic
    + ["RELATE", "AGE", "SEX", "MARST", "CONSENS"]
    # Fertility and mortality
    + [
        "CHBORN", "CHSURV", "CHBORNF", "CHBORNM", "CHSURVF", "CHSURVM",
        "BIRTHSLYR", "BIRTHSURV", "MORTMOT", "MORTFAT",
    ]
    # Nativity and birthplace
    + ["NATIVITY", "CITIZEN", "NATION", "BPL1_MZ", "BPL2_MZ"]
    # Ethnicity and language
    + ["RELIGION", "RACE", "SPEAKPORT", "LANGMZ", "MTONGMZ"]
    # Education
    + ["SCHOOL", "LIT", "EDATTAIN"]
    # Work
    + ["EMPSTAT", "LABFORCE", "EMPSECT"]
    # Occupation and industry
    + ["OCCISCO", "INDGEN"]
    # Global migration - no other migration variables are requested, as they
    # would likely be too correlated and uninteresting (AH)
    + ["MIGRATE1", "MIGRATE5"]
    # Disability
    + [
        "DISABLED", "DISEMP", "DISBLND", "DISDEAF", "DISMUTE", "DISLOWR",
        "DISUPPR", "DISMNTL", "DISORIG",
    ]
)

In [10]:
# Get IPUMS Data
# Requesting an extract means submitting it to IPUMS, waiting for the server
# to process it, and downloading the result, so a pickle left by an earlier
# run is reused when one exists at PKL_PATH. Set REFRESH_EXTRACT = True to
# force a new extract, e.g. after editing `samples` or `variables`, because
# the cached pickle is not checked against the current extract definition.
REFRESH_EXTRACT = False

if PKL_PATH.exists() and not REFRESH_EXTRACT:
    # NOTE(review): assumes the pickle holds the same DataFrame that
    # get_ipums_data returns — confirm against src.utils.ipums_extract.
    ipums_df = load_ipums_from_pkl(PKL_PATH)
else:
    ipums_df = get_ipums_data(
        collection=collection,
        description=description,
        samples=samples,
        variables=variables,
        api_key=API_KEY,
        download_dir=DOWNLOAD_DIR,
        pkl_export=PKL_EXPORT,
        pkl_path=PKL_PATH
    )

Extract submitted to IPUMS. Extract ID: 14.
Waiting for extract to finish processing on IPUMS server...
Downloading extract to data ...


See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.


Extracting data from extract to DataFrame...
Shape of IPUMS Data Extract: (6283068, 93)
Saving IPUMS DataFrame to data\ipums_extract.pkl ...
IPUMS dataset extraction complete.


In [11]:
# Load from PKL
# Read the extract back from the pickle at PKL_PATH so the data can be used
# without requesting a new extract. Presumably this is the same DataFrame that
# get_ipums_data saved above — TODO confirm against src.utils.ipums_extract.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [18]:
# Preview the loaded extract; the notebook display shows only the first and
# last few rows and elides the middle columns.
ipums_df_pkl

Unnamed: 0,COUNTRY,YEAR,SAMPLE,SERIAL,PERSONS,HHWT,GQ,GQTYPE,UNREL,URBAN,...,MIGRATE5,DISABLED,DISEMP,DISBLND,DISDEAF,DISMUTE,DISLOWR,DISUPPR,DISMNTL,DISORIG
0,508,1997,508199701,1000,5,10.0,10,999,0,2,...,20,2,2,,,,,,2,
1,508,1997,508199701,1000,5,10.0,10,999,0,2,...,11,2,2,,,,,,2,
2,508,1997,508199701,1000,5,10.0,10,999,0,2,...,11,2,9,,,,,,2,
3,508,1997,508199701,1000,5,10.0,10,999,0,2,...,11,2,2,,,,,,2,
4,508,1997,508199701,1000,5,10.0,10,999,0,2,...,11,2,2,,,,,,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283063,508,2017,508201701,614421000,3,10.0,10,,0,2,...,11,2,2,2,2,2,2,2,2,90
6283064,508,2017,508201701,614421000,3,10.0,10,,0,2,...,0,2,9,2,2,2,2,2,2,90
6283065,508,2017,508201701,614422000,3,10.0,10,,0,2,...,11,2,2,2,2,2,2,2,2,90
6283066,508,2017,508201701,614422000,3,10.0,10,,0,2,...,11,2,2,2,2,2,2,2,2,90
