# Predictor Processing

In [1]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## New Data Extract

In [None]:
# Define Parameters
# The API key is read from the IPUMS_API_KEY environment variable so that a real
# credential never has to be typed into (or committed with) the notebook. The
# "key" fallback is only a placeholder for when the variable is not set.
API_KEY = os.environ.get("IPUMS_API_KEY", "key")

# Local storage: download folder for the extract and the pickle written next to it.
DOWNLOAD_DIR = Path("data")
PKL_EXPORT = True
PKL_PATH = DOWNLOAD_DIR / "ipums_extract.pkl"

# Extract definition passed to get_ipums_data.
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

# Variables requested in the extract; MIGRATE1 / MIGRATE5 are the columns the
# prediction target is derived from further down.
variables = [
    'PERSONS', 'GQ', 'URBAN', 'GEO1_MZ', 'GEO2_MZ', 'OWNERSHIP',
    'PHONE', 'AUTOS', 'ROOMS', 'HHTYPE', 'RESIDENT', 'FAMSIZE',
    'NCHILD', 'AGE', 'SEX', 'MARST', 'MORTMOT', 'MORTFAT',
    'NATIVITY', 'CITIZEN', 'BPL1_MZ', 'SCHOOL', 'LIT',
    'EDATTAIN', 'EMPSTAT', 'LABFORCE', 'MIGRATE1', 'MIGRATE5',
]

In [None]:
# Get IPUMS Data
# Hand the extract definition and storage settings from the parameter cell to
# the project helper and keep whatever it returns as ipums_df.
ipums_df = get_ipums_data(
    # credentials
    api_key=API_KEY,
    # what to request
    collection=collection,
    samples=samples,
    variables=variables,
    description=description,
    # where to store the result
    download_dir=DOWNLOAD_DIR,
    pkl_path=PKL_PATH,
    pkl_export=PKL_EXPORT,
)

Extract submitted to IPUMS. Extract ID: 9.
Waiting for extract to finish processing on IPUMS server...


In [None]:
# Load Data from PKL
# Reads the file at PKL_PATH through the project helper and binds the result to
# ipums_df_pkl, which the following cells treat as a pandas DataFrame.
# NOTE(review): presumably this is the pickle written by get_ipums_data when
# PKL_EXPORT is enabled - confirm in src.utils.ipums_extract. If the helper
# unpickles the file, only load pickles you produced yourself.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [None]:
# Transform to fix NIU, unknown and other issues
# NOTE(review): exec() runs the contents of var_dictionaries.txt as Python code,
# so only use a file you trust. If the file turns out to be a plain literal,
# ast.literal_eval would be a safer way to load it.
var_dicts = ''  # placeholder; rebound by the exec() call below
with open('var_dictionaries.txt', encoding='utf-8') as dict_file:
    exec("var_dicts = " + dict_file.read())

# Recode every column that has a dictionary; values without an entry in that
# dictionary are passed through unchanged.
for column, recode in var_dicts.items():
    ipums_df_pkl[column] = ipums_df_pkl[column].map(
        lambda code: recode.get(code, code)
    )

In [None]:
# Build one modelling frame per migration question (MIGRATE1 and MIGRATE5)
def _migration_target_frame(frame, source_col):
    """Return a copy of the rows with a non-missing ``source_col`` plus a target column.

    Rows where ``source_col`` is NaN are left out (presumably the NIU / unknown
    codes that were recoded to NaN earlier - confirm against var_dictionaries.txt).
    The remaining values are copied into ``mig_provincial`` and both raw
    migration columns are dropped from the result.
    """
    kept_rows = frame.loc[frame[source_col].notna()].copy()
    # New target variable for prediction
    kept_rows['mig_provincial'] = kept_rows[source_col]
    # Drop the raw migration columns now that the target exists
    return kept_rows.drop(columns=['MIGRATE1', 'MIGRATE5'])


mig1_data = _migration_target_frame(ipums_df_pkl, 'MIGRATE1')
mig5_data = _migration_target_frame(ipums_df_pkl, 'MIGRATE5')

In [None]:
# Target distribution in each frame; dropna=False means NaN would be counted too
for target_frame in (mig1_data, mig5_data):
    print(target_frame['mig_provincial'].value_counts(dropna=False))

mig_provincial
0.0    5860462
1.0      69067
Name: count, dtype: int64
mig_provincial
0.0    4746396
1.0     228173
Name: count, dtype: int64


## Drop Columns (update extract later)

In [None]:
# Drop Metadata Columns
# The result is rebound instead of using inplace=True, and errors='ignore' is
# passed, so this cell can be re-run safely: a second run no longer raises
# KeyError for columns that were already removed by the first run.
metadata_cols = ['COUNTRY', 'SAMPLE', 'SERIAL', 'HHWT', 'PERNUM', 'PERWT']
mig1_data = mig1_data.drop(columns=metadata_cols, errors='ignore')
mig5_data = mig5_data.drop(columns=metadata_cols, errors='ignore')