# Data Analysis
This is a starter notebook for loading the IPUMS data and performing analyses.

In [29]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [30]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## Load IPUMS Dataset

In [31]:
# Define Parameters
# The IPUMS API key is a secret: it is read from the IPUMS_API_KEY environment
# variable so that it never ends up in the notebook or in version control.
# Any key that was ever committed to this notebook should be treated as
# compromised and rotated in the IPUMS account settings.
API_KEY = os.environ.get("IPUMS_API_KEY", "")
if not API_KEY:
    # Do not fail here: the key is only needed when a new extract is requested.
    print("WARNING: IPUMS_API_KEY is not set; requesting a new IPUMS extract will fail.")

# Folder handed to the download step, and the location of the pickled extract
DOWNLOAD_DIR = Path(r"data")
PKL_EXPORT = True
PKL_PATH = Path(r"data/ipums_extract.pkl")

# Extract definition: IPUMS collection, free-text description, and samples
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

In [32]:
# Define Variables
# Variables to request, grouped by topic. The order of the groups, and of the
# names inside each group, is the order of the flattened `variables` list.
variable_groups = {
    "Tech Households": ["PERSONS"],
    "Group Quarters": ["GQ", "GQTYPE", "UNREL"],
    "Global Geography": ["URBAN", "POPDENSGEO1", "POPDENSGEO2"],
    "National Geography": ["GEO1_MZ", "GEO2_MZ"],
    "Household Economic": ["OWNERSHIP"],
    "Utilities": ["ELECTRIC", "WATSUP", "PHONE"],
    "Appliances": ["AUTOS", "MOTORCYCLE", "BIKE", "COMPUTER", "TV", "RADIO"],
    "Dwelling Characteristics": [
        "ROOMS", "BEDROOMS", "TOILET", "FLOOR", "WALL", "ROOF",
    ],
    "Constructed Household": [
        "HHTYPE", "NFAMS", "NCOUPLES", "NMOTHERS", "NFATHERS",
    ],
    "Constructed Family": ["FAMSIZE", "NCHILD", "NCHLT5", "ELDCH", "YNGCH"],
    "Demographic": ["RELATE", "AGE", "SEX", "MARST", "CONSENS"],
    "Fertility and Mortality": [
        "CHBORN", "CHSURV", "CHBORNF", "CHBORNM", "CHSURVF", "CHSURVM",
        "BIRTHSLYR", "BIRTHSURV", "MORTMOT", "MORTFAT",
    ],
    "Nativity and Birthplace": [
        "NATIVITY", "CITIZEN", "NATION", "BPL1_MZ", "BPL2_MZ",
    ],
    "Ethnicity and Language": [
        "RELIGION", "RACE", "SPEAKPORT", "LANGMZ", "MTONGMZ",
    ],
    "Education": ["SCHOOL", "LIT", "EDATTAIN"],
    "Work": ["EMPSTAT", "LABFORCE", "EMPSECT"],
    "Occupation, Industry": ["OCCISCO", "INDGEN"],
    # Deliberately no other migration variables: they would likely be too
    # correlated and uninteresting (AH)
    "Global Migration": ["MIGRATE1", "MIGRATE5"],
    "Disability": [
        "DISABLED", "DISEMP", "DISBLND", "DISDEAF", "DISMUTE", "DISLOWR",
        "DISUPPR", "DISMNTL", "DISORIG",
    ],
}

# Flatten the groups into the single list used for the extract request
variables = [name for group in variable_groups.values() for name in group]

In [33]:
# Get IPUMS Data
# Collect every setting for the extract request in one place, then submit it.
# Per the function's own log output, the call waits while the IPUMS server
# processes the extract.
extract_request = dict(
    collection=collection,
    description=description,
    samples=samples,
    variables=variables,
    api_key=API_KEY,
    download_dir=DOWNLOAD_DIR,
    pkl_export=PKL_EXPORT,
    pkl_path=PKL_PATH,
)
ipums_df = get_ipums_data(**extract_request)

Extract submitted to IPUMS. Extract ID: 2.
Waiting for extract to finish processing on IPUMS server...


KeyboardInterrupt: 

In [None]:
# Load from PKL
# Reads the extract stored at PKL_PATH; every later cell works from this frame.
# NOTE(review): presumably this unpickles the file - only load pickle files
# you produced yourself, since unpickling untrusted data can execute code.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [None]:
# Labels that carry no migration answer; rows holding them are excluded
NON_RESPONSE_LABELS = ['NIU (not in universe)', 'Unknown/missing']

# Binarize: 1 = different major administrative unit, 0 = every other answered
# category (note that 'Abroad' is deliberately coded 0 as well)
map_vals = {'Same major, same minor administrative unit':0,
            'Same major administrative unit':0,
            'Same major, different minor administrative unit':0,
            'Different major administrative unit':1,
            'Abroad':0}


def binarize_migration(frame, migrate_col):
    """Build one modelling set with a binary `mig_provincial` response.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the 'MIGRATE1' and 'MIGRATE5' columns.
    migrate_col : str
        The migration column the response is derived from.

    Returns
    -------
    pandas.DataFrame
        A copy of `frame` without the non-response rows of `migrate_col`,
        with `mig_provincial` added and both original migration columns
        removed. `frame` itself is not modified.

    Raises
    ------
    ValueError
        If `migrate_col` holds a label that `map_vals` does not cover; such
        labels would otherwise silently become NaN in the response.
    """
    # Remove NIU and unknown rows; copy so the source frame stays untouched
    answered = frame[~frame[migrate_col].isin(NON_RESPONSE_LABELS)].copy()

    # Missing values are passed through unchanged; only real labels are checked
    unmapped = set(answered[migrate_col].dropna().unique()) - set(map_vals)
    if unmapped:
        raise ValueError(
            f"{migrate_col} has labels without a mapping: {sorted(map(str, unmapped))}"
        )

    answered['mig_provincial'] = answered[migrate_col].map(map_vals)

    # Remove original migrate vars so they cannot leak into the features
    return answered.drop(['MIGRATE1', 'MIGRATE5'], axis=1)


mig1_data = binarize_migration(ipums_df_pkl, 'MIGRATE1')
mig5_data = binarize_migration(ipums_df_pkl, 'MIGRATE5')

In [None]:
# Class balance of the response in each set; dropna=False makes any value the
# mapping left as missing show up in the counts
for migration_set in (mig1_data, mig5_data):
    response_counts = migration_set['mig_provincial'].value_counts(dropna=False)
    print(response_counts)

mig_provincial
0    5879434
1      50095
Name: count, dtype: int64
mig_provincial
0    4847198
1     127371
Name: count, dtype: int64


## Data Analysis

### Value Counts for Each Set
In this section, we print the value counts for each column in both sets. This
lets us identify columns to remove: constant ones (every row has the same
value, so they carry no information) and near-unique ones (e.g., identifiers,
which will not generalize well).

In [None]:
# Print the frequency table of every column in the MIGRATE1 set, one divider
# line after each table
divider = '-' * 30
for column_name in mig1_data.columns:
    column_counts = mig1_data[column_name].value_counts()
    print(column_counts)
    print(divider)

COUNTRY
Mozambique    5929529
Name: count, dtype: int64
------------------------------
YEAR
2017    2545554
2007    1931544
1997    1452431
Name: count, dtype: int64
------------------------------
SAMPLE
Mozambique 2017    2545554
Mozambique 2007    1931544
Mozambique 1997    1452431
Name: count, dtype: int64
------------------------------
SERIAL
26886000     58
342270000    56
261828000    52
442445000    50
261007000    50
             ..
572222000     1
515565000     1
614356000     1
614348000     1
478224000     1
Name: count, Length: 612509, dtype: Int64
------------------------------
PERSONS
6     979624
5     956894
4     888129
3     686787
7     641334
8     468220
2     398578
9     314093
10    152814
1     140392
11     99157
12     67139
13     39168
14     26461
15     18694
16     13858
17      9780
18      7793
19      4135
20      3105
21      2806
22      2249
24      1468
23      1098
25      1067
26       769
27       696
28       555
30       340
29       320
31  

In [None]:
# Print the frequency table of every column in the MIGRATE5 set, one divider
# line after each table
divider = '-' * 30
for column_name in mig5_data.columns:
    column_counts = mig5_data[column_name].value_counts()
    print(column_counts)
    print(divider)

COUNTRY
Mozambique    4974569
Name: count, dtype: int64
------------------------------
YEAR
2017    2122246
2007    1617537
1997    1234786
Name: count, dtype: int64
------------------------------
SAMPLE
Mozambique 2017    2122246
Mozambique 2007    1617537
Mozambique 1997    1234786
Name: count, dtype: int64
------------------------------
SERIAL
26886000     59
261007000    49
442445000    49
361255000    48
115823000    47
             ..
590466000     1
590464000     1
590461000     1
590484000     1
590477000     1
Name: count, Length: 609525, dtype: Int64
------------------------------
PERSONS
6     805890
5     777675
4     714101
3     577308
7     534657
8     394412
2     377002
9     266751
1     139068
10    130368
11     84665
12     57273
13     33300
14     22402
15     15811
16     11717
17      8266
18      6541
19      3466
20      2606
21      2334
22      1909
24      1235
23       929
25       871
26       658
27       593
28       468
30       290
29       270
31  

### Crosstabs

In [None]:
# A notebook cell only displays its last expression, so the column count is
# printed explicitly; the list of names is still shown as the cell's result.
mig5_columns = mig5_data.columns.tolist()
print(f"mig5_data has {len(mig5_columns)} columns")
mig5_columns

['COUNTRY',
 'YEAR',
 'SAMPLE',
 'SERIAL',
 'PERSONS',
 'HHWT',
 'GQ',
 'GQTYPE',
 'UNREL',
 'URBAN',
 'POPDENSGEO1',
 'POPDENSGEO2',
 'GEO1_MZ',
 'GEO2_MZ',
 'OWNERSHIP',
 'OWNERSHIPD',
 'ELECTRIC',
 'WATSUP',
 'PHONE',
 'AUTOS',
 'MOTORCYCLE',
 'BIKE',
 'COMPUTER',
 'TV',
 'RADIO',
 'ROOMS',
 'BEDROOMS',
 'TOILET',
 'FLOOR',
 'WALL',
 'ROOF',
 'HHTYPE',
 'NFAMS',
 'NCOUPLES',
 'NMOTHERS',
 'NFATHERS',
 'PERNUM',
 'PERWT',
 'RESIDENT',
 'FAMSIZE',
 'NCHILD',
 'NCHLT5',
 'ELDCH',
 'YNGCH',
 'RELATE',
 'RELATED',
 'AGE',
 'SEX',
 'MARST',
 'MARSTD',
 'CONSENS',
 'CHBORN',
 'CHSURV',
 'CHBORNF',
 'CHBORNM',
 'CHSURVF',
 'CHSURVM',
 'BIRTHSLYR',
 'BIRTHSURV',
 'MORTMOT',
 'MORTFAT',
 'NATIVITY',
 'CITIZEN',
 'NATION',
 'BPL1_MZ',
 'BPL2_MZ',
 'RELIGION',
 'RELIGIOND',
 'RACE',
 'SPEAKPORT',
 'LANGMZ',
 'MTONGMZ',
 'SCHOOL',
 'LIT',
 'EDATTAIN',
 'EDATTAIND',
 'EMPSTAT',
 'EMPSTATD',
 'LABFORCE',
 'EMPSECT',
 'OCCISCO',
 'INDGEN',
 'DISABLED',
 'DISEMP',
 'DISBLND',
 'DISDEAF',
 'DISMUTE',

In [34]:
# Share of each response value within every census year
# (normalize='index' scales each row to sum to 1)
pd.crosstab(
    index=mig5_data['YEAR'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0,1
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
1997,0.956966,0.043034
2007,0.974729,0.025271
2017,0.984283,0.015717


In [38]:
# Screening thresholds
MAX_LEVELS = 20          # columns with more distinct values than this are skipped
MIN_RATE_SPREAD = 0.02   # minimum spread in migrant share for a column to count as useful
FEATURE_COLUMNS = ['feature', 'max_diff', 'mean_rate']


def screen_features(frame, target='mig_provincial',
                    max_levels=MAX_LEVELS, min_spread=MIN_RATE_SPREAD):
    """Split low-cardinality columns by how much the migrant share varies.

    For every column other than `target` with at most `max_levels` distinct
    values, the share of rows with `target == 1` is computed per level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding the candidate columns and the binary `target` column.
    target : str
        Name of the 0/1 response column.
    max_levels : int
        Columns with more distinct values than this are skipped.
    min_spread : float
        A column is useful when (max share - min share) >= this value.

    Returns
    -------
    tuple[list[dict], list[dict]]
        (useful, useless) records with keys 'feature', 'max_diff' and
        'mean_rate' (the unweighted mean of the per-level shares).
    """
    useful, useless = [], []

    for col in frame.columns:
        if col == target or frame[col].nunique() > max_levels:
            continue

        tab = pd.crosstab(frame[col], frame[target], normalize='index')
        if tab.empty:
            # Nothing to compare (e.g. the column is entirely missing)
            continue

        # When no level contains a migrant the table has no column 1;
        # the share is then 0 for every level instead of a KeyError.
        rate = tab[1] if 1 in tab.columns else pd.Series(0.0, index=tab.index)

        max_diff = rate.max() - rate.min()
        record = {
            'feature': col,
            'max_diff': round(max_diff, 4),
            'mean_rate': round(rate.mean(), 4),
        }
        (useful if max_diff >= min_spread else useless).append(record)

    return useful, useless


def rank_features(records):
    """Return the records as a frame sorted by 'max_diff', largest first.

    The columns are fixed up front so that an empty record list yields an
    empty frame instead of failing on the sort key.
    """
    return (
        pd.DataFrame(records, columns=FEATURE_COLUMNS)
        .sort_values(by="max_diff", ascending=False)
        .reset_index(drop=True)
    )


useful_features, useless_features = screen_features(mig5_data)

useful_df = rank_features(useful_features)
useless_df = rank_features(useless_features)

print("Top useful variables:")
print(useful_df.head(10))

print("\nSample of less useful variables:")
print(useless_df.head(10))



Top useful variables:
    feature  max_diff  mean_rate
0    GQTYPE    0.5333     0.2149
1        GQ    0.3215     0.1705
2     NFAMS    0.2800     0.1243
3     UNREL    0.2787     0.1193
4  EMPSTATD    0.2596     0.0464
5      HHWT    0.2537     0.0591
6      BIKE    0.2387     0.0951
7    TOILET    0.2386     0.0667
8      ROOF    0.2386     0.0672
9      WALL    0.2386     0.0528

Sample of less useful variables:
    feature  max_diff  mean_rate
0    NCHILD    0.0193     0.0175
1  DISABLED    0.0164     0.0184
2   DISMNTL    0.0163     0.0189
3  LABFORCE    0.0158     0.0283
4    MARSTD    0.0157     0.0247
5  NATIVITY    0.0157     0.0182
6   CONSENS    0.0157     0.0259
7    DISEMP    0.0153     0.0241
8     MARST    0.0144     0.0257
9  RESIDENT    0.0138     0.0320


In [39]:
# Share of each response value within every GQTYPE category (rows sum to 1)
pd.crosstab(
    index=mig5_data['GQTYPE'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0,1
GQTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
1-person unit created by splitting large household,0.74592,0.25408
Boarding school or student housing,0.681303,0.318697
Floating population,0.918558,0.081442
"Hospital, nursing home, hospice, or instutions for persons with disabilities",0.950943,0.049057
"Hotel, pension, lodging, or boarding house",0.772059,0.227941
Military or police institution,0.434211,0.565789
NIU (not in universe),0.967511,0.032489
Other group quarters,0.859447,0.140553
"Prisons, reformatories, or correctional institutions",0.564516,0.435484
"Religious institution, monastery, seminary, or convent",0.842105,0.157895


In [40]:
# Share of each response value within every GQ category (rows sum to 1)
pd.crosstab(
    index=mig5_data['GQ'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0,1
GQ,Unnamed: 1_level_1,Unnamed: 2_level_1
1-person unit created by splitting large household,0.74592,0.25408
Households,0.974677,0.025323
Institutions,0.653176,0.346824
Other group quarters,0.874257,0.125743
Unknown/group quarters not identified,0.899493,0.100507


### Process Migration Response