# Data Analysis
This is a starter notebook for loading the IPUMS data and performing analyses.

In [6]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

True

In [3]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## Load IPUMS Dataset

In [5]:
# Define Parameters
# The API key is taken from the IPUMS_API_KEY environment variable so a real key
# never has to be typed into (and pushed to VCS with) this notebook. The "key"
# placeholder remains the fallback when the variable is unset or empty.
API_KEY = os.environ.get("IPUMS_API_KEY") or "key"
DOWNLOAD_DIR = Path("data")
PKL_EXPORT = True
# Keep the pickle next to the downloaded files instead of repeating the folder name.
PKL_PATH = DOWNLOAD_DIR / "ipums_extract.pkl"

# Extract definition passed to get_ipums_data.
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

In [32]:
# Define Variables
# Requested IPUMS variables, keyed by thematic group. `variables` is the flat
# list (groups in insertion order) that is handed to the extract request.
VARIABLE_GROUPS = {
    "Tech Households": ["PERSONS"],
    "Group Quarters": ["GQ", "GQTYPE", "UNREL"],
    "Global Geography": ["URBAN", "POPDENSGEO1", "POPDENSGEO2"],
    "National Geography": ["GEO1_MZ", "GEO2_MZ"],
    "Household Economic": ["OWNERSHIP"],
    "Utilities": ["ELECTRIC", "WATSUP", "PHONE"],
    "Appliances": ["AUTOS", "MOTORCYCLE", "BIKE", "COMPUTER", "TV", "RADIO"],
    "Dwelling Characteristics": [
        "ROOMS", "BEDROOMS", "TOILET", "FLOOR", "WALL", "ROOF",
    ],
    "Constructed Household": [
        "HHTYPE", "NFAMS", "NCOUPLES", "NMOTHERS", "NFATHERS",
    ],
    "Constructed Family": ["FAMSIZE", "NCHILD", "NCHLT5", "ELDCH", "YNGCH"],
    "Demographic": ["RELATE", "AGE", "SEX", "MARST", "CONSENS"],
    "Fertility and Mortality": [
        "CHBORN", "CHSURV", "CHBORNF", "CHBORNM", "CHSURVF", "CHSURVM",
        "BIRTHSLYR", "BIRTHSURV", "MORTMOT", "MORTFAT",
    ],
    "Nativity and Birthplace": [
        "NATIVITY", "CITIZEN", "NATION", "BPL1_MZ", "BPL2_MZ",
    ],
    "Ethnicity and Language": [
        "RELIGION", "RACE", "SPEAKPORT", "LANGMZ", "MTONGMZ",
    ],
    "Education": ["SCHOOL", "LIT", "EDATTAIN"],
    "Work": ["EMPSTAT", "LABFORCE", "EMPSECT"],
    "Occupation, Industry": ["OCCISCO", "INDGEN"],
    # No other migration variables are requested, as they would likely be too
    # correlated with these two and uninteresting (AH).
    "Global Migration": ["MIGRATE1", "MIGRATE5"],
    "Disability": [
        "DISABLED", "DISEMP", "DISBLND", "DISDEAF", "DISMUTE", "DISLOWR",
        "DISUPPR", "DISMNTL", "DISORIG",
    ],
}

variables = [name for group in VARIABLE_GROUPS.values() for name in group]

In [33]:
# Get IPUMS Data
# Gather the extract definition and the download/pickle settings in one mapping,
# then hand them to the project helper as keyword arguments.
extract_request = dict(
    collection=collection,
    description=description,
    samples=samples,
    variables=variables,
    api_key=API_KEY,
    download_dir=DOWNLOAD_DIR,
    pkl_export=PKL_EXPORT,
    pkl_path=PKL_PATH,
)
ipums_df = get_ipums_data(**extract_request)

Extract submitted to IPUMS. Extract ID: 2.
Waiting for extract to finish processing on IPUMS server...


KeyboardInterrupt: 

In [6]:
# Load from PKL
# Read the data back from PKL_PATH with the project helper; every cell below works
# on `ipums_df_pkl`.
# NOTE(review): presumably this is the pickle written by get_ipums_data when
# pkl_export is enabled — confirm against src.utils.ipums_extract.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [242]:
# Transform to fix NIU, unknown and other issues
# var_dictionaries.txt holds one Python dict expression of the form
# {column_name: {raw_value: replacement}}.
# SECURITY: eval() executes whatever that file contains, so it must stay a trusted,
# version-controlled file. If it only contains plain literals, switch to
# ast.literal_eval (or store it as JSON) — TODO confirm the file's contents.
with open('var_dictionaries.txt', encoding='utf-8') as f:
    var_dicts = eval(f.read())

# Recode each listed column in place; values without an entry in the column's
# mapping are kept unchanged. The mapping is bound once per column rather than
# being looked up in var_dicts for every element.
for column, recode in var_dicts.items():
    ipums_df_pkl[column] = ipums_df_pkl[column].map(
        lambda raw, recode=recode: recode.get(raw, raw)
    )

In [244]:
# Drop unnecessary columns
# errors="ignore" keeps the cell re-runnable: once these columns are gone, a second
# run is a no-op instead of raising KeyError.
ipums_df_pkl = ipums_df_pkl.drop(
    columns=['COUNTRY', 'SAMPLE', 'SERIAL', 'HHWT', 'PERNUM', 'PERWT'],
    errors='ignore',
)

In [245]:
def make_migration_set(frame, response_col):
    """Return the rows of ``frame`` with a non-missing ``response_col``.

    The returned copy carries the response as a new trailing ``mig_provincial``
    column, and both raw migration columns (MIGRATE1, MIGRATE5) are removed.
    """
    # Remove rows whose migration response is missing (NIU / unknown)
    subset = frame.loc[frame[response_col].notna()].copy()
    # Make the new variable used for prediction
    subset['mig_provincial'] = subset[response_col]
    # Drop the raw migration columns now that the response has been copied
    return subset.drop(columns=['MIGRATE1', 'MIGRATE5'])


mig1_data = make_migration_set(ipums_df_pkl, 'MIGRATE1')
mig5_data = make_migration_set(ipums_df_pkl, 'MIGRATE5')

In [246]:
# Response distribution (missing values included) for each migration set, in turn.
for migration_set in (mig1_data, mig5_data):
    response_counts = migration_set['mig_provincial'].value_counts(dropna=False)
    print(response_counts)

mig_provincial
0.0    5860462
1.0      69067
Name: count, dtype: int64
mig_provincial
0.0    4746396
1.0     228173
Name: count, dtype: int64


## Data Analysis

### Value Counts for Each Set
In this section, we print out the value counts for each column in the sets. This
will allow us to identify useless (i.e., all the same value) or unique (i.e.,
will not generalize well) columns for removal.

In [247]:
# Print the value counts of every column in the MIGRATE1 set, one block per column.
separator = '-' * 30
for column_name in mig1_data.columns:
    counts = mig1_data[column_name].value_counts()
    print(counts)
    print(separator)

YEAR
2017    2545554
2007    1931544
1997    1452431
Name: count, dtype: int64
------------------------------
PERSONS
6     979624
5     956894
4     888129
3     686787
7     641334
8     468220
2     398578
9     314093
10    152814
1     140392
11     99157
12     67139
13     39168
14     26461
15     18694
16     13858
17      9780
18      7793
19      4135
20      3105
21      2806
22      2249
24      1468
23      1098
25      1067
26       769
27       696
28       555
30       340
29       320
31       313
35       296
32       213
36       198
39       191
34       150
44       123
33        94
47        92
45        90
37        73
50        48
48        48
41        40
38        37
Name: count, dtype: Int64
------------------------------
GQ
Households                                            5921521
Other group quarters                                     2430
1-person unit created by splitting large household       2325
Institutions                                       

In [248]:
# Print the value counts of every column in the MIGRATE5 set, one block per column.
separator = '-' * 30
for column_name in mig5_data.columns:
    counts = mig5_data[column_name].value_counts()
    print(counts)
    print(separator)

YEAR
2017    2122246
2007    1617537
1997    1234786
Name: count, dtype: int64
------------------------------
PERSONS
6     805890
5     777675
4     714101
3     577308
7     534657
8     394412
2     377002
9     266751
1     139068
10    130368
11     84665
12     57273
13     33300
14     22402
15     15811
16     11717
17      8266
18      6541
19      3466
20      2606
21      2334
22      1909
24      1235
23       929
25       871
26       658
27       593
28       468
30       290
29       270
31       265
35       261
39       180
32       180
36       172
34       137
44       108
33        87
47        75
37        67
50        49
45        47
48        41
41        33
38        31
Name: count, dtype: Int64
------------------------------
GQ
Households                                            4967152
1-person unit created by splitting large household       2267
Other group quarters                                     2187
Institutions                                       

### Crosstabs

In [249]:
# Column count and column names of the MIGRATE5 set. Only a cell's last expression
# is displayed, so the count is printed explicitly instead of being discarded.
print(len(mig5_data.columns))
mig5_data.columns.tolist()

['YEAR',
 'PERSONS',
 'GQ',
 'GQTYPE',
 'UNREL',
 'URBAN',
 'POPDENSGEO1',
 'POPDENSGEO2',
 'GEO1_MZ',
 'GEO2_MZ',
 'OWNERSHIP',
 'OWNERSHIPD',
 'ELECTRIC',
 'WATSUP',
 'PHONE',
 'AUTOS',
 'MOTORCYCLE',
 'BIKE',
 'COMPUTER',
 'TV',
 'RADIO',
 'ROOMS',
 'BEDROOMS',
 'TOILET',
 'FLOOR',
 'WALL',
 'ROOF',
 'HHTYPE',
 'NFAMS',
 'NCOUPLES',
 'NMOTHERS',
 'NFATHERS',
 'RESIDENT',
 'FAMSIZE',
 'NCHILD',
 'NCHLT5',
 'ELDCH',
 'YNGCH',
 'RELATE',
 'RELATED',
 'AGE',
 'SEX',
 'MARST',
 'MARSTD',
 'CONSENS',
 'CHBORN',
 'CHSURV',
 'CHBORNF',
 'CHBORNM',
 'CHSURVF',
 'CHSURVM',
 'BIRTHSLYR',
 'BIRTHSURV',
 'MORTMOT',
 'MORTFAT',
 'NATIVITY',
 'CITIZEN',
 'NATION',
 'BPL1_MZ',
 'BPL2_MZ',
 'RELIGION',
 'RELIGIOND',
 'RACE',
 'SPEAKPORT',
 'LANGMZ',
 'MTONGMZ',
 'SCHOOL',
 'LIT',
 'EDATTAIN',
 'EDATTAIND',
 'EMPSTAT',
 'EMPSTATD',
 'LABFORCE',
 'EMPSECT',
 'OCCISCO',
 'INDGEN',
 'DISABLED',
 'DISEMP',
 'DISBLND',
 'DISDEAF',
 'DISMUTE',
 'DISLOWR',
 'DISUPPR',
 'DISMNTL',
 'DISORIG',
 'mig_provincia

In [250]:
# Row-normalised distribution of mig_provincial within each census YEAR.
pd.crosstab(
    index=mig5_data['YEAR'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0.0,1.0
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
1997,0.891614,0.108386
2007,0.967314,0.032686
2017,0.98046,0.01954


In [251]:
# Screen low-cardinality columns by how much the share of mig_provincial == 1
# varies across their categories.
MAX_CATEGORIES = 20     # columns with more distinct values than this are skipped
MIN_RATE_SPREAD = 0.02  # smallest max-min spread for a column to count as useful
SCREEN_COLUMNS = ['feature', 'max_diff', 'mean_rate']


def screen_features(frame, target='mig_provincial',
                    max_categories=MAX_CATEGORIES, min_spread=MIN_RATE_SPREAD):
    """Split the columns of ``frame`` into useful and less useful predictors.

    Every column other than ``target`` with at most ``max_categories`` distinct
    values is cross-tabulated against ``target`` (row-normalised), which gives the
    share of ``target == 1`` within each category of the column.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing ``target`` and the candidate columns.
    target : str
        Name of the response column; the share of its value ``1`` is used.
    max_categories : int
        Columns with more distinct values than this are skipped.
    min_spread : float
        A column is "useful" when the difference between its largest and smallest
        per-category share is at least this value.

    Returns
    -------
    tuple[list[dict], list[dict]]
        ``(useful, useless)`` records with keys ``feature``, ``max_diff`` (spread
        of the per-category shares) and ``mean_rate`` (unweighted mean of the
        per-category shares, not the overall rate), both rounded to 4 decimals.
    """
    useful, useless = [], []
    for col in frame.columns:
        if col == target or frame[col].nunique() > max_categories:
            continue

        rates = pd.crosstab(frame[col], frame[target], normalize='index')[1]
        spread = rates.max() - rates.min()
        record = {
            'feature': col,
            'max_diff': round(spread, 4),
            'mean_rate': round(rates.mean(), 4),
        }
        (useful if spread >= min_spread else useless).append(record)
    return useful, useless


def rank_by_spread(records):
    """Return the screening records as a frame sorted by ``max_diff``, descending.

    The columns are passed explicitly so that an empty ``records`` list still
    yields a frame with a ``max_diff`` column instead of raising KeyError on sort.
    """
    return (
        pd.DataFrame(records, columns=SCREEN_COLUMNS)
        .sort_values(by="max_diff", ascending=False)
        .reset_index(drop=True)
    )


useful_features, useless_features = screen_features(mig5_data)

useful_df = rank_by_spread(useful_features)
useless_df = rank_by_spread(useless_features)

print("Top useful variables:")
print(useful_df.head(10))

print("\nSample of less useful variables:")
print(useless_df.head(10))



Top useful variables:
    feature  max_diff  mean_rate
0    GQTYPE    0.4828     0.2988
1      RACE    0.4541     0.2005
2   BPL1_MZ    0.3662     0.0748
3  NATIVITY    0.3449     0.2127
4        GQ    0.3389     0.2158
5   CITIZEN    0.3150     0.2015
6     NFAMS    0.3078     0.1626
7     UNREL    0.3061     0.1563
8  EMPSTATD    0.2559     0.0701
9     ROOMS    0.2500     0.0840

Sample of less useful variables:
     feature  max_diff  mean_rate
0  RELIGIOND    0.0199     0.0256
1     SCHOOL    0.0196     0.0457
2   RELIGION    0.0157     0.0248
3     DISEMP    0.0135     0.0445
4       BIKE    0.0133     0.0230
5     MARSTD    0.0125     0.0439
6    DISMUTE    0.0122     0.0193
7  SPEAKPORT    0.0115     0.0462
8   ELECTRIC    0.0110     0.0486
9    DISMNTL    0.0110     0.0514


In [252]:
# Row-normalised distribution of mig_provincial within each GQTYPE category.
pd.crosstab(
    index=mig5_data['GQTYPE'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0.0,1.0
GQTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
1-person unit created by splitting large household,0.732245,0.267755
Boarding school or student housing,0.627479,0.372521
Floating population,0.870494,0.129506
"Hospital, nursing home, hospice, or instutions for persons with disabilities",0.913208,0.086792
"Hotel, pension, lodging, or boarding house",0.617647,0.382353
Military or police institution,0.430451,0.569549
Other group quarters,0.836406,0.163594
"Prisons, reformatories, or correctional institutions",0.5,0.5
"Religious institution, monastery, seminary, or convent",0.782895,0.217105


In [253]:
# Row-normalised distribution of mig_provincial within each GQ category.
pd.crosstab(
    index=mig5_data['GQ'],
    columns=mig5_data['mig_provincial'],
    normalize='index',
)

mig_provincial,0.0,1.0
GQ,Unnamed: 1_level_1,Unnamed: 2_level_1
1-person unit created by splitting large household,0.732245,0.267755
Households,0.954436,0.045564
Institutions,0.615514,0.384486
Other group quarters,0.834476,0.165524


### Process Migration Response