# Association Analysis
This notebook contains work done for association analysis on the Mozambique
dataset from IPUMS.

Do not attempt to run this notebook end to end: it takes around 24 hours and may run into memory issues. The final CSVs are in the shared folder, and I have pickle files of all the intermediate data structures if we need them. - Arlan

In [None]:
# Load Dependencies
from pathlib import Path
import pandas as pd
import numpy as np
from efficient_apriori import apriori # https://pypi.org/project/efficient-apriori/

# Load Custom Scripts
from src.utils.ipums_extract import (
    load_ipums_from_pkl,
)

# Location of the pickled IPUMS extract, relative to the working directory
PKL_PATH = Path("data") / "mozambique.pkl"

In [8]:
# Read the two frames stored in the pickle; they carry the MIGRATE1 and
# MIGRATE5 flag columns respectively (used by the filtering cell below).
mig1_data, mig5_data = load_ipums_from_pkl(PKL_PATH)

# Report (rows, columns) for each frame
for frame in (mig1_data, mig5_data):
    print(frame.shape)

(5929529, 66)
(4974569, 66)


In [None]:
# Itemsets are mined from the records whose migration flag equals 1 only; the
# remaining records come back into play when the rule metrics are calculated.
# The flag column itself is removed from the filtered frames.
mig1_data_filtered = (
    mig1_data
    .loc[mig1_data['MIGRATE1'].eq(1)]
    .drop(columns='MIGRATE1')
    .copy()
)
mig5_data_filtered = (
    mig5_data
    .loc[mig5_data['MIGRATE5'].eq(1)]
    .drop(columns='MIGRATE5')
    .copy()
)

# Report (rows, columns) of each filtered frame
for frame in (mig1_data_filtered, mig5_data_filtered):
    print(frame.shape)

(69067, 65)
(228173, 65)


In [None]:
# Format the data as a list of tuples of column names for the apriori algorithm:
# each record becomes the tuple of the columns whose value equals 1.
def _rows_to_transactions(frame):
    """Return one tuple per row of `frame` with the labels of the columns equal to 1.

    The boolean mask is computed for the whole frame at once rather than
    building a Series per row with `iterrows`; the resulting tuples (and their
    order) are the same, but this avoids the per-row overhead on large frames.
    """
    labels = frame.columns.to_numpy()
    is_one = (frame == 1).to_numpy()
    return [tuple(labels[row_mask].tolist()) for row_mask in is_one]


transactions1 = _rows_to_transactions(mig1_data_filtered)
transactions5 = _rows_to_transactions(mig5_data_filtered)

In [74]:
# Run apriori over the MIGRATE1 transactions with a 0.1 minimum support.
# Only `itemsets1` is consumed by the metrics cell further down.
itemsets1, rules1 = apriori(
    transactions1,
    min_support=0.1,
)

In [None]:
# Run apriori over the MIGRATE5 transactions in two passes because of memory
# limitations; this pass covers the first 114000 transactions.
itemsets5a, rules5a = apriori(
    transactions5[:114000],
    min_support=0.1,
)

In [None]:
# Second apriori pass for MIGRATE5 (split because of memory limitations):
# every transaction from position 114000 onwards.
itemsets5b, rules5b = apriori(
    transactions5[114000:],
    min_support=0.1,
)

In [None]:
# Compile list of rules and metrics for {x} -> MIGRATE1
# `support` (taken from the apriori output) is divided by record counts below,
# i.e. it is treated as the number of migrant records containing the itemset.
# NOTE(review): confirm that efficient_apriori reports counts, not fractions.
results1 = []
col_avg = mig1_data.mean()
total_count = len(mig1_data)
mig_count = len(mig1_data_filtered)
migrant_share = mig_count / total_count  # share of records with MIGRATE1 == 1; loop-invariant

counter = 0

for itemset_size, list_of_itemsets in itemsets1.items():
    for itemset, support in list_of_itemsets.items():
        itemset = list(itemset)  # a list is needed to select several columns at once
        if counter % 1000 == 0:
            print(counter)  # progress marker every 1000 itemsets
        # Records (migrant or not) in which every item of the itemset equals 1
        count_itemset = (mig1_data[itemset] == 1).all(axis=1).sum()
        confidence = support / count_itemset
        lift = confidence / migrant_share
        # NOTE(review): the numerator is the support within migrants
        # (support / mig_count) while the denominator also multiplies by the
        # MIGRATE1 average; confirm this is the intended definition of interest.
        interest = (support / mig_count) / (np.prod([col_avg[col] for col in itemset]) * col_avg['MIGRATE1'])

        results1.append([itemset, support, confidence, lift, interest])
        counter += 1

# Write to csv. These are the MIGRATE1 results, so they get their own file:
# the previous name 'mig5_final_analysis.csv' is the path the MIGRATE5 combine
# step writes to, which would overwrite this output.
results1_df = pd.DataFrame(results1, columns=['itemset', 'support', 'confidence', 'lift', 'interest'])
results1_df.to_csv('mig1_final_analysis.csv')

In [None]:
# Compile list of rules and metrics for {x} -> MIGRATE5
# To avoid crashes and memory issues intermediate results are written out every 10k rules
results5 = []
col_avg = mig5_data.mean()
total_count = len(mig5_data)
mig_count = len(mig5_data_filtered)
migrant_share = mig_count / total_count  # share of records with MIGRATE5 == 1; loop-invariant
result_columns = ['itemset', 'support', 'confidence', 'lift', 'interest']
checkpoint_every = 10000  # number of rules per intermediate csv

# Candidate itemsets are the union of the itemsets found in the two apriori passes.
# NOTE(review): the support of each candidate is recomputed on all migrant
# records below, but candidates are not re-filtered against the 0.1 threshold,
# so the output may contain itemsets that fall below it overall.
setA = {itemset for list_of_itemsets in itemsets5a.values() for itemset in list_of_itemsets}
setB = {itemset for list_of_itemsets in itemsets5b.values() for itemset in list_of_itemsets}
itemsets = setA | setB
print(len(itemsets))

counter = 0

for itemset in itemsets:
    itemset = list(itemset)  # a list is needed to select several columns at once
    if counter % 1000 == 0:
        print(counter)  # progress marker every 1000 itemsets

    # Count over migrant records only, then over all records
    support = (mig5_data_filtered[itemset] == 1).all(axis=1).sum()
    count_itemset = (mig5_data[itemset] == 1).all(axis=1).sum()
    confidence = support / count_itemset
    lift = confidence / migrant_share
    # NOTE(review): the numerator is the support within migrants
    # (support / mig_count) while the denominator also multiplies by the
    # MIGRATE5 average; confirm this is the intended definition of interest.
    interest = (support / mig_count) / (np.prod([col_avg[col] for col in itemset]) * col_avg['MIGRATE5'])

    results5.append([itemset, support, confidence, lift, interest])
    counter += 1

    # Checkpoint: write the rules gathered since the last checkpoint and start a new batch
    if counter % checkpoint_every == 0:
        results5_df = pd.DataFrame(results5, columns=result_columns)
        results5_df.to_csv(f'aa_mig5_portion{counter}.csv')
        results5 = []

# Write the remaining rules. Skipped when nothing is pending: if the number of
# itemsets is an exact multiple of the checkpoint size, an unconditional write
# here would replace the last checkpoint file (same name) with an empty frame.
if results5:
    results5_df = pd.DataFrame(results5, columns=result_columns)
    results5_df.to_csv(f'aa_mig5_portion{counter}.csv')
    results5 = []

In [None]:
# Combine the intermediate MIGRATE5 csv files into a single csv
portion_size = 10000      # rules per intermediate file
total_itemsets = 137134   # counter value in the name of the last (partial) intermediate file
output_csv_path = 'mig5_final_analysis.csv'

# One file per full portion, plus the trailing partial portion when there is one
checkpoints = list(range(portion_size, total_itemsets + 1, portion_size))
if total_itemsets % portion_size:
    checkpoints.append(total_itemsets)
all_files = [f'aa_mig5_portion{n}.csv' for n in checkpoints]

# NOTE(review): the intermediate files were saved together with their DataFrame
# index, so read_csv returns it as an extra 'Unnamed: 0' column. It is kept
# here so the layout of the combined file does not change.
df_list = [pd.read_csv(file) for file in all_files]

combined_df = pd.concat(df_list, ignore_index=True)
combined_df.to_csv(output_csv_path, index=False)

# Summary line matching the output recorded for this cell
print(f"Successfully combined {len(df_list)} CSV files into {output_csv_path}")

Successfully combined 14 CSV files into mig5_final_analysis.csv
