In [1]:
# Imports
from scipy.stats import spearmanr
from itertools import combinations
from datetime import datetime

import pandas as pd

In [48]:
def is_dependent(df, attr1, attr2, threshold=0.8):
    X, Y = df[attr1], df[attr2]
    corr, pvalue = spearmanr(X, Y)
    
    # Attributes are likely dependent if >= threshold
    return abs(corr) >= threshold
    

In [2]:
# Read from data source
DATA_SOURCE = r'../data/census_sanitized.csv'
df = pd.read_csv(DATA_SOURCE)

# Read date as datetime object using MM/DD/YYYY format, convert to timestamp
df['date-timestamp'] = df['date'].apply(
    lambda x: datetime.strptime(x, '%m/%d/%Y').timestamp()
)

# We change 'date' to 'date-timestamp' so that the date can be 
# considered a continious number
nonnominal_attributes = ['date-timestamp',
                         'population-wgt',
                         'education-num',
                         'capital-gain',
                         'capital-loss']

In [49]:
# Iterate through combinations, determine dependence
for c in combinations(nonnominal_attributes, 2):
    X, Y = df[c[0]], df[c[1]]
    print(f'{str(c[0]) + " & " + str(c[1]):<35}: {is_dependent(df, *c)}')



date-timestamp & population-wgt    : False
date-timestamp & education-num     : False
date-timestamp & capital-gain      : False
date-timestamp & capital-loss      : False
population-wgt & education-num     : False
population-wgt & capital-gain      : False
population-wgt & capital-loss      : False
education-num & capital-gain       : False
education-num & capital-loss       : False
capital-gain & capital-loss        : False


In [35]:
from datetime import datetime
d = df['date'][0]

print(d)

05/01/1994


In [37]:
dt = datetime.strptime(d, '%m/%d/%Y')

In [40]:
type(dt.timestamp())

float

In [42]:
df['date']

0        05/01/1994
1        02/14/1994
2        01/21/1994
3        01/21/1994
4        01/21/1994
            ...    
32556    02/14/1994
32557    05/01/1994
32558    02/14/1994
32559    02/14/1994
32560    02/14/1994
Name: date, Length: 32561, dtype: object

In [44]:
df

Unnamed: 0.1,Unnamed: 0,date,age,workclass,population-wgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,over-under-50k,date-timestamp
0,0,05/01/1994,3,State-gov,0.044302,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,1,United-States,<=50K,767768400.0
1,1,02/14/1994,4,Self-emp-not-inc,0.048238,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K,761205600.0
2,2,01/21/1994,2,Private,0.138113,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,1,United-States,<=50K,759132000.0
3,3,01/21/1994,4,Private,0.151068,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,1,United-States,<=50K,759132000.0
4,4,01/21/1994,1,Private,0.221488,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,1,Cuba,<=50K,759132000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,32556,02/14/1994,1,Private,0.166404,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,1,United-States,<=50K,761205600.0
32557,32557,05/01/1994,3,Private,0.096500,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,1,United-States,>50K,767768400.0
32558,32558,02/14/1994,5,Private,0.094827,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,1,United-States,<=50K,761205600.0
32559,32559,02/14/1994,0,Private,0.128499,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,0,United-States,<=50K,761205600.0
