In [1]:
import os
import pickle
from ucimlrepo import fetch_ucirepo

def fetch_census_data(local_file_path='./data/census_data.pkl', dataset_id=117):
    """Return a UCI ML Repository dataset, using a local pickle file as a cache.

    If ``local_file_path`` already exists, the dataset is unpickled from it and
    no download happens. Otherwise the dataset is downloaded with
    ``fetch_ucirepo`` and pickled to ``local_file_path`` for later calls.

    Args:
        local_file_path: Location of the pickle cache. Missing parent
            directories are created before the cache is written.
        dataset_id: Dataset id passed to ``fetch_ucirepo`` (defaults to 117,
            the id this notebook uses).

    Returns:
        The object returned by ``fetch_ucirepo`` (or unpickled from the
        cache), or ``None`` when the fetch returned a falsy value.
    """
    if os.path.exists(local_file_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only keep cache files here that this function wrote itself.
        with open(local_file_path, 'rb') as file:
            return pickle.load(file)

    # No cache yet: fetch dataset from UCIMLRepo.
    dataset = fetch_ucirepo(id=dataset_id)
    if not dataset:
        print("Failed to fetch data from UCIMLRepo.")
        return None

    # The cache directory may not exist on a fresh checkout; without this the
    # open() below raises FileNotFoundError after the download has finished.
    cache_dir = os.path.dirname(local_file_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Write to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated pickle that later calls would try to load.
    tmp_path = f"{local_file_path}.tmp"
    with open(tmp_path, 'wb') as file:
        pickle.dump(dataset, file)
    os.replace(tmp_path, local_file_path)
    print("Data saved to:", local_file_path)

    return dataset

# Example usage
# Example usage: load the dataset (fetched when no local cache file exists,
# otherwise read from the pickle cache) and report the features table size.
dataset = fetch_census_data()
if dataset is not None:
    print("Dataset loaded successfully.")
    # The value printed below is the (rows, columns) shape, so the label says
    # "shape" rather than "head".
    print("Features DataFrame shape:")
    print(dataset.data.features.shape)
    

Data saved to: ./data/census_data.pkl
Dataset loaded successfully.
Features DataFrame head:
(199523, 41)


In [4]:
# Top-level sections of the fetched dataset object.
print(dataset.keys())
# Sections nested under 'data'; left as the last expression so the notebook
# echoes it as the cell output.
dataset.data.keys()


dict_keys(['data', 'metadata', 'variables'])


dict_keys(['ids', 'features', 'targets', 'original', 'headers'])

In [6]:
# Per-variable metadata table (name, role, type, description, missing values).
dataset.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,AAGE,Feature,Integer,Age,age,,no
1,ACLSWKR,Feature,Categorical,,class of worker,,no
2,ADTINK,Feature,Integer,,industry code,,no
3,ADTOCC,Feature,Integer,Occupation,occupation code,,no
4,AHGA,Feature,Integer,Education Level,education,,no
5,AHSCOL,Feature,Categorical,Education Level,enrolled in edu last week,,no
6,AMARITL,Feature,Categorical,Marital Status,marital status,,no
7,AMJIND,Feature,Categorical,,major industry code,,no
8,AMJOCC,Feature,Categorical,Occupation,major occupation code,,no
9,ARACE,Feature,Categorical,Race,race,,no


In [8]:
# The 'original' frame, which here includes the `income` column alongside the
# feature columns.
dataset.data.original

Unnamed: 0,AAGE,ACLSWKR,ADTINK,ADTOCC,AHGA,AHRSPAY,AHSCOL,AMARITL,AMJIND,AMJOCC,...,PEFNTVTY,PEMNTVTY,PENATVTY,PRCITSHP,SEOTR,VETQVA,VETYN,WKSWORK,year,income
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,-50000
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,-50000
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,-50000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199518,87,Not in universe,0,0,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,Canada,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000
199519,65,Self-employed-incorporated,37,2,11th grade,0,Not in universe,Married-civilian spouse present,Business and repair services,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,-50000
199520,47,Not in universe,0,0,Some college but no degree,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,...,Poland,Poland,Germany,Foreign born- U S citizen by naturalization,0,Not in universe,2,52,95,-50000
199521,16,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,-50000


In [12]:
# Pull out the features and targets entries of the dataset; the names `x` and
# `y` are kept because later cells refer to them.
x = dataset.data.features
y = dataset.data.targets

# include='all' summarises numeric and categorical columns in one table.
x.describe(include='all')

Unnamed: 0,AAGE,ACLSWKR,ADTINK,ADTOCC,AHGA,AHSCOL,AMARITL,AMJIND,AMJOCC,ARACE,...,PEFNTVTY,PEMNTVTY,PENATVTY,PRCITSHP,SEOTR,VETQVA,VETYN,WKSWORK,AHRSPAY,year
count,199523.0,199523,199523.0,199523.0,199523,199523,199523,199523,199523,199523,...,192810,193404,196130,199523,199523.0,199523,199523.0,199523.0,199523.0,199523.0
unique,,9,,,17,3,7,24,15,5,...,42,42,42,5,,3,,,,
top,,Not in universe,,,High school graduate,Not in universe,Never married,Not in universe or children,Not in universe,White,...,United-States,United-States,United-States,Native- Born in the United States,,Not in universe,,,,
freq,,100245,,,48407,186943,86485,100684,100684,167365,...,159163,160479,176989,176992,,197539,,,,
mean,34.494199,,15.35232,11.306556,,,,,,,...,,,,,0.175438,,1.514833,23.174897,55.426908,94.499672
std,22.310895,,18.067129,14.454204,,,,,,,...,,,,,0.553694,,0.851473,24.411488,274.896454,0.500001
min,0.0,,0.0,0.0,,,,,,,...,,,,,0.0,,0.0,0.0,0.0,94.0
25%,15.0,,0.0,0.0,,,,,,,...,,,,,0.0,,2.0,0.0,0.0,94.0
50%,33.0,,0.0,0.0,,,,,,,...,,,,,0.0,,2.0,8.0,0.0,94.0
75%,50.0,,33.0,26.0,,,,,,,...,,,,,0.0,,2.0,52.0,0.0,95.0


In [13]:
# Frequency of each value in AHGA (described as "education" in the variables
# table), most common first.
x.loc[:, 'AHGA'].value_counts()

AHGA
High school graduate                      48407
Children                                  47422
Some college but no degree                27820
Bachelors degree(BA AB BS)                19865
7th and 8th grade                          8007
10th grade                                 7557
11th grade                                 6876
Masters degree(MA MS MEng MEd MSW MBA)     6541
9th grade                                  6230
Associates degree-occup /vocational        5358
Associates degree-academic program         4363
5th or 6th grade                           3277
12th grade no diploma                      2126
1st 2nd 3rd or 4th grade                   1799
Prof school degree (MD DDS DVM LLB JD)     1793
Doctorate degree(PhD EdD)                  1263
Less than 1st grade                         819
Name: count, dtype: int64