In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show every column and never truncate cell text when DataFrames are rendered.
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', None)
from IPython.display import display, HTML
# Inject CSS that sets the notebook's max-width variable to 100% of the page.
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))

In [None]:
# List the files in the data/ directory (shell command, needs a POSIX `ls`).
!ls data/

In [None]:
# List the files in the sup_info/ directory (shell command, needs a POSIX `ls`).
!ls sup_info/

## Functions

In [None]:
# created append_anomaly_counts function to get anomaly counts with different datasets & slices
def append_anomaly_counts(df_info=None, dataframe=None, col_suffix=""):
    """Return a copy of ``df_info`` extended with per-column anomaly counts of ``dataframe``.

    Parameters
    ----------
    df_info : pandas.DataFrame
        One row per column of ``dataframe``, in the same order, with at least
        the columns ``column_name`` and ``column_type``. It is not modified;
        an existing ``nunique`` column is replaced in the returned copy only.
    dataframe : pandas.DataFrame
        Data slice to profile. Modified in place: columns whose type is
        ``"continuous"`` are cast to float, every other column to str.
    col_suffix : str
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df_info`` plus ``nunique`` (distinct values per column) and
        ``uknown_count`` (cells equal to ``"?"``). When the nominal columns
        listed in ``df_info`` line up with the non-numeric columns of
        ``dataframe``, ``NIU_count`` (cells containing ``"Not in universe"``)
        and ``row_count`` are added as well; otherwise a warning is issued and
        those two columns are omitted.
    """
    import warnings

    # Work on a copy so that metadata shared between the train and test calls
    # is not overwritten with the statistics of whichever slice ran last.
    info = df_info.copy()
    # Both statistics are taken before any dtype cast and assigned by position.
    info["nunique"] = dataframe.nunique().values
    info["uknown_count"] = dataframe.eq("?").sum().values

    niu_counts = []
    for column, column_type in zip(info["column_name"], info["column_type"]):
        if column_type == "continuous":
            dataframe[column] = dataframe[column].astype(float)
            niu_counts.append(0)
            continue
        dataframe[column] = dataframe[column].astype(str)
        # Count every cell mentioning "Not in universe", whether the column
        # uses one spelling of that marker or several.
        niu_mask = dataframe[column].str.contains("Not in universe", regex=False)
        niu_counts.append(int(niu_mask.sum()))

    # matching nominal column sanity check
    # After the casts above, the non-numeric columns are exactly the ones that
    # were treated as text; list comparison is safe for unequal lengths.
    nominal_names = info.loc[info["column_type"] == "nominal", "column_name"].tolist()
    text_columns = dataframe.select_dtypes(exclude="number").columns.tolist()
    if nominal_names != text_columns:
        warnings.warn(
            "nominal columns in df_info do not match the text columns of dataframe; "
            "NIU_count and row_count were not added"
        )
        return info
    return info.assign(NIU_count=niu_counts, row_count=dataframe.shape[0])

In [None]:
def change_target_value(dataframe=None):
    """Recode the ``target`` labels of ``dataframe`` to integers, in place.

    ``"- 50000"`` becomes 0 and ``"50000+"`` becomes 1; any other value is left
    as it is, so calling the function a second time is harmless. When every
    value ends up as 0 or 1 the column is cast to an integer dtype, so that it
    is a proper numeric label vector rather than an object column holding ints.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``target`` column. Its ``target`` column is replaced.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    label_codes = {"- 50000": 0, "50000+": 1}
    recoded = dataframe["target"].map(lambda label: label_codes.get(label, label))
    # Only cast when nothing unexpected is left, otherwise keep the raw values
    # visible instead of failing or silently turning them into NaN.
    if recoded.isin([0, 1]).all():
        recoded = recoded.astype(int)
    dataframe["target"] = recoded
    return dataframe

## Get input data

In [None]:
# The training file has no header row, so columns are numbered 0..n-1.
# Column 24 is discarded; presumably it is a field the attribute list does not
# describe (e.g. a sampling weight) - TODO confirm against the dataset docs.
df_train = pd.read_csv("data/census_income_learn.csv", header=None).drop(columns=24)

In [None]:
# The test file has no header row, so columns are numbered 0..n-1.
# Column 24 is discarded; presumably it is a field the attribute list does not
# describe (e.g. a sampling weight) - TODO confirm against the dataset docs.
df_test = pd.read_csv("data/census_income_test.csv", header=None).drop(columns=24)

In [None]:
# One description string per feature column, in column order. Each string holds
# the number of distinct values, the attribute index, the attribute name in
# parentheses and the attribute type (continuous or nominal).
# Presumably copied from the dataset's metadata file - verify if the data changes.
data_info = [
    "|   91 distinct values for attribute #0 (age) continuous",
    "|    9 distinct values for attribute #1 (class of worker) nominal",
    "|   52 distinct values for attribute #2 (detailed industry recode) nominal",
    "|   47 distinct values for attribute #3 (detailed occupation recode) nominal",
    "|   17 distinct values for attribute #4 (education) nominal",
    "| 1240 distinct values for attribute #5 (wage per hour) continuous",
    "|    3 distinct values for attribute #6 (enroll in edu inst last wk) nominal",
    "|    7 distinct values for attribute #7 (marital stat) nominal",
    "|   24 distinct values for attribute #8 (major industry code) nominal",
    "|   15 distinct values for attribute #9 (major occupation code) nominal",
    "|    5 distinct values for attribute #10 (race) nominal",
    "|   10 distinct values for attribute #11 (hispanic origin) nominal",
    "|    2 distinct values for attribute #12 (sex) nominal",
    "|    3 distinct values for attribute #13 (member of a labor union) nominal",
    "|    6 distinct values for attribute #14 (reason for unemployment) nominal",
    "|    8 distinct values for attribute #15 (full or part time employment stat) nominal",
    "|  132 distinct values for attribute #16 (capital gains) continuous",
    "|  113 distinct values for attribute #17 (capital losses) continuous",
    "| 1478 distinct values for attribute #18 (dividends from stocks) continuous",
    "|    6 distinct values for attribute #19 (tax filer stat) nominal",
    "|    6 distinct values for attribute #20 (region of previous residence) nominal",
    "|   51 distinct values for attribute #21 (state of previous residence) nominal",
    "|   38 distinct values for attribute #22 (detailed household and family stat) nominal",
    "|    8 distinct values for attribute #23 (detailed household summary in household) nominal",
    "|   10 distinct values for attribute #24 (migration code-change in msa) nominal",
    "|    9 distinct values for attribute #25 (migration code-change in reg) nominal",
    "|   10 distinct values for attribute #26 (migration code-move within reg) nominal",
    "|    3 distinct values for attribute #27 (live in this house 1 year ago) nominal",
    "|    4 distinct values for attribute #28 (migration prev res in sunbelt) nominal",
    "|    7 distinct values for attribute #29 (num persons worked for employer) continuous",
    "|    5 distinct values for attribute #30 (family members under 18) nominal",
    "|   43 distinct values for attribute #31 (country of birth father) nominal",
    "|   43 distinct values for attribute #32 (country of birth mother) nominal",
    "|   43 distinct values for attribute #33 (country of birth self) nominal",
    "|    5 distinct values for attribute #34 (citizenship) nominal",
    "|    3 distinct values for attribute #35 (own business or self employed) nominal",
    "|    3 distinct values for attribute #36 (fill inc questionnaire for veteran's admin) nominal",
    "|    3 distinct values for attribute #37 (veterans benefits) nominal",
    "|   53 distinct values for attribute #38 (weeks worked in year) continuous",
    "|    2 distinct values for attribute #39 (year) nominal",
]

## Clean data

In [None]:
# Rewrite every description as "count ,index ,name, type" so that a plain
# comma split separates the fields. All replacements are literal text.
s_data_info = (
    pd.Series(data_info)
    .str.replace("|", "", regex=False)
    .str.replace("distinct values for attribute #", ",", regex=False)
    .str.replace("(", ",", regex=False)
    .str.replace(")", ",", regex=False)
    .str.replace("'", "", regex=False)
    .str.strip()
)

# Field 1 of the split is the attribute index, which is not kept.
df_data_info = s_data_info.str.split(",", expand=True).drop(columns=1)
df_data_info.columns = ["nunique", "column_name", "column_type"]
df_data_info["nunique"] = df_data_info["nunique"].astype(int)
# The label column is not in the attribute list; append it by hand as row 40.
df_data_info.loc[40] = [2, "target", "nominal"]
# Remove the padding that the split left around the text fields.
df_data_info = df_data_info.map(
    lambda value: value.strip() if isinstance(value, str) else value
)

#### Train data clean
- Output: `df_train1`

In [None]:
print(f"inital shape: {df_train.shape}")
print(f"number of dups: {df_train.duplicated().sum()}") # different total than metadata file (46627 vs.46716)

# The distinct-value counts are only a sanity check on the column order. The
# columns are renamed either way, because every later step addresses them by
# name; skipping the rename would just fail on the next line with a KeyError.
if df_train.nunique().tolist() != df_data_info["nunique"].tolist():
    print("WARNING: distinct-value counts differ from the metadata - check the column order")
print("renaming columns\n")
df_train.columns = df_data_info["column_name"].tolist()
# Strip surrounding whitespace from every text cell and remove the literal "."
# from the labels (regex=False: "." must not be read as "any character").
df_train = df_train.map(lambda x: x.strip() if isinstance(x, str) else x)
df_train["target"] = df_train["target"].str.replace(".", "", regex=False)

# drop duplicate rows
df_train1 = df_train.drop_duplicates(ignore_index=True)
print(f"shape after drop dups: {df_train1.shape}")

# if edu is Children then target < 50k
print("\nfilter Children - target counts")
print(df_train1[df_train1["education"]=="Children"]["target"].value_counts())

df_train1 = df_train1[df_train1["education"]!="Children"].reset_index(drop=True)
print(f"\nshape after drop Children: {df_train1.shape}")

# Pass a copy of the metadata: it is shared with the test cleaning cell and the
# check above, so it must keep the documented counts rather than this slice's.
# Note that the call also casts the columns of df_train1 (float / str) in place.
df_info_train = append_anomaly_counts(df_info=df_data_info.copy(), dataframe=df_train1)

print("\ntarget distribution")
print(df_train1["target"].value_counts())

#### Test data clean
- Output: `df_test1`

In [None]:
print(f"inital shape: {df_test.shape}")
print(f"number of dups: {df_test.duplicated().sum()}")
print("renaming columns\n")
df_test.columns = df_data_info["column_name"].tolist()
# Strip surrounding whitespace from every text cell and remove the literal "."
# from the labels (regex=False: "." must not be read as "any character").
df_test = df_test.map(lambda x: x.strip() if isinstance(x, str) else x)
df_test["target"] = df_test["target"].str.replace(".", "", regex=False)

# drop duplicate rows
df_test1 = df_test.drop_duplicates(ignore_index=True)
print(f"shape after drop dups: {df_test1.shape}")

# if edu is Children then target < 50k
print("\nfilter Children - target counts")
print(df_test1[df_test1["education"]=="Children"]["target"].value_counts())

df_test1 = df_test1[df_test1["education"]!="Children"].reset_index(drop=True)
print(f"\nshape after drop Children: {df_test1.shape}")

# Pass a copy of the metadata: it is shared with the train cleaning cell, so it
# must keep the documented counts rather than the statistics of this slice.
# Note that the call also casts the columns of df_test1 (float / str) in place.
df_info_test = append_anomaly_counts(df_info=df_data_info.copy(), dataframe=df_test1)

print("\ntarget distribution")
print(df_test1["target"].value_counts())

## EDA

### descriptive statistics

In [None]:
# Summary statistics; with default arguments describe() reports the numeric columns only.
df_train1.describe()

In [None]:
# Count, number of distinct values, most frequent value and its frequency for the object-dtype columns.
df_train1.describe(include="object")

### plot categorical distributions

In [None]:
# One horizontal count plot per object-dtype column with at most 22 distinct
# values; columns with more levels are skipped.
cat_cols = df_train1.select_dtypes(include='object')
for col in cat_cols:  # iterating a DataFrame yields its column labels
    n = df_train1[col].nunique()
    if n > 22:
        continue
    # One colour per level; hue=col with legend=False colours the bars
    # without drawing a legend.
    sns.countplot(
        data=df_train1,
        y=col,
        hue=col,
        legend=False,
        palette=sns.color_palette(palette="colorblind", n_colors=n),
    )
    plt.show()

### slice target by numerical features

In [None]:
# Box plot of every float column, one box per target class.
num_cols = df_train1.select_dtypes(float).columns.values
# Both are the same for every plot, so build them once.
target_as_category = df_train1['target'].astype('category')
two_class_palette = sns.color_palette(palette="colorblind", n_colors=2)
for col in num_cols:
    sns.boxplot(
        data=df_train1,
        x=col,
        y=target_as_category,
        hue=target_as_category,
        palette=two_class_palette,
    )
    plt.show()

In [None]:
# Histogram (20 bins) of every float column. Each figure is titled and labelled
# with its column name so the plots can be told apart when skimming the output.
for col in num_cols:
    ax = df_train1[col].hist(bins=20)
    ax.set(title=f"Distribution of {col}", xlabel=col, ylabel="row count")
    plt.show()

### slice target by categorical features

In [None]:
# NEEDS REFACTORING
# For every categorical column with at most 4 distinct values, show how each
# target class is distributed over its levels (normalize='index': rows sum to 1).
for col in cat_cols:
    if df_train1[col].nunique() > 4:
        continue
    target_share = pd.crosstab(df_train1['target'], df_train1[col], normalize='index')
    display(target_share)

In [None]:
# Count plots of the low-cardinality categorical columns (at most 4 distinct
# values), one panel per target class, each panel with its own y scale.
for col in cat_cols:
    if df_train1[col].nunique() > 4:
        continue
    g = sns.catplot(data=df_train1, x=col, col='target', kind='count', sharey=False)
    g.set_xticklabels(rotation=60)

### Group numerical features (mean) by categorical features

In [None]:
# Mean of every numeric column within each level of the categorical columns
# that have at most 3 distinct values.
for col in cat_cols:
    if df_train1[col].nunique() > 3:
        continue
    group_means = df_train1.groupby(col)[num_cols].mean()
    display(group_means)

### Correlation matrix for numerical features

In [None]:
# Pairwise correlation between the float-typed columns (DataFrame.corr default method).
float_features = df_train1.select_dtypes(float)
corr = float_features.corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix on a fixed -1..1 colour scale,
# so that zero correlation sits at the centre of the diverging colormap.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, ax=ax, cmap='RdBu_r', annot=True, vmin=-1, vmax=1)
plt.show()

## Model train data

In [None]:
# Recode the income labels to 0/1 in both splits; change_target_value modifies
# the frame it receives and returns it.
df_train1 = change_target_value(dataframe=df_train1)
df_test1 = change_target_value(dataframe=df_test1)

In [None]:
# Class counts of the training split after recoding the target.
df_train1["target"].value_counts()

In [None]:
# Class counts of the test split after recoding the target.
df_test1["target"].value_counts()

In [None]:
# Split the training rows by recoded label (0 was "- 50000", 1 was "50000+").
target_col = df_train1["target"]
df_class0 = df_train1.loc[target_col == 0]
df_class1 = df_train1.loc[target_col == 1]

In [None]:
# Undersample class 0 to the size of class 1, then shuffle the combined rows.
# Sampling is without replacement, so this assumes class 0 has at least as many
# rows as class 1. Both random steps are seeded for reproducibility.
n_class1 = df_class1.shape[0]
df_class0_sample = df_class0.sample(n=n_class1, random_state=42, axis=0)
df_train_balance = (
    pd.concat([df_class0_sample, df_class1], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Label vector: the 0/1 codes can sit in an object-dtype column after recoding,
# which scikit-learn does not accept as class labels, so cast them to int.
y = df_train_balance["target"].astype(int)
# Feature matrix: the nominal columns are strings, which LogisticRegression
# cannot fit, so one-hot encode them (numeric columns pass through unchanged).
# When scoring other data, encode it the same way and align it with
# `.reindex(columns=X.columns, fill_value=0)`.
X = pd.get_dummies(df_train_balance.drop(columns="target"), dtype=float)

### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Fit a logistic regression on the balanced training sample, allowing the
# solver up to 1000 iterations.
# NOTE(review): fit needs numeric features and discrete class labels - confirm
# that X holds no string columns and that y is not an object-dtype column.
model = LogisticRegression(max_iter=1000)
model.fit(X,y)