In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.compose import make_column_selector as selector

In [17]:
# Notebook-wide display settings: show wide frames in full, cap printed rows,
# and use a larger default figure size.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100
plt.rcParams["figure.figsize"] = (10,8)
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases removed the old names, so use whichever name this install provides.
plt.style.use(
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)

In [18]:
# Load the raw lead extract; the first CSV column becomes the row index.
all_lead_acs_county_mls_raw = pd.read_csv(
    "../data/cutoff_all_lead_mls.csv",
    index_col=0,
    low_memory=False,
)

In [19]:
# Number of rows whose MARKET_STATE_STD value is missing.
all_lead_acs_county_mls_raw.loc[:, "MARKET_STATE_STD"].isnull().sum()

18291

In [20]:
# Number of distinct non-null MARKET_COUNTY_FULL values.
all_lead_acs_county_mls_raw.loc[:, "MARKET_COUNTY_FULL"].nunique(dropna=True)

62

In [21]:
# Preview the combined state values: keep MARKET_STATE_STD where it is present
# and fall back to STATE_STD where it is missing.
np.where(
    all_lead_acs_county_mls_raw["MARKET_STATE_STD"].notna(),
    all_lead_acs_county_mls_raw["MARKET_STATE_STD"],
    all_lead_acs_county_mls_raw["STATE_STD"],
)

array(['WA', 'CA', 'WA', ..., 'CA', 'CA', 'AZ'], dtype=object)

In [22]:
# Fraction of rows whose AGENT_ROLE_C value is missing
# (mean of the boolean mask == missing count / row count).
all_lead_acs_county_mls_raw["AGENT_ROLE_C"].isna().mean()

0.8032086423734279

In [23]:
# Drop the original STATE_STD column, then check how many distinct states are
# left once MARKET_STATE_STD is renamed to take its place.
all_lead_acs_county_mls = all_lead_acs_county_mls_raw.drop(columns=["STATE_STD"])
(
    all_lead_acs_county_mls
    .rename(columns={"MARKET_STATE_STD": "STATE_STD"})
    .loc[:, "STATE_STD"]
    .nunique()
)

2

In [24]:
# State-level and nation-level ACS tables; both are passed to pipeline_data,
# which hands them to acs_col_impute as fallback values.
state_acs = pd.read_csv(filepath_or_buffer="../data/census/state_acs.csv")
nation_acs = pd.read_csv(filepath_or_buffer="../data/census/nation_acs.csv")

In [25]:
def acs_col_impute(df:pd.DataFrame, state_df:pd.DataFrame, nation_df:pd.DataFrame) -> pd.DataFrame:
    """Fill missing ACS "Percent!!" values with state-level, then national, figures.

    For every state present in ``df["STATE_STD"]`` the matching row(s) of
    ``state_df`` supply fill values for the columns whose name starts with
    ``"Percent!!"``. Whatever is still missing afterwards is filled from the
    ``"Percent!!"`` columns of ``nation_df``. If a lookup table holds several
    rows for the same key, the last row wins.

    Args:
        df: Frame to impute; must contain a ``STATE_STD`` column.
        state_df: State-level table with a ``STATE_STD`` column and
            ``"Percent!!"``-prefixed value columns.
        nation_df: National table with ``"Percent!!"``-prefixed value columns.

    Returns:
        A new DataFrame with the imputed values. ``df`` itself is left untouched.
    """
    prefix = "Percent!!"

    # Work on a copy so the caller's frame is never modified as a side effect.
    result = df.copy()

    # state imputation
    state_percent_cols = [c for c in state_df.columns if c.startswith(prefix)]
    for state in result["STATE_STD"].dropna().unique():
        state_rows = state_df.loc[state_df["STATE_STD"] == state, state_percent_cols]
        state_fill = {}
        for record in state_rows.to_dict(orient="records"):
            state_fill.update(record)

        # Only touch the columns that can actually be filled; rewriting whole
        # rows is slower and risks altering the dtype of unrelated columns.
        target_cols = [c for c in state_fill if c in result.columns]
        if not target_cols:
            continue

        in_state = result["STATE_STD"] == state
        result.loc[in_state, target_cols] = result.loc[in_state, target_cols].fillna(state_fill)

    # nation imputation
    nation_percent_cols = [c for c in nation_df.columns if c.startswith(prefix)]
    nation_fill = {}
    for record in nation_df[nation_percent_cols].to_dict(orient="records"):
        nation_fill.update(record)

    return result.fillna(nation_fill)

In [26]:
def condense_category(col:pd.Series, min_freq=0.01, new_name='Other')-> pd.Series:
    """Collapse infrequent levels of a categorical column into a single label.

    Args:
        col: Column whose levels should be condensed.
        min_freq: Levels whose relative frequency (among non-missing values)
            is strictly below this threshold are replaced.
        new_name: Replacement label for the rare levels.

    Returns:
        A Series with the same index and name as ``col`` in which every rare
        level is replaced by ``new_name``; other values, including missing
        ones, are passed through.
    """
    # Relative frequency of each level (missing values are not counted).
    level_share = col.value_counts(normalize=True)
    rare_levels = level_share[level_share.lt(min_freq)].index
    condensed = np.where(col.isin(rare_levels), new_name, col)
    # Keep the caller's index: a fresh RangeIndex would misalign (and produce
    # NaN) when the result is assigned back to a frame with another index.
    return pd.Series(condensed, index=col.index, name=col.name)

In [27]:
def ordinal_encoding(df:pd.DataFrame, col_list:list) -> pd.DataFrame:
    """Add an ordinal-encoded ``<col>_ORD`` column for every column in ``col_list``.

    A separate ``OrdinalEncoder`` is fitted per column.

    Args:
        df: Frame holding the columns to encode. It is modified in place.
        col_list: Names of the columns to encode.

    Returns:
        The same ``df`` object, with the new ``*_ORD`` columns added.
    """
    for col in col_list:
        ord_enc = OrdinalEncoder()
        # fit_transform expects a 2-D (n_samples, n_features) input, so fit once
        # on the single-column frame; Series.apply would call it per scalar.
        df[col +"_ORD"] = ord_enc.fit_transform(df[[col]]).ravel()
    return df

In [32]:
def _highly_correlated_columns(frame: pd.DataFrame, threshold: float = 0.9) -> list:
    """Return columns whose absolute correlation with an earlier column exceeds ``threshold``."""
    if frame.shape[1] < 2:
        return []
    corr = frame.corr().abs()
    # Strict lower triangle: each column is compared only with the columns
    # before it, so the first member of a correlated pair is never flagged.
    earlier = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    flagged = corr.where(earlier).gt(threshold).any(axis=1)
    return flagged[flagged].index.tolist()


def pipeline_data(df:pd.DataFrame, state_acs: pd.DataFrame, nation_acs: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw lead frame into a model-ready feature frame.

    Steps, in order:

    1. If ``MARKET_STATE_STD`` exists, fill its gaps from ``STATE_STD`` and let
       it replace ``STATE_STD``.
    2. Drop columns with 80% or more missing values.
    3. Drop object columns with more than 100 distinct values (``ID`` is kept).
    4. Drop the hand-picked column lists, the ``"Percent!!"`` columns holding
       any value above 100, and ``"Percent!!"`` columns correlated above 0.9
       with an earlier one.
    5. Impute remaining ``"Percent!!"`` gaps via ``acs_col_impute``.
    6. Fill missing values of object columns (except ``LEVEL_VETTING_C``)
       with ``"UNK"``.
    7. Condense rare levels of the categorical columns, ordinal-encode
       ``LEVEL_VETTING_C`` and one-hot encode the categorical columns.

    Args:
        df: Raw lead frame. It is not modified.
        state_acs: State-level ACS table passed to ``acs_col_impute``.
        nation_acs: Nation-level ACS table passed to ``acs_col_impute``.

    Returns:
        The transformed DataFrame.
    """
    missing_threshold = 0.8   # columns at or above this missing share are dropped
    max_unique_levels = 100   # object columns with more distinct values are dropped
    corr_threshold = 0.9      # absolute correlation above which a column is dropped

    # fill the value of MARKET STATE_STD with STATE_STD if missing
    # replace the STATE_STD column with MARKET_STATE_STD
    if "MARKET_STATE_STD" in df.columns:
        merged_state = np.where(df["MARKET_STATE_STD"].isna(), df["STATE_STD"], df["MARKET_STATE_STD"])
        # drop/rename return new frames, so the caller's frame stays untouched.
        df = df.drop(["STATE_STD"], axis = 1).rename({"MARKET_STATE_STD":"STATE_STD"}, axis=1)
        df["STATE_STD"] = merged_state

    # drop the columns with missing value over 80%
    df = df.loc[:, df.isnull().mean(axis=0) < missing_threshold]

    # drop column that has more than 100 unique values
    ## exclude id, for future reference (skipped in the filter so a missing or
    ## low-cardinality ID column no longer raises)
    n_unique_col = [
        col for col in df.select_dtypes("O").columns
        if col != "ID" and df[col].nunique() > max_unique_levels
    ]
    df = df.drop(n_unique_col, axis = 1)

    numeric_drop_cols = ['TOTAL_TRANSACTION_COUNT_C', 'TOTAL_LISTING_COUNT_C', 'AVERAGE_CLOSE_PRICE_C', 'BS_COUNT_C', 'LEAD_NUMBER_C', 
                        'MATRIX_UNIQUE_ID_C','PMXAGENT_ID_C',]
    categorical_drop_cols_manual = ["RECORD_TYPE_ID", "STATE", "OWNER_ID","CREATED_BY_ID","LAST_MODIFIED_BY_ID", "MOST_RECENT_CAMPAIGN_C",
                                     "DISQUALIFIED_REASONS_C", "EMAIL_UNKNOWN_DELIVERABILITY_C", "WEB_LEAD_C", "KNOCK_WEBHOOK_2021_C",
                                     "DO_NOT_CONTACT_C","JET_CLOSING_C","NO_SHOW_CP_C","MARKET_COUNTY_FULL","Geographic Area Name"]
    boolean_drop_cols_manual = ["IS_DELETED","IS_UNREAD_BY_OWNER","DO_NOT_CALL","HAS_OPTED_OUT_OF_FAX","APEX_PROCESSED_C",
                                "KNOCK_WEBHOOK_C", "EMAIL_BOUNCED_C","IS_LEAD_C","PARDOT_REASSIGNMENT_C","DIGITAL_TE_USER_C","CHECKBOX_C","KCA_C","HAS_OPTED_OUT_OF_EMAIL"]

    # drop the acs value columns, since national value and state value are imputed
    percent_col = [col for col in df.columns if (col.startswith("Percent!!"))]
    value_percent_col = [col for col in percent_col if df[col].gt(100).any()]
    # Everything that is not a "value" column counts as a true percentage.
    # NOTE(review): this used to be `df[col].lt(100).all()`, which is False for
    # any column containing NaN or exactly 100 and so excluded such columns
    # from the correlation pruning below.
    non_value_percent_col = [col for col in percent_col if col not in value_percent_col]

    # Correlations come from the frame being processed, not a notebook global.
    correlated_features = _highly_correlated_columns(df[non_value_percent_col], corr_threshold)

    # drop all the selected cols; a column already removed by an earlier filter
    # is skipped instead of raising KeyError
    selected_drop_cols = (
        numeric_drop_cols
        + categorical_drop_cols_manual
        + boolean_drop_cols_manual
        + value_percent_col
        + correlated_features
    )
    df = df.drop(selected_drop_cols, axis = 1, errors = "ignore")

    df = acs_col_impute(df, state_acs, nation_acs)

    cat_keep_list = ["LEVEL_VETTING_C"]
    cat_fillna_list = [col for col in df.select_dtypes("O").columns if col not in cat_keep_list]

    #impute cat col missing value with UNK
    for col in cat_fillna_list:
        df[col] = df[col].fillna("UNK")

    # specify ordinal variables
    cat_col = ["LEAD_SOURCE","STATUS","ROLE_C","STATE_STD"]

    # condense number of categorical levels
    for col in cat_col:
        # Assign by position so the result cannot be misaligned against df's index.
        df[col] = condense_category(df[col]).to_numpy()

    # ordinal encoding
    # presumably normalises None to NaN before encoding - TODO confirm it is needed
    df["LEVEL_VETTING_C"]= df["LEVEL_VETTING_C"].fillna(np.nan)
    level_mapping = [ 'Level 3 - Less than 4 listings/yr', 'Level 2 - 4+ listings/yr', "Level 1 - 10+ listings/yr", 'Level Gold 20-29 Listings/yr', 'Level Platinum 30+ Listings/yr']
    encoder = OrdinalEncoder(categories=[level_mapping], handle_unknown = "use_encoded_value", unknown_value= np.nan, encoded_missing_value= -1)
    df["LEVEL_VETTING_C"] = np.asarray(encoder.fit_transform(df[["LEVEL_VETTING_C"]])).ravel()

    # onehot encoding
    df = pd.get_dummies(df, columns = cat_col)

    return df

In [33]:
# Run the preprocessing pipeline on the raw lead frame, using the state and
# national ACS tables loaded above.
all_lead_acs_mls_model = pipeline_data(
    df=all_lead_acs_county_mls_raw,
    state_acs=state_acs,
    nation_acs=nation_acs,
)

In [35]:
# all_lead_acs_mls_model.to_csv("../data/model/v2_all_lead_acs_mls_model.csv",index = None)