In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.compose import make_column_selector as selector

In [2]:
# Notebook-wide display settings: show up to 1000 columns and 100 rows of a frame.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100
plt.rcParams["figure.figsize"] = (10,8)
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and the
# old aliases were removed later, so prefer the new name when it is available.
plt.style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid')

In [11]:
# Read the raw lead table; the first CSV column becomes the index.
all_lead_acs_county_mls_raw = pd.read_csv(
    "../data/cutoff_all_lead_mls.csv",
    index_col=0,
    low_memory=False,
)

In [58]:
# Number of rows whose MARKET_STATE_STD is missing.
all_lead_acs_county_mls_raw["MARKET_STATE_STD"].isnull().sum()

18258

In [60]:
# Preview of the merged state column: MARKET_STATE_STD, falling back to
# STATE_STD wherever the market value is missing.
np.where(
    all_lead_acs_county_mls_raw["MARKET_STATE_STD"].isnull(),
    all_lead_acs_county_mls_raw["STATE_STD"],
    all_lead_acs_county_mls_raw["MARKET_STATE_STD"],
)

array(['CA', 'AZ', 'CA', ..., 'CA', 'WA', 'CA'], dtype=object)

In [52]:
# Share of rows whose AGENT_ROLE_C is missing (missing count / row count).
all_lead_acs_county_mls_raw["AGENT_ROLE_C"].isnull().sum() / len(all_lead_acs_county_mls_raw)

0.803400508864747

In [34]:
# Drop the original STATE_STD column from the raw frame.
all_lead_acs_county_mls = all_lead_acs_county_mls_raw.drop(columns=["STATE_STD"])
# The renamed frame is not stored; this line only displays how many distinct
# values the column has once MARKET_STATE_STD is renamed to STATE_STD.
all_lead_acs_county_mls.rename(columns={"MARKET_STATE_STD": "STATE_STD"})["STATE_STD"].nunique()

2

In [12]:


# State-level and nation-level ACS tables; both are passed to the imputation step.
state_acs = pd.read_csv(
    "../data/census/state_acs.csv",
)
nation_acs = pd.read_csv(
    "../data/census/nation_acs.csv",
)

In [13]:
def acs_col_impute(df:pd.DataFrame, state_df:pd.DataFrame, nation_df:pd.DataFrame) -> pd.DataFrame:
    """Fill missing ``Percent!!`` values from state-level, then nation-level figures.

    For every non-missing state in ``df["STATE_STD"]`` the matching row(s) of
    ``state_df`` supply fill values for the columns whose name starts with
    ``"Percent!!"``. Whatever is still missing afterwards is filled from the
    ``"Percent!!"`` columns of ``nation_df``.

    Args:
        df: Frame to impute; must contain a ``STATE_STD`` column.
        state_df: State-level table with a ``STATE_STD`` column and
            ``Percent!!`` columns.
        nation_df: Nation-level table with ``Percent!!`` columns.

    Returns:
        A new, imputed DataFrame. The input ``df`` is not modified.
    """
    # Work on a copy so the masked writes below never touch the caller's frame.
    df = df.copy()

    # state imputation
    state_percent_cols = [c for c in state_df.columns if c.startswith("Percent!!")]
    # NaN never compares equal to anything, so missing states are skipped up front.
    for state in df["STATE_STD"].dropna().unique():
        state_rows = state_df.loc[state_df["STATE_STD"] == state, state_percent_cols]
        # If a state appears in several rows, values from later rows win.
        state_fillna_dict = {k: v for d in state_rows.to_dict(orient="records") for k, v in d.items()}
        if not state_fillna_dict:
            continue  # state not present in state_df; left for the nation-level fill
        state_mask = df["STATE_STD"] == state
        for col, fill_value in state_fillna_dict.items():
            # Skip columns df does not have and fill values that are themselves missing.
            if col in df.columns and pd.notna(fill_value):
                df.loc[state_mask, col] = df.loc[state_mask, col].fillna(fill_value)

    # nation imputation
    nation_percent_cols = [c for c in nation_df.columns if c.startswith("Percent!!")]
    nation_fillna_dict = {k: v for d in nation_df[nation_percent_cols].to_dict(orient="records") for k, v in d.items()}
    df = df.fillna(nation_fillna_dict)

    return df

In [14]:
def condense_category(col:pd.Series, min_freq=0.01, new_name='Other')-> pd.Series:
    """Collapse infrequent levels of a categorical column into one label.

    Args:
        col: Column whose levels should be condensed.
        min_freq: Levels whose relative frequency (among non-missing values)
            is strictly below this threshold are replaced.
        new_name: Label that replaces the infrequent levels.

    Returns:
        A Series with the same index and name as ``col`` in which every rare
        level is replaced by ``new_name``. Missing values are left untouched.
    """
    # Relative frequency of each level; value_counts ignores missing values.
    level_freq = col.value_counts(normalize=True)
    rare_levels = level_freq[level_freq.lt(min_freq)].index
    # Keep the caller's index so that `df[c] = condense_category(df[c])` lines up
    # row by row even when the frame does not use a default RangeIndex.
    return pd.Series(np.where(col.isin(rare_levels), new_name, col), index=col.index, name=col.name)

In [15]:
def ordinal_encoding(df:pd.DataFrame, col_list:list) -> pd.DataFrame:
    """Add an ordinal-encoded ``<col>_ORD`` column for every column in ``col_list``.

    Args:
        df: Frame holding the columns to encode; it is modified in place.
        col_list: Names of the columns to encode.

    Returns:
        The same ``df`` object with the additional ``*_ORD`` columns.
    """
    for col in col_list:
        ord_enc = OrdinalEncoder()
        # The encoder needs 2-D input, so pass a one-column frame (not the
        # individual scalars that Series.apply would hand over) and flatten
        # the (n_rows, 1) result back into a single column.
        df[col +"_ORD"] = np.asarray(ord_enc.fit_transform(df[[col]])).ravel()
    return df

In [61]:
def pipeline_data(df:pd.DataFrame, state_acs: pd.DataFrame, nation_acs: pd.DataFrame) -> pd.DataFrame:
    """Clean, impute and encode the raw lead frame.

    Steps, in order:
      1. If ``MARKET_STATE_STD`` exists, fill its gaps from ``STATE_STD`` and
         let it replace ``STATE_STD``.
      2. Drop columns whose share of missing values is 80% or more.
      3. Drop object columns with more than 100 distinct values (``ID`` is kept).
      4. Drop the manually listed columns plus every ``Percent!!`` column that
         contains a value >= 100.
      5. Impute the remaining ``Percent!!`` columns with ``acs_col_impute``.
      6. Fill missing values of object columns (except ``LEVEL_VETTING_C``)
         with ``"UNK"``.
      7. Condense rare categorical levels, ordinal-encode ``LEVEL_VETTING_C``
         and one-hot encode the categorical columns.

    Args:
        df: Raw lead frame. It is not modified.
        state_acs: State-level ACS table passed to ``acs_col_impute``.
        nation_acs: Nation-level ACS table passed to ``acs_col_impute``.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # Use MARKET_STATE_STD (falling back to STATE_STD where it is missing) as
    # the single state column. `assign` returns a new frame, so the caller's
    # DataFrame is left untouched.
    if "MARKET_STATE_STD" in df.columns:
        df = (
            df.assign(
                MARKET_STATE_STD=np.where(
                    df["MARKET_STATE_STD"].isna(), df["STATE_STD"], df["MARKET_STATE_STD"]
                )
            )
            .drop(["STATE_STD"], axis=1)
            .rename({"MARKET_STATE_STD": "STATE_STD"}, axis=1)
        )

    # drop the columns with missing value over 80%
    df = df.loc[:, (df.isnull().sum(axis=0) / df.shape[0] < 0.8)]

    # drop column that has more than 100 unique values
    ## exclude id, for future reference (filtering here instead of list.remove
    ## avoids a ValueError when ID is absent or has few distinct values)
    n_unique_col = [
        col for col in df.select_dtypes("O").columns
        if col != "ID" and df[col].nunique() > 100
    ]
    df = df.drop(n_unique_col, axis = 1)

    numeric_drop_cols = ['TOTAL_TRANSACTION_COUNT_C', 'TOTAL_LISTING_COUNT_C', 'AVERAGE_CLOSE_PRICE_C', 'BS_COUNT_C', 'LEAD_NUMBER_C', 
                        'MATRIX_UNIQUE_ID_C','PMXAGENT_ID_C',]
    categorical_drop_cols_manual = ["RECORD_TYPE_ID", "STATE", "OWNER_ID","CREATED_BY_ID","LAST_MODIFIED_BY_ID", "MOST_RECENT_CAMPAIGN_C",
                                     "DISQUALIFIED_REASONS_C", "EMAIL_UNKNOWN_DELIVERABILITY_C", "WEB_LEAD_C", "KNOCK_WEBHOOK_2021_C","DO_NOT_CONTACT_C"]
    boolean_drop_cols_manual = ["IS_DELETED","IS_UNREAD_BY_OWNER","DO_NOT_CALL","HAS_OPTED_OUT_OF_FAX","APEX_PROCESSED_C",
                                "KNOCK_WEBHOOK_C", "EMAIL_BOUNCED_C","IS_LEAD_C","PARDOT_REASSIGNMENT_C","DIGITAL_TE_USER_C","CHECKBOX_C","KCA_C"]
    # drop the acs value columns, since national value and state value are imputed
    # (presumably a "Percent!!" column holding values >= 100 carries raw values
    # rather than percentages - confirm against the ACS extract)
    percent_col = [col for col in df.columns if (col.startswith("Percent!!"))]
    value_percent_col = [col for col in percent_col if ((df[col].values) >=100).any()]

    # drop all the selected cols; some of them may already be gone because the
    # two filters above depend on the data, so missing labels are ignored
    df = df.drop(
        numeric_drop_cols + categorical_drop_cols_manual + boolean_drop_cols_manual + value_percent_col,
        axis = 1,
        errors = "ignore",
    )

    df = acs_col_impute(df, state_acs, nation_acs)

    cat_keep_list = ["LEVEL_VETTING_C"]
    cat_fillna_list = [col for col in df.select_dtypes("O").columns if col not in cat_keep_list]

    #impute cat col missing value with UNK
    for col in cat_fillna_list:
        df[col] = df[col].fillna("UNK")

    # categorical variables that get condensed and one-hot encoded
    cat_col = ["LEAD_SOURCE","STATUS","ROLE_C","STATE_STD"]

    # condense number of categorical levels; assign positionally so the result
    # lines up row by row whatever index condense_category returns
    for col in cat_col:
        df[col] = condense_category(df[col]).to_numpy()

    # ordinal encoding
    # NOTE(review): fillna(np.nan) looks like a no-op for NaN-only data; presumably
    # it is meant to normalise other missing markers to NaN - confirm before removing.
    df["LEVEL_VETTING_C"]= df["LEVEL_VETTING_C"].fillna(np.nan)
    # Levels listed in the order in which they are encoded (0, 1, 2, ...).
    level_mapping = [ 'Level 3 - Less than 4 listings/yr', 'Level 2 - 4+ listings/yr', "Level 1 - 10+ listings/yr", 'Level Gold 20-29 Listings/yr', 'Level Platinum 30+ Listings/yr']
    # Missing values are encoded as -1; labels outside level_mapping become NaN.
    encoder = OrdinalEncoder(categories=[level_mapping], handle_unknown = "use_encoded_value", unknown_value= np.nan, encoded_missing_value= -1)
    encoder.fit(df[["LEVEL_VETTING_C"]])
    df["LEVEL_VETTING_C"]= encoder.transform(df[["LEVEL_VETTING_C"]])

    # onehot encoding
    df = pd.get_dummies(df, columns = cat_col)

    return df

In [62]:
# Run the full preprocessing pipeline on the raw frame.
all_lead_acs_mls_model = pipeline_data(
    df=all_lead_acs_county_mls_raw,
    state_acs=state_acs,
    nation_acs=nation_acs,
)

# Cleaning

## Missing Value

In [6]:
# Keep only the columns whose share of missing values is below 80%.
# NOTE(review): `all_lead_acs_county_raw` is not assigned in any visible cell
# (the load cell defines `all_lead_acs_county_mls_raw`) - confirm which frame is meant.
all_lead_acs_county = all_lead_acs_county_raw.loc[
    :,
    all_lead_acs_county_raw.isna().sum(axis=0) / len(all_lead_acs_county_raw) < 0.8,
]

## Manually Drop columns

In [None]:
# Object columns with more than 100 distinct values are dropped. "ID" is
# excluded inside the filter, so the cell no longer raises ValueError when ID
# is missing from the list (absent column or 100 or fewer distinct values).
n_unique_col = [
    col
    for col in all_lead_acs_county.select_dtypes("O").columns
    if col != "ID" and all_lead_acs_county[col].nunique() > 100
]
all_lead_acs_county = all_lead_acs_county.drop(n_unique_col, axis = 1)

In [3]:
# Numeric columns removed by hand before modelling.
numeric_drop_cols = [
    "index",
    "TOTAL_TRANSACTION_COUNT_C",
    "TOTAL_LISTING_COUNT_C",
    "AVERAGE_CLOSE_PRICE_C",
    "BS_COUNT_C",
    "LEAD_NUMBER_C",
    "MATRIX_UNIQUE_ID_C",
    "PMXAGENT_ID_C",
]

In [2]:
# Categorical columns removed by hand before modelling.
categorical_drop_cols_manual = [
    "RECORD_TYPE_ID",
    "STATE",
    "OWNER_ID",
    "CREATED_BY_ID",
    "LAST_MODIFIED_BY_ID",
    "MOST_RECENT_CAMPAIGN_C",
    "COUNTY_C",
    "DISQUALIFIED_REASONS_C",
    "EMAIL_UNKNOWN_DELIVERABILITY_C",
    "WEB_LEAD_C",
    "KNOCK_WEBHOOK_2021_C",
    "DO_NOT_CONTACT_C",
]

In [None]:
# Boolean columns removed by hand before modelling.
boolean_drop_cols_manual = [
    "IS_DELETED",
    "IS_UNREAD_BY_OWNER",
    "DO_NOT_CALL",
    "HAS_OPTED_OUT_OF_FAX",
    "APEX_PROCESSED_C",
    "KNOCK_WEBHOOK_C",
    "EMAIL_BOUNCED_C",
    "IS_LEAD_C",
    "PARDOT_REASSIGNMENT_C",
    "DIGITAL_TE_USER_C",
    "CHECKBOX_C",
    "KCA_C",
]

In [17]:
#### Select the ACS value columns to drop, since national and state values are imputed.
# A "Percent!!" column is selected as soon as one of its values is >= 100.
percent_col = [
    name for name in all_lead_acs_county.columns if name.startswith("Percent!!")
]
value_percent_col = [
    name for name in percent_col if (all_lead_acs_county[name].values >= 100).any()
]

In [4]:
# Drop every manually selected column. Some of them may already have been
# removed by the data-dependent filters in earlier cells, so labels that are
# no longer present are ignored instead of raising KeyError.
all_lead_acs_county = all_lead_acs_county.drop(
    numeric_drop_cols + categorical_drop_cols_manual + boolean_drop_cols_manual + value_percent_col,
    axis = 1,
    errors = "ignore",
)

['index',
 'TOTAL_TRANSACTION_COUNT_C',
 'TOTAL_LISTING_COUNT_C',
 'AVERAGE_CLOSE_PRICE_C',
 'BS_COUNT_C',
 'LEAD_NUMBER_C',
 'MATRIX_UNIQUE_ID_C',
 'PMXAGENT_ID_C',
 'RECORD_TYPE_ID',
 'STATE',
 'OWNER_ID',
 'CREATED_BY_ID',
 'LAST_MODIFIED_BY_ID',
 'MOST_RECENT_CAMPAIGN_C',
 'COUNTY_C',
 'DISQUALIFIED_REASONS_C',
 'EMAIL_UNKNOWN_DELIVERABILITY_C',
 'WEB_LEAD_C',
 'KNOCK_WEBHOOK_2021_C',
 'DO_NOT_CONTACT_C']

## Impute value

### Numerical columns

In [13]:
# transform the full state name to its acronym, which is used for imputing missing values
# impute with the state value first; if there is no state information, use the nation value

In [14]:
# State-level and nation-level ACS tables; both are passed to the imputation step.
state_acs = pd.read_csv(
    "../data/census/state_acs.csv",
)
nation_acs = pd.read_csv(
    "../data/census/nation_acs.csv",
)

In [15]:
def acs_col_impute(df:pd.DataFrame, state_df:pd.DataFrame, nation_df:pd.DataFrame) -> pd.DataFrame:
    """Fill missing ``Percent!!`` values from state-level, then nation-level figures.

    For every non-missing state in ``df["STATE_STD"]`` the matching row(s) of
    ``state_df`` supply fill values for the columns whose name starts with
    ``"Percent!!"``. Whatever is still missing afterwards is filled from the
    ``"Percent!!"`` columns of ``nation_df``.

    Args:
        df: Frame to impute; must contain a ``STATE_STD`` column.
        state_df: State-level table with a ``STATE_STD`` column and
            ``Percent!!`` columns.
        nation_df: Nation-level table with ``Percent!!`` columns.

    Returns:
        A new, imputed DataFrame. The input ``df`` is not modified.
    """
    # Work on a copy so the masked writes below never touch the caller's frame.
    df = df.copy()

    # state imputation
    state_percent_cols = [c for c in state_df.columns if c.startswith("Percent!!")]
    # NaN never compares equal to anything, so missing states are skipped up front.
    for state in df["STATE_STD"].dropna().unique():
        state_rows = state_df.loc[state_df["STATE_STD"] == state, state_percent_cols]
        # If a state appears in several rows, values from later rows win.
        state_fillna_dict = {k: v for d in state_rows.to_dict(orient="records") for k, v in d.items()}
        if not state_fillna_dict:
            continue  # state not present in state_df; left for the nation-level fill
        state_mask = df["STATE_STD"] == state
        for col, fill_value in state_fillna_dict.items():
            # Skip columns df does not have and fill values that are themselves missing.
            if col in df.columns and pd.notna(fill_value):
                df.loc[state_mask, col] = df.loc[state_mask, col].fillna(fill_value)

    # nation imputation
    nation_percent_cols = [c for c in nation_df.columns if c.startswith("Percent!!")]
    nation_fillna_dict = {k: v for d in nation_df[nation_percent_cols].to_dict(orient="records") for k, v in d.items()}
    df = df.fillna(nation_fillna_dict)

    return df

In [16]:
# Impute the ACS percent columns: state values first, nation values for the rest.
all_lead_acs_county = acs_col_impute(
    df=all_lead_acs_county,
    state_df=state_acs,
    nation_df=nation_acs,
)

### Categorical Columns

In [19]:
# notes
# "RATING" may not be the column for ratings, check out other columns
# "MOST_RECENT_CAMPAIGN_C" should be modified by time
# "LEVEL_VETTING_C","LEAD_RANKING_C" both have level information, but "LEAD_RANKING_C" doesn't contain "level 2" "level 1"
# exclude_col = ["RATING","LEAD_RANKING_C","MOST_RECENT_CAMPAIGN_C"]
# ord_col = ["LEVEL_VETTING_C","LEAD_RANKING_C"]
# cat_col = ["RECORD_TYPE_ID","STATE","COUNTRY","LEAD_SOURCE","STATUS","RATING", "CREATED_BY_ID", "LAST_MODIFIED_BY_ID", "EMAIL_BOUNCED_REASON",
#              "MOST_RECENT_CAMPAIGN_C","EVENT_TYPE_C","COUNTY_C","FA_LOYAL_LEAD_C", "PREFERRED_ESCROW_COMPANY_C", "ROLE_C","AGENT_ROLE_C","WEB_LEAD_C",
#                ]

### Impute Missing Value

In [27]:
# Object columns that get an explicit "missing" label; LEVEL_VETTING_C is
# left out of this list.
cat_keep_list = ["LEVEL_VETTING_C"]
cat_fillna_list = [
    name
    for name in all_lead_acs_county.select_dtypes("O").columns
    if name not in cat_keep_list
]
cat_fillna_list

['LEAD_SOURCE', 'STATUS', 'ROLE_C', 'AGENT_ROLE_C', 'STATE_STD']

In [28]:
# Replace missing categorical values with the placeholder label "UNK".
for col in cat_fillna_list:
    all_lead_acs_county[col] = all_lead_acs_county[col].fillna(value="UNK")

# Encoding

## Categorical

### Categorical level condense

In [38]:
def condense_category(col:pd.Series, min_freq=0.01, new_name='Other')-> pd.Series:
    """Collapse infrequent levels of a categorical column into one label.

    Args:
        col: Column whose levels should be condensed.
        min_freq: Levels whose relative frequency (among non-missing values)
            is strictly below this threshold are replaced.
        new_name: Label that replaces the infrequent levels.

    Returns:
        A Series with the same index and name as ``col`` in which every rare
        level is replaced by ``new_name``. Missing values are left untouched.
    """
    # Relative frequency of each level; value_counts ignores missing values.
    level_freq = col.value_counts(normalize=True)
    rare_levels = level_freq[level_freq.lt(min_freq)].index
    # Keep the caller's index so that `df[c] = condense_category(df[c])` lines up
    # row by row even when the frame does not use a default RangeIndex.
    return pd.Series(np.where(col.isin(rare_levels), new_name, col), index=col.index, name=col.name)

In [36]:
# Column that is ordinal-encoded.
ord_col = [
    "LEVEL_VETTING_C",
]
# Columns that are condensed and then one-hot encoded.
cat_col = [
    "LEAD_SOURCE",
    "STATUS",
    "ROLE_C",
    "AGENT_ROLE_C",
    "STATE_STD",
]

In [39]:
# Condense rare levels of every categorical column. The result is assigned
# positionally (.to_numpy()) so rows line up whatever index the returned
# Series carries; index-based assignment would misalign on a non-default index.
for col in cat_col:
    all_lead_acs_county[col] = condense_category(all_lead_acs_county[col]).to_numpy()

###  Ordinal Encoding

In [40]:
def ordinal_encoding(df:pd.DataFrame, col_list:list) -> pd.DataFrame:
    """Add an ordinal-encoded ``<col>_ORD`` column for every column in ``col_list``.

    Args:
        df: Frame holding the columns to encode; it is modified in place.
        col_list: Names of the columns to encode.

    Returns:
        The same ``df`` object with the additional ``*_ORD`` columns.
    """
    for col in col_list:
        ord_enc = OrdinalEncoder()
        # The encoder needs 2-D input, so pass a one-column frame (not the
        # individual scalars that Series.apply would hand over) and flatten
        # the (n_rows, 1) result back into a single column.
        df[col +"_ORD"] = np.asarray(ord_enc.fit_transform(df[[col]])).ravel()
    return df

In [41]:
# NOTE(review): filling missing values with NaN looks like a no-op for NaN-only
# data; presumably it normalises other missing markers - confirm.
all_lead_acs_county["LEVEL_VETTING_C"] = all_lead_acs_county["LEVEL_VETTING_C"].fillna(value=np.nan)

In [42]:
# Vetting levels in the order passed to the ordinal encoder (first entry -> 0).
level_mapping = [
    'Level 3 - Less than 4 listings/yr',
    'Level 2 - 4+ listings/yr',
    "Level 1 - 10+ listings/yr",
    'Level Gold 20-29 Listings/yr',
    'Level Platinum 30+ Listings/yr',
]

In [43]:
# Ordinal encoder for LEVEL_VETTING_C: missing values are encoded as -1 and
# labels outside level_mapping become NaN.
encoder = OrdinalEncoder(
    categories=[level_mapping],
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
    encoded_missing_value=-1,
)

In [44]:
# Fit on a one-column frame (the encoder expects 2-D input).
encoder.fit(all_lead_acs_county.loc[:, ["LEVEL_VETTING_C"]])

In [45]:
# Replace the text levels with their ordinal codes.
all_lead_acs_county["LEVEL_VETTING_C"] = encoder.transform(
    all_lead_acs_county.loc[:, ["LEVEL_VETTING_C"]]
)

In [46]:
# One Hot Encoding

In [47]:
# One-hot encode the condensed categorical columns.
all_lead_acs_county = pd.get_dummies(
    data=all_lead_acs_county,
    columns=cat_col,
)

In [None]:
# process:
# train: account creation: 07-01-2021 and before, rollback the status, listing (to be validated)
# val: split from train
# test: account creation: 10-01-2021 and before, rollback the status


In [None]:
# response variable creation
# y_train: use lead to check if it's converted
# y_test: use lead_hist to check if it's converted

In [51]:
# all_lead_acs_county.to_csv("../data/model/all_lead_acs_model.csv",index = None)