In [None]:
import pandas as pd

In [None]:
# Check Differences in Two Lists
def list_membership(list1, list2):
    """Report the items that appear in only one of two lists.

    Prints both differences and returns them.

    Parameters
    ----------
    list1, list2 : iterable of hashable items
        The two collections to compare.

    Returns
    -------
    list
        ``[in_list1_not_list2, in_list2_not_list1]``. Each inner list is
        de-duplicated and keeps the order in which items first appeared in
        its source list.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set difference has arbitrary order).
    ordered1 = list(dict.fromkeys(list1))
    ordered2 = list(dict.fromkeys(list2))
    members1, members2 = set(ordered1), set(ordered2)

    list1_notlist2 = [item for item in ordered1 if item not in members2]
    list2_notlist1 = [item for item in ordered2 if item not in members1]

    print("In List 1 but not List 2:")
    print(list1_notlist2)
    print("")
    print("In List 2 not in List 1")
    print(list2_notlist1)
    print("")

    output_list = [list1_notlist2, list2_notlist1]
    return output_list

In [None]:
# Compare Two Datasets for the Specified Columns
# Requires ID columns and columns to be compared to have the same names
def compare_datasets(df1, df2, list_id_cols, list_cols_compare, 
                     df1_str_cols, df1_num_cols, 
                     df2_str_cols, df2_num_cols,
                     df1_date_cols, df2_date_cols,
                     df1_suffix, df2_suffix):
    """Compare selected columns of two DataFrames row by row, joined on ID columns.

    The ID columns and the compared columns must have the same names in both
    frames. Neither input frame is modified; type coercion is done on copies.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The two datasets to compare.
    list_id_cols : list of str
        Columns used as the join key.
    list_cols_compare : list of str
        Columns whose values are compared, one at a time.
    df1_str_cols, df2_str_cols : list of str
        Columns cast to ``str`` before comparing.
    df1_num_cols, df2_num_cols : list of str
        Columns passed through ``pd.to_numeric(errors='coerce')``.
    df1_date_cols, df2_date_cols : list of str
        Columns passed through ``pd.to_datetime(errors='coerce')``.
    df1_suffix, df2_suffix : str
        Suffixes appended to the compared column name in the joined output.

    Returns
    -------
    list of pandas.DataFrame
        Four frames per compared column, in the order of ``list_cols_compare``:
        ``[df1 left-join df2, df2 left-join df1,
        mismatches of the first, mismatches of the second]``.
        Each frame has the ID columns, the two suffixed value columns and a
        boolean ``match`` column. A row matches when both sides are equal or
        both are missing; a row with no partner in the other frame never
        matches.
    """
    # Work on coerced copies so the caller's frames keep their original dtypes
    # (and so slices passed in do not trigger SettingWithCopyWarning).
    df1 = _coerce_comparison_types(df1, df1_str_cols, df1_num_cols, df1_date_cols)
    df2 = _coerce_comparison_types(df2, df2_str_cols, df2_num_cols, df2_date_cols)

    id_cols = list(list_id_cols)
    output_list_comparisons = []

    for col_compare in list_cols_compare:
        print("Comparing " + col_compare + " now.")
        keep_cols = id_cols + [col_compare]

        df1_short = df1[keep_cols]
        df2_short = df2[keep_cols]

        df_join_1_2 = _join_and_flag(df1_short, df2_short, id_cols, col_compare,
                                     df1_suffix, df2_suffix)
        df_join_2_1 = _join_and_flag(df2_short, df1_short, id_cols, col_compare,
                                     df2_suffix, df1_suffix)

        output_list_comparisons.append(df_join_1_2)
        output_list_comparisons.append(df_join_2_1)

        output_list_comparisons.append(df_join_1_2[~df_join_1_2["match"]])
        output_list_comparisons.append(df_join_2_1[~df_join_2_1["match"]])
        print("")

    print("Function Completed.")
    return output_list_comparisons


def _coerce_comparison_types(df, str_cols, num_cols, date_cols):
    """Return a copy of ``df`` with the listed columns cast to str / numeric / datetime."""
    coerced = df.copy()
    for col in str_cols:
        coerced[col] = coerced[col].astype(str)
    # Column-wise coercion; unparseable values become NaN / NaT.
    for col in num_cols:
        coerced[col] = pd.to_numeric(coerced[col], errors='coerce')
    for col in date_cols:
        coerced[col] = pd.to_datetime(coerced[col], errors='coerce')
    return coerced


def _join_and_flag(left, right, id_cols, col_compare, left_suffix, right_suffix):
    """Left-join ``left`` to ``right`` on ``id_cols`` and add a boolean ``match`` column."""
    merge_flag = "_compare_merge_status"  # temporary indicator column, dropped below
    joined = left.merge(right, how="left", on=id_cols,
                        suffixes=(left_suffix, right_suffix),
                        indicator=merge_flag)

    left_vals = joined[col_compare + left_suffix]
    right_vals = joined[col_compare + right_suffix]

    # NaN != NaN, so a value missing on both sides needs an explicit check.
    both_missing = left_vals.isna() & right_vals.isna()
    # Rows without a partner on the right side are always mismatches.
    has_partner = joined[merge_flag] == "both"
    is_match = ((left_vals == right_vals) | both_missing) & has_partner

    joined = joined.drop(columns=merge_flag)
    # Nullable dtypes can yield <NA> from the comparison; count that as a mismatch.
    joined["match"] = is_match.fillna(False).astype(bool)
    return joined

In [None]:
# import datasets
# Load the four CSV extracts (client surveys and households, one pair per source).
sfdc_cs, sfdc_hh, sfl_cs, sfl_hh = (
    pd.read_csv(csv_path)
    for csv_path in ("sfdc_cs.csv", "sfdc_hh.csv", "sfl_cs.csv", "sfl_hh.csv")
)

In [None]:
# Print (rows, columns) for each loaded frame.
for frame in (sfdc_cs, sfdc_hh, sfl_cs, sfl_hh):
    print(frame.shape)

In [None]:
# Renaming / formatting: give the SFDC extracts the same column names as the
# sfl_* extracts and keep only the columns that will be compared.
# .copy() makes each column subset an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning; rename() without
# inplace keeps the cell a plain re-assignment.
sfl_cs = sfl_cs[["SURVEY_DATE", "FOLLOW_UP_STATUS_C", "SURVEY_ID", "LEAD_ADVISOR_C", "TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"]].copy()
sfdc_cs = sfdc_cs.rename(columns = {'CreatedDate': 'SURVEY_DATE', 'Follow_Up_Status__c': 'FOLLOW_UP_STATUS_C', 'Id': 'SURVEY_ID', 'Lead_Advisor__c': 'LEAD_ADVISOR_C',
       'Trust_Score__c': "TRUST_SCORE_C", 'X2016_Q2__c': 'SATISFACTION_SCORE', 'X2016_Q5__c': 'LIKELY_TO_RECOMMEND'})

sfdc_hh = sfdc_hh.rename(columns = {'Id': 'ID', 'Name': 'HOUSEHOLD_NAME', 'BeWELLthy_Intro_Date__c': 'BE_WELLTHY_INTRO_DATE_C', 'AnnualRevenue': 'CALCULATED_ANNUAL_REVENUE', 'Value_Stack__c': 'VALUE_STACK_C', 
                          'Client_Tier__c': 'CLIENT_TIER_C', 'Client_Engagement_Rank__c': 'CLIENT_ENGAGEMENT_RANK_C', 'Days_Since_Last_Client_Review__c': 'DSLCR', 'Days_Last_Contact__c': 'DSLC', 
                          'Associate_Advisor_Lookup__c': 'ASSOCIATE_ADVISOR', 'Lead_Advisor_Lookup__c': 'LEAD_ADVISOR', 'Client_Service_Coordinator_Lookup__c': 'ANALYST', 'Wealth_Start_Date__c': 'WEALTH_START_DATE_C',})
sfdc_hh = sfdc_hh[['ID', 'HOUSEHOLD_NAME', 'BE_WELLTHY_INTRO_DATE_C',
       'CALCULATED_ANNUAL_REVENUE', 'VALUE_STACK_C', 'CLIENT_TIER_C',
       'CLIENT_ENGAGEMENT_RANK_C', 'DSLCR', 'DSLC', 'ASSOCIATE_ADVISOR',
       'LEAD_ADVISOR', 'ANALYST', 'WEALTH_START_DATE_C']].copy()

In [None]:
# Show the column labels of sfl_cs after the column selection above.
sfl_cs.columns

In [None]:
# Show the column labels of sfdc_cs after the rename above.
sfdc_cs.columns

In [None]:
# Display sfdc_cs before it is filtered to the SURVEY_IDs present in sfl_cs.
sfdc_cs

In [None]:
# Keep only the SFDC client surveys whose SURVEY_ID also appears in sfl_cs.
# .copy() makes the filtered result an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning.
sfdc_cs = sfdc_cs[sfdc_cs["SURVEY_ID"].isin(sfl_cs["SURVEY_ID"])].copy()

In [None]:
# Display sfdc_cs again, now restricted to SURVEY_IDs that also exist in sfl_cs.
sfdc_cs

In [None]:
# Show the column labels of sfl_hh (loaded as-is from sfl_hh.csv).
sfl_hh.columns

In [None]:
# Show the column labels of sfdc_hh after the rename and column selection above.
sfdc_hh.columns

In [None]:
# Compare the client-survey columns of sfl_cs (suffix "_snow") against
# sfdc_cs (suffix "_sfdc"), joined on SURVEY_ID.
cs_output = compare_datasets(
    df1=sfl_cs,
    df2=sfdc_cs,
    list_id_cols=["SURVEY_ID"],
    list_cols_compare=["SURVEY_DATE", "FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C", "TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df1_str_cols=["FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C"],
    df1_num_cols=["TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df2_str_cols=["FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C"],
    df2_num_cols=["TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df1_date_cols=["SURVEY_DATE"],
    df2_date_cols=["SURVEY_DATE"],
    df1_suffix="_snow",
    df2_suffix="_sfdc",
)

In [None]:
# Compare the household columns of sfl_hh (suffix "_snow") against
# sfdc_hh (suffix "_sfdc"), joined on ID.
hh_output = compare_datasets(
    df1=sfl_hh,
    df2=sfdc_hh,
    list_id_cols=["ID"],
    list_cols_compare=['HOUSEHOLD_NAME', 'BE_WELLTHY_INTRO_DATE_C',
                       'CALCULATED_ANNUAL_REVENUE', 'VALUE_STACK_C', 'CLIENT_TIER_C',
                       'CLIENT_ENGAGEMENT_RANK_C', 'DSLCR', 'DSLC', 'ASSOCIATE_ADVISOR',
                       'LEAD_ADVISOR', 'ANALYST', 'WEALTH_START_DATE_C'],
    df1_str_cols=['HOUSEHOLD_NAME', 'VALUE_STACK_C', 'CLIENT_TIER_C',
                  'CLIENT_ENGAGEMENT_RANK_C', 'ASSOCIATE_ADVISOR',
                  'LEAD_ADVISOR', 'ANALYST'],
    df1_num_cols=['CALCULATED_ANNUAL_REVENUE', 'DSLCR', 'DSLC'],
    df2_str_cols=['HOUSEHOLD_NAME', 'VALUE_STACK_C', 'CLIENT_TIER_C',
                  'CLIENT_ENGAGEMENT_RANK_C', 'ASSOCIATE_ADVISOR',
                  'LEAD_ADVISOR', 'ANALYST'],
    df2_num_cols=['CALCULATED_ANNUAL_REVENUE', 'DSLCR', 'DSLC'],
    df1_date_cols=['BE_WELLTHY_INTRO_DATE_C', 'WEALTH_START_DATE_C'],
    df2_date_cols=['BE_WELLTHY_INTRO_DATE_C', 'WEALTH_START_DATE_C'],
    df1_suffix="_snow",
    df2_suffix="_sfdc",
)

In [None]:
# Print each cs_output entry's list index followed by its column labels.
for counter, frame in enumerate(cs_output):
    print(f"{counter}: ")
    print(frame.columns)
# Leave the counter reset to zero once the listing is done.
counter = 0

In [None]:
# compare_datasets appends four frames per compared column (join 1->2, join 2->1,
# mismatches 1->2, mismatches 2->1). Index 6 is therefore the third frame of the
# second compared column, FOLLOW_UP_STATUS_C: rows of the sfl_cs -> sfdc_cs
# left join whose "match" flag is False.
cs_output[6]

In [None]:
# compare_datasets appends four frames per compared column (join 1->2, join 2->1,
# mismatches 1->2, mismatches 2->1). Index 7 is therefore the fourth frame of the
# second compared column, FOLLOW_UP_STATUS_C: rows of the sfdc_cs -> sfl_cs
# left join whose "match" flag is False.
cs_output[7]

In [None]:
import datetime

In [None]:
# df[df.the_date_column > datetime.datetime.now() - pd.to_timedelta("30day")]
# datetime.now().astimezone()

In [None]:
# Load the newer pair of client-survey CSV extracts.
sfdc_cs_new, sfl_cs_new = (
    pd.read_csv(csv_path)
    for csv_path in ("sfdc_cs_new.csv", "sfl_cs_new.csv")
)


In [None]:
# Print (rows, columns) for each newly loaded frame.
for frame in (sfdc_cs_new, sfl_cs_new):
    print(frame.shape)

In [None]:
# Keep only the compared columns of sfl_cs_new and rename the SFDC columns to
# the same names. .copy() makes the column subset an independent frame, so
# later column assignments on it do not raise SettingWithCopyWarning;
# rename() without inplace keeps the cell a plain re-assignment.
sfl_cs_new = sfl_cs_new[["SURVEY_DATE", "FOLLOW_UP_STATUS_C", "SURVEY_ID", "LEAD_ADVISOR_C", "TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"]].copy()
sfdc_cs_new = sfdc_cs_new.rename(columns = {'CreatedDate': 'SURVEY_DATE', 'Follow_Up_Status__c': 'FOLLOW_UP_STATUS_C', 'Id': 'SURVEY_ID', 'Lead_Advisor__c': 'LEAD_ADVISOR_C',
       'Trust_Score__c': "TRUST_SCORE_C", 'X2016_Q2__c': 'SATISFACTION_SCORE', 'X2016_Q5__c': 'LIKELY_TO_RECOMMEND'})


In [None]:
# Keep only the SFDC client surveys whose SURVEY_ID also appears in sfl_cs_new.
# .copy() makes the filtered result an independent frame, so later column
# assignments on it do not raise SettingWithCopyWarning.
sfdc_cs_new = sfdc_cs_new[sfdc_cs_new["SURVEY_ID"].isin(sfl_cs_new["SURVEY_ID"])].copy()

In [None]:
# Print (rows, columns) again, after the SURVEY_ID filter on sfdc_cs_new.
for frame in (sfdc_cs_new, sfl_cs_new):
    print(frame.shape)

In [None]:
# Display sfl_cs_new, restricted to the compared columns.
sfl_cs_new

In [None]:
# Left-join the SFDC surveys onto sfl_cs_new by SURVEY_ID; columns present in
# both frames get the "_sfl" / "_sfdc" suffixes.
joined_cs = pd.merge(
    left=sfl_cs_new,
    right=sfdc_cs_new,
    how="left",
    on="SURVEY_ID",
    suffixes=('_sfl', '_sfdc'),
)

In [None]:
# Display the side-by-side join of sfl_cs_new and sfdc_cs_new.
joined_cs

In [None]:
# Compare the client-survey columns of sfl_cs_new (suffix "_snow") against
# sfdc_cs_new (suffix "_sfdc"), joined on SURVEY_ID.
cs_output_new = compare_datasets(
    df1=sfl_cs_new,
    df2=sfdc_cs_new,
    list_id_cols=["SURVEY_ID"],
    list_cols_compare=["SURVEY_DATE", "FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C", "TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df1_str_cols=["FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C"],
    df1_num_cols=["TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df2_str_cols=["FOLLOW_UP_STATUS_C", "LEAD_ADVISOR_C"],
    df2_num_cols=["TRUST_SCORE_C", "SATISFACTION_SCORE", "LIKELY_TO_RECOMMEND"],
    df1_date_cols=["SURVEY_DATE"],
    df2_date_cols=["SURVEY_DATE"],
    df1_suffix="_snow",
    df2_suffix="_sfdc",
)

In [None]:
# compare_datasets appends four frames per compared column (join 1->2, join 2->1,
# mismatches 1->2, mismatches 2->1). Index 7 is therefore the fourth frame of the
# second compared column, FOLLOW_UP_STATUS_C: rows of the sfdc_cs_new -> sfl_cs_new
# left join whose "match" flag is False.
cs_output_new[7]