In [1]:
import pandas as pd
# import pandas.io.sql as psql
# import numpy as np
from sqlalchemy import create_engine, event
import urllib.parse
import time
# import gc

# Inputs & Outputs

In [2]:
# Parameters for Joining data
# Earnings columns are selected for the n_year calendar years immediately before `year`.
year = 2016
n_year = 5

# SQL Connection Params
# Raw string so the backslash in the server name is kept literally instead of
# being parsed as the (invalid) escape sequence "\R".
server = r'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
# ODBC connection string using Windows integrated authentication (no credentials stored here)
odbc_connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes'

# SQL Tables
# NOTE: table and column names below are concatenated directly into SQL text
# further down, so they must remain trusted, hard-coded identifiers.
# LEFT TABLE
main_sql_table = "linkedclaims_casuals_2018m04_v2_flat_20161001"
l_uid_var = 'ppsn'  # join key column in the LEFT table

# RIGHT TABLE
join_sql_table = 'new_earnings_11jul2018'
# Column-name prefixes; a four-digit year is appended to each to form the source column name
income_var = ['Class_A_Earn','Class_S_Earn','Class_Other_Earn','Class_A_weeks']
r_uid_var = 'RSI_NO'  # join key column in the RIGHT table

# OUTPUT TABLE FOR LEFT JOIN
new_sql_table = main_sql_table + "_with_income"


# FIX MISSING VALUES IN RIGHT TABLE AND LEFT JOIN

In [3]:
def replace_last(source_string, replace_what, replace_with):
    """Replace the last occurrence of ``replace_what`` in ``source_string``.

    Args:
        source_string: Text to search.
        replace_what: Non-empty substring whose final occurrence is replaced.
        replace_with: Text substituted for that occurrence.

    Returns:
        A new string with only the last occurrence replaced, or
        ``source_string`` unchanged when ``replace_what`` does not occur.

    Raises:
        ValueError: If ``replace_what`` is an empty string (raised by
            ``str.rpartition``).
    """
    head, sep, tail = source_string.rpartition(replace_what)
    if not sep:
        # rpartition reports "not found" as ('', '', source_string); without
        # this guard replace_with would be prepended to the untouched string.
        return source_string
    return head + replace_with + tail

# Start the timer used for the elapsed-time report at the end of the cell
proctime = time.time()

# Years to join: the n_year years before `year`, most recent first
myyear = [year - offset for offset in range(1, n_year + 1)]

# Source column names: the join key followed by every <income variable><year> combination
myincome_vars = [r_uid_var] + [
    str(prefix) + str(calendar_year)
    for prefix in income_var
    for calendar_year in myyear
]
print('Selecting following variables:')
print (myincome_vars)

# One single-entry dict per source column, mapping <variable><calendar year>
# to <variable><lag>, where lag 0 is the most recent year in myyear
income_var_rename = [
    {str(prefix) + str(calendar_year): str(prefix) + str(lag)}
    for prefix in income_var
    for lag, calendar_year in enumerate(myyear)
]
print('Income variables will be renamed as follows:')
print(income_var_rename)
        
# Connect to SQL
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
# Raw DBAPI connection: everything below is driver-level SQL run through a cursor
conn = engine.raw_connection()
try:
    cursor = conn.cursor()

    # ## FIX MISSING VALUES IN RIGHT TABLE
    # # Update values to appropriately fix missing values:
    # # '.' ---> NULL
    # # -1 ---> 0
    print ('Replace Missing Values in RIGHT table')
    for elem in income_var_rename:
        for column in elem:
            # Each statement is built from scratch. Appending to one growing
            # string would make every statement after the first invalid SQL.
            fix_statements = (
                # SQL NULL, not the four-character string 'NULL'
                "update " + join_sql_table + " set " + column + "=NULL where " + column + "='.'",
                "update " + join_sql_table + " set " + column + "='0' where " + column + "='-1'",
            )
            for sql_update in fix_statements:
                try:
                    cursor.execute(sql_update)
                    conn.commit()
                except Exception as err:
                    # Best effort: a column may not accept this comparison or
                    # value. Report the failure and continue with the rest
                    # instead of silently swallowing it.
                    conn.rollback()
                    print ('\tSkipped "%s": %s' % (sql_update, err))

    # Delete Table if exists
    print ('Drop new table if exists')
    sql_drop_table = "IF OBJECT_ID('"+ new_sql_table + "', 'U') IS NOT NULL" +'\n'+ "DROP TABLE " + new_sql_table
    cursor.execute(sql_drop_table)
    conn.commit()

    # Prepare for Left Join
    print ('LEFT JOIN')
    # Select list for the RIGHT table: the join key, then "<old> as <new>" for each income column
    select_rename_vars_join_table = ','.join(
        [r_uid_var] +
        [old_name + ' as ' + new_name
         for elem in income_var_rename
         for old_name, new_name in elem.items()]
    )

    sql_left_join = 'SELECT A.*, B.* INTO '+ new_sql_table + '\n' +\
                    ' FROM ' + main_sql_table + ' AS A LEFT JOIN ' + \
                    '(SELECT '+ select_rename_vars_join_table + ' FROM ' + join_sql_table +') AS B \n' + \
                    ' ON A.'+l_uid_var+' = B.'+r_uid_var

    # Left Join
    cursor.execute(sql_left_join)
    conn.commit()

    # Drop r_uid_var: B.* copied the RIGHT table's key into the new table
    print ('Drop column %s in new table' %(r_uid_var))
    sql_drop_column = "ALTER TABLE " + new_sql_table + " DROP COLUMN " + r_uid_var
    cursor.execute(sql_drop_column)
    conn.commit()
finally:
    # Release the connection even when one of the statements above raises
    conn.close()

elapsed_time = time.time() - proctime
print ('\tProcedure time: '+ time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

print ('\nALL DONE')

Selecting following variables:
['RSI_NO', 'Class_A_Earn2015', 'Class_A_Earn2014', 'Class_A_Earn2013', 'Class_A_Earn2012', 'Class_A_Earn2011', 'Class_S_Earn2015', 'Class_S_Earn2014', 'Class_S_Earn2013', 'Class_S_Earn2012', 'Class_S_Earn2011', 'Class_Other_Earn2015', 'Class_Other_Earn2014', 'Class_Other_Earn2013', 'Class_Other_Earn2012', 'Class_Other_Earn2011', 'Class_A_weeks2015', 'Class_A_weeks2014', 'Class_A_weeks2013', 'Class_A_weeks2012', 'Class_A_weeks2011']
Income variables will be renamed as follows:
[{'Class_A_Earn2015': 'Class_A_Earn0'}, {'Class_A_Earn2014': 'Class_A_Earn1'}, {'Class_A_Earn2013': 'Class_A_Earn2'}, {'Class_A_Earn2012': 'Class_A_Earn3'}, {'Class_A_Earn2011': 'Class_A_Earn4'}, {'Class_S_Earn2015': 'Class_S_Earn0'}, {'Class_S_Earn2014': 'Class_S_Earn1'}, {'Class_S_Earn2013': 'Class_S_Earn2'}, {'Class_S_Earn2012': 'Class_S_Earn3'}, {'Class_S_Earn2011': 'Class_S_Earn4'}, {'Class_Other_Earn2015': 'Class_Other_Earn0'}, {'Class_Other_Earn2014': 'Class_Other_Earn1'}, {'C