In [1]:
from sqlalchemy import create_engine, event
import pyodbc
import urllib.parse
import time
import pandas as pd
import numpy as np

# Inputs & Outputs

In [2]:
# SQL database
# Raw string literal: the backslash in the server name has to reach the ODBC
# driver unchanged, and '\R' inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
server = r'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
# Trusted_Connection=yes: no user name or password is embedded in the string.
odbc_connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes'
#input table
sql_table = "linkedclaims_casuals_2018m04"
#output table (when equal to the input table, the procedure cell stages the
#result in a '_'-prefixed table and copies it back)
sql_table_out = "linkedclaims_casuals_2018m04_v2"

#input data dictionary file
ddictionaryfilename = "D:/Data/linkedclaims_casuals_2018m04_variables_types.csv"
#output data dictionary file
ddictionaryfilename_out = "D:/Data/linkedclaims_casuals_2018m04_v2_variables_types.csv"

## Define variables for joining in SQL
# The data dictionary CSV has one row per variable; the UID / EventStart /
# EventEnd columns hold 1 on the row of the variable playing that role.
# Each lookup takes the first flagged row and raises IndexError if none is flagged.
datadictionary = pd.read_csv(ddictionaryfilename)

uid_rows = datadictionary.UID == 1
variable_uid = datadictionary.Variable[uid_rows].tolist()[0]

event_start_rows = datadictionary.EventStart == 1
variable_event_start = datadictionary.Variable[event_start_rows].tolist()[0]

event_end_rows = datadictionary.EventEnd == 1
variable_event_end = datadictionary.Variable[event_end_rows].tolist()[0]

# The event-type column is fixed here rather than read from the dictionary.
variable_event_type = 'hist_lr'

# Source columns to rank, the rank column created from each one, and its type.
# variable, new_variable, new_variable_type and ranking_rule are parallel lists:
# position i in each of them describes the same variable.
variable = ['occupation', 'LM_code', 'ada_code', 'family_flag', 'marital_status']
new_variable = [source_column + '_rank' for source_column in variable]
new_variable_type = ['Num'] * len(variable)

# ranking_rule[i] is a list of single-entry dicts {rank: [category label]}.
# Several labels may share a rank (e.g. rank 4 and rank 6 for LM_code), which
# is why each label gets its own entry. "NULL" is ranked as NaN everywhere.
ranking_rule = [
    [{rank: [label]} for rank, label in ranked_labels]
    for ranked_labels in (
        # occupation
        (
            (10, "ManagersAndAdministrators"),
            (9, "ProfessionalOccupations"),
            (8, "AssociateProfessionalAndTechnicalOccupations"),
            (7, "ClericalAndSecretarialOccupations"),
            (6, "CraftAndRelatedOccupations"),
            (5, "PersonalAndProtectiveServiceOccupations"),
            (4, "SalesAndCustomerServiceOccupations"),
            (3, "PlantAndMachineOperatives"),
            (2, "OtherOccupations"),
            (1, "UnknownOrNotStatedOccupationOrThoseWhoNeverWorked"),
            (np.nan, "NULL"),
        ),
        # LM_code
        (
            (1, 'Employment'),
            (3, 'Part-Time'),
            (4, 'Activation_and_Emp_Subs'),
            (4, 'Education_and_Training'),
            (4, 'Income_Subsidies'),
            (5, 'One_Parent_Family'),
            (6, 'Credits'),
            (6, 'Unemp_Benefit_Assistance'),
            (np.nan, "NULL"),
        ),
        # ada_code
        (
            (11, 'SpouseEarnings_bigger_than_Euro_400'),
            (10, 'SpouseEarnings_between_Euro_310_400'),
            (9, 'SpouseEarnings_bigger_than_Euro_310'),
            (8, 'SpouseEarnings_between_Euro_100_310'),
            (7, 'SpouseEarnings_less_than_Euro_100'),
            (6, 'SpouseHasEarningsPreviously_E_P_T'),
            (5, 'SpouseNoIncome'),
            (4, 'SpouseOnSocialWelfare'),
            (3, 'SpouseUnknown'),
            (2, 'NoSpouse_SingleSeparatedDivorcedWidowed'),
            (1, 'NoDetails'),
            (np.nan, "NULL"),
        ),
        # family_flag
        (
            (4, 'ADA and CDAs'),
            (3, 'ADA only'),
            (2, 'CDAs only'),
            (1, 'No ADA, no CDAs'),
            (np.nan, "NULL"),
        ),
        # marital_status
        (
            (3, 'Widowed'),
            (2, 'Married'),
            (1, 'Single'),
            (np.nan, "NULL"),
        ),
    )
]

# Procedure

In [3]:
# Load Data
print('Loading Data from : %s' %(sql_table))
## Connect to SQL
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)

# Only the columns to be ranked plus the four key columns are fetched; the
# keys are what the ranked values are later joined back on.
select_columns = list(variable) + [
    variable_uid,
    variable_event_start,
    variable_event_end,
    variable_event_type,
]
sql_query_string = "SELECT " + (',').join(select_columns) + " FROM " + sql_table

# pandas checks a connection out of the engine for the query on its own, so no
# separate raw connection is opened here, and no fast_executemany listener is
# registered: that hook only acts on executemany batches, which a single
# SELECT never produces. The engine's pooled connections are released whether
# or not the query succeeds.
try:
    df = pd.read_sql_query(sql_query_string, engine)
finally:
    engine.dispose()

# Create one rank column per configured source column.
# Fail before touching df if the parallel configuration lists are misaligned.
if not (len(variable) == len(new_variable) == len(ranking_rule)):
    raise ValueError('variable, new_variable and ranking_rule must have the same length')

for var, new_var, rule in zip(variable, new_variable, ranking_rule):
    print('Creating new variable %s from %s' %(new_var, var))

    print('Creating Renamig Dictionaries')
    # Flatten the rule (a list of {rank: [labels]} dicts) into a single
    # {label: rank} lookup. setdefault keeps the first rank listed for a
    # label, so a label that appears twice resolves to its earliest entry.
    rank_by_label = {}
    for entry in rule:
        for rank, labels in entry.items():
            for label in labels:
                rank_by_label.setdefault(label, rank)

    ## Replace values
    print('Renamig\n')
    # One replace on this column only, instead of one whole-frame replace per
    # distinct value. Values without a rule entry are left unchanged.
    if rank_by_label:
        df[new_var] = df[var].replace(rank_by_label)
    else:
        df[new_var] = df[var]

# Load into SQL
# When input and output are the same table, the result is first built under
# a '_'-prefixed name; same_table_in_out records that it must be copied back.
same_table_in_out = (sql_table == sql_table_out)
if same_table_in_out:
    sql_table_out = '_{}'.format(sql_table_out)
# Staging table that receives the uploaded rank columns.
sql_table_temp = '{}_temp'.format(sql_table_out)

## Connect to SQL
# The timer starts before connecting, so the reported time includes connection setup.
mytime = time.time()
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(params))
# Raw DBAPI connection and cursor, used for the hand-written SQL statements below.
conn = engine.connect().connection
cursor = conn.cursor()


@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    """Switch on fast_executemany on the cursor for executemany batches.

    Registered on `engine` for the 'before_cursor_execute' event; intended to
    speed up multi-row inserts. Single-statement executions are left untouched.
    """
    if not executemany:
        return
    cursor.fast_executemany = True

## Drop table if exists
# Remove a staging table left behind by an earlier run before uploading.
sql_string_drop = "IF OBJECT_ID('{0}', 'U') IS NOT NULL\nDROP TABLE {0}".format(sql_table_temp)
cursor.execute(sql_string_drop)
conn.commit()

## upload data
print('\nUploading to Temp Table SQL: {}'.format(sql_table_temp))
# The source columns are dropped from df (in place) so that only the new rank
# columns and the key columns are uploaded.
df.drop(columns=variable, inplace=True)
df.to_sql(name=sql_table_temp, con=engine, if_exists='append', index=False)

## Left Join
## Drop table if exists
sql_string_drop = "IF OBJECT_ID('{0}', 'U') IS NOT NULL\nDROP TABLE {0}".format(sql_table_out)
cursor.execute(sql_string_drop)
conn.commit()

print('\nJoining Tables into {}'.format(sql_table_out))
# Key columns of the staging table are aliased (id, s, e, t) so they can be
# told apart from the input table's own key columns after SELECT A.*, B.*.
join_keys = [
    (variable_uid, 'id'),
    (variable_event_start, 's'),
    (variable_event_end, 'e'),
    (variable_event_type, 't'),
]
aliased_keys = ','.join('%s AS %s' % (column, alias) for column, alias in join_keys)
# NOTE(review): keys are compared with '=', so a row whose key value is NULL
# will not match its staging row and gets NULL ranks - confirm this is intended.
on_clause = ' AND '.join(' A.%s = B.%s' % (column, alias) for column, alias in join_keys)
sql_left_join = (
    'SELECT A.*, B.* INTO ' + sql_table_out + '\n'
    + ' FROM ' + sql_table + ' AS A LEFT JOIN '
    + '(SELECT ' + ','.join(new_variable) + ',' + aliased_keys
    + ' FROM ' + sql_table_temp + ') AS B \n'
    + ' ON' + on_clause
)

cursor.execute(sql_left_join)
conn.commit()

# Drop id, s, e, t from joined table (the aliased copies of the join keys)
sql_drop_column = "ALTER TABLE %s DROP COLUMN %s" % (
    sql_table_out,
    ', '.join(alias for _, alias in join_keys),
)
cursor.execute(sql_drop_column)
conn.commit()

## Drop temp table if exists
print('\nDrop Temp Table')
sql_string_drop = "IF OBJECT_ID('{0}', 'U') IS NOT NULL\nDROP TABLE {0}".format(sql_table_temp)
cursor.execute(sql_string_drop)
conn.commit()

## rename out if required
# Only when input and output were the same table: replace the input table with
# the staged result, then remove the staged table. Each step is committed
# separately, in this order: drop input, copy staged -> input, drop staged.
if same_table_in_out:
    print('Copying into output table: {}'.format(sql_table))
    drop_if_exists = "IF OBJECT_ID('{0}', 'U') IS NOT NULL\nDROP TABLE {0}"
    sql_copy = 'SELECT * INTO {} FROM {}'.format(sql_table, sql_table_out)
    copy_back_statements = (
        drop_if_exists.format(sql_table),
        sql_copy,
        drop_if_exists.format(sql_table_out),
    )
    for statement in copy_back_statements:
        cursor.execute(statement)
        conn.commit()


#Close SQL Connection
conn.close()

# Report the time elapsed since mytime was taken, formatted as HH:MM:SS.
elapsed_time = time.time() - mytime
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print('SQL Process time: {}'.format(elapsed_hms))

# UPDATE DATADICTIONARY FILE WITH NEW COL NAME/TYPE/SELECTED

print('\nUpdate Data Dictionary File: %s' %(ddictionaryfilename_out))
# One dictionary row per new rank column, flagged for both summaries.
# The rows are built in one go and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
new_dictionary_rows = pd.DataFrame([
    {'Variable': new_var, 'Type': new_var_type, 'TotalSummary': 1, 'EpisodeSummary': 1}
    for new_var, new_var_type in zip(new_variable, new_variable_type)
])
# Columns the new rows do not set are left empty (NaN) for those rows.
datadictionary = pd.concat([datadictionary, new_dictionary_rows], ignore_index=True)

datadictionary.to_csv(ddictionaryfilename_out, index=False)

print ('\nALL DONE')

Loading Data from : linkedclaims_casuals_2018m04
Creating new variable occupation_rank from occupation
Creating Renamig Dictionaries
Renamig

Creating new variable LM_code_rank from LM_code
Creating Renamig Dictionaries
Renamig

Creating new variable ada_code_rank from ada_code
Creating Renamig Dictionaries
Renamig

Creating new variable family_flag_rank from family_flag
Creating Renamig Dictionaries
Renamig

Creating new variable marital_status_rank from marital_status
Creating Renamig Dictionaries
Renamig


Uploading to Temp Table SQL: linkedclaims_casuals_2018m04_v2_temp

Joining Tables into linkedclaims_casuals_2018m04_v2

Drop Temp Table
SQL Process time: 00:38:39

Update Data Dictionary File: D:/Data/linkedclaims_casuals_2018m04_v2_variables_types.csv

ALL DONE
