In [1]:
from sqlalchemy import create_engine, event
import pyodbc
import urllib.parse
import time
import pandas as pd
import numpy as np

# Inputs

In [2]:
# JLD-like episode/log file (the CSV that is read in chunks and loaded into SQL)
jld_csv_filename = "D:/Data/linkedclaims_casuals_2018m04.csv"
# jld_csv_filename = "D:/Data/new_earnings_11jul2018.csv"

# Data dictionary for the file above: one row per variable, read for its
# 'Variable', 'Type' and 'Format' columns plus the selection flag columns
jld_datadictionary_filename = "D:/Data/linkedclaims_casuals_2018m04_variables_types.csv"
# jld_datadictionary_filename = "D:/Data/new_earnings_11jul2018_variables_types.csv"

# Variable mapping file. SET TO None IF YOU DO NOT NEED/USE IT
jld_variablemapping_filename = "D:/Data/linkedclaims_casuals_2018m04_variables_mapping.csv"
# jld_variablemapping_filename = None

# SQL database
# Raw string: the instance name contains a backslash, and '\R' is not a valid
# escape sequence (newer Python versions warn about it in a normal string).
server = r'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
odbc_connection_string = ';'.join([
    'DRIVER={SQL Server Native Client 11.0}',
    'SERVER=' + server,
    'DATABASE=' + db,
    'Trusted_Connection=yes',
])


# Output table
sql_table = "linkedclaims_casuals_2018m04"
# sql_table = "new_earnings_11jul2018"

# Set to True if you want to subset (i.e. select only) the variables flagged in the data dictionary
subset_variables = False

# Select Variables, Map Values, Cast Types & Load into SQL

In [3]:
def cast_stype(dframe, dictionary):
    """Cast the columns of ``dframe`` to the types declared in ``dictionary``.

    ``dframe`` is modified in place and also returned.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Data to convert. Every missing value is first replaced by the string
        'NULL'; columns that are not listed in ``dictionary`` keep that
        placeholder.
    dictionary : pandas.DataFrame
        One row per variable with at least the columns 'Variable' and 'Type'
        ('Num', 'Date' or 'Char'). 'Format' is read only for 'Date' rows and
        is passed to ``pd.to_datetime`` as the ``format`` argument.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``dframe``.
    """
    dframe.fillna('NULL', inplace=True)

    # Set lookup instead of re-scanning the whole dictionary for every column.
    available = set(dframe.columns)

    for _, spec in dictionary.iterrows():
        column = spec['Variable']
        if column not in available:
            continue

        kind = spec['Type']
        if kind == 'Num':
            # Assign the result back rather than calling replace(inplace=True)
            # on dframe[column]: that chained in-place call acts on a temporary
            # and is a no-op under pandas Copy-on-Write, which would leave the
            # 'NULL' placeholders in and make to_numeric fail.
            without_placeholder = dframe[column].replace(to_replace='NULL', value=np.nan)
            dframe[column] = pd.to_numeric(without_placeholder)
        elif kind == 'Date':
            # errors='coerce' turns unparseable values (including 'NULL') into NaT
            dframe[column] = pd.to_datetime(dframe[column], format=spec['Format'], errors='coerce')
        elif kind == 'Char':
            # NOTE(review): on Python 3 this produces bytes objects, not str.
            # Kept as-is to preserve what is loaded; confirm this is intended.
            dframe[column] = dframe[column].apply(lambda x: x.encode('utf-8').strip())
    return dframe

def map_values (df, mapping):
    """Recode values of ``df`` in place according to the rules in ``mapping``.

    Missing values in ``df`` are first replaced by the string 'NULL'. Each row
    of ``mapping`` is then applied in order: within the column named by
    'Variable', occurrences of 'OriginalValue' become 'TargetValue'. Rows that
    name a column not present in ``df`` are skipped. Nothing is returned.
    """
    df.fillna('NULL', inplace=True)
    known_columns = df.columns
    for _, rule in mapping.iterrows():
        target_column = rule['Variable']
        if target_column not in known_columns:
            continue
        # Nested dict form: {column: {old_value: new_value}}
        recode = {target_column: {rule['OriginalValue']: rule['TargetValue']}}
        df.replace(to_replace=recode, inplace=True)

# Connect to SQL
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
conn = engine.connect().connection

# Drop the output table if it already exists, so the load starts from an empty table.
# The connection is closed in `finally` so it is released even if the DROP fails.
try:
    cursor = conn.cursor()
    # sql_table comes from the inputs cell above (not from user input); identifiers
    # cannot be passed as bound parameters, hence the string concatenation.
    sql_string_drop = "IF OBJECT_ID('"+ sql_table+ "', 'U') IS NOT NULL" +'\n'+ "DROP TABLE " + sql_table
    cursor.execute(sql_string_drop)
    conn.commit()
finally:
    conn.close()


# Read from data dictionary which variables to select for upload.
# A variable is selected when at least one of the flag columns below equals 1.
datadictionary = pd.read_csv(jld_datadictionary_filename)
selection_flag_columns = [
    'EventStart',
    'EventEnd',
    'DOB',
    'UID',
    'Pinfo',
    'TotalSummary',
    'EpisodeSummary',
]
# Series.append was removed in pandas 2.0, so build a single boolean mask
# instead of appending one filtered Series per flag.
is_selected = (datadictionary[selection_flag_columns] == 1).any(axis=1)
# drop_duplicates keeps data-dictionary order, so the selected column order is
# the same on every run (a plain set() gives an arbitrary order).
selected_variables = datadictionary.loc[is_selected, 'Variable'].drop_duplicates().tolist()

# Read the variables mapping; it stays None when no mapping file is configured
if jld_variablemapping_filename in ('', None):
    variables_mapping = None
else:
    variables_mapping = pd.read_csv(jld_variablemapping_filename)

# Start the wall-clock timer for the whole load
start_time = time.time()


def _hms(seconds):
    """Format a duration given in seconds as HH:MM:SS."""
    return time.strftime("%H:%M:%S", time.gmtime(seconds))


# Connect to SQL. The engine is created ONCE and reused for every chunk;
# creating a new engine (with its own connection pool) per chunk and never
# disposing it leaves connections lingering on the server.
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)


# Speed-up for multi-row inserts: switch on pyodbc's fast_executemany
# whenever SQLAlchemy is about to run an executemany statement.
@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    if executemany:
        cursor.fast_executemany = True


# Read CSV in chunks and upload into SQL
csv_chunksize = 500000   # rows read from the CSV per iteration
sql_chunksize = 100000   # rows written per batch by to_sql
count = 0
processed = 0
try:
    # dtype=object: every column is read as-is; typing is done by cast_stype
    for count, chunk in enumerate(
            pd.read_csv(jld_csv_filename, chunksize=csv_chunksize, dtype=object), start=1):
        print ('Iteration %d, processing %d rows' %(count, len(chunk)))

        if subset_variables:
            print('Selecting Variables')
            mytime = time.time()
            mydf = chunk[selected_variables].copy()
            print ('\tProcedure time: '+ _hms(time.time() - mytime))
        else:
            mydf = chunk

        if variables_mapping is not None:
            print('Mapping Variables Values')
            mytime = time.time()
            map_values(mydf, variables_mapping)
            print ('\tProcedure time: '+ _hms(time.time() - mytime))

        print('Casting Variables Types')
        mytime = time.time()
        mydf = cast_stype(mydf, datadictionary)
        print ('\tProcedure time: '+ _hms(time.time() - mytime))

        print('Uploading to SQL')
        mytime = time.time()
        # Append this chunk to the output table
        mydf.to_sql(sql_table, engine, if_exists='append', index=False, chunksize=sql_chunksize)
        print ('\tProcedure time: '+ _hms(time.time() - mytime))

        processed = processed + len(chunk)
        print ("Iteration = %d, Processed %d rows" %(count,processed))
        print ('Elapsed time: '+ _hms(time.time() - start_time))
        print ('\n')
finally:
    # Release the pooled SQL connections even if a chunk fails
    engine.dispose()


print ("\nTotal Processed rows: %d" %(processed))
elapsed_time = time.time() - start_time
print ('Total Elapsed time: '+ _hms(elapsed_time))

print ('\nALL DONE')

Iteration 1, processing 500000 rows
Mapping Variables Values
	Procedure time: 00:00:20
Casting Variables Types
	Procedure time: 00:00:18
Uploading to SQL
	Procedure time: 00:01:41
Iteration = 1, Processed 500000 rows
Elapsed time: 00:02:23


Iteration 2, processing 500000 rows
Mapping Variables Values
	Procedure time: 00:00:20
Casting Variables Types
	Procedure time: 00:00:18
Uploading to SQL
	Procedure time: 00:01:46
Iteration = 2, Processed 1000000 rows
Elapsed time: 00:04:51


Iteration 3, processing 500000 rows
Mapping Variables Values
	Procedure time: 00:00:22
Casting Variables Types
	Procedure time: 00:00:19
Uploading to SQL
	Procedure time: 00:01:48
Iteration = 3, Processed 1500000 rows
Elapsed time: 00:07:26


Iteration 4, processing 500000 rows
Mapping Variables Values
	Procedure time: 00:00:20
Casting Variables Types
	Procedure time: 00:00:18
Uploading to SQL
	Procedure time: 00:01:47
Iteration = 4, Processed 2000000 rows
Elapsed time: 00:09:56


Iteration 5, processing 50000