In [1]:
import pandas as pd
import matplotlib.pylab as plt
from matplotlib.patches import Ellipse
import matplotlib.lines as lines
from matplotlib import cm
import numpy as np

from sqlalchemy import create_engine, event
import urllib.parse

# Inputs

In [2]:
# SQL database
server = 'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
odbc_connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes'

sql_table_0 = "linkedclaims_casuals_2018m04_v2_flat_20140101_with_income_with_edu_36Vars__7BGM_full_clusters" 
sql_table_1 = "linkedclaims_casuals_2018m04_v2_flat_20150101_with_income_36Vars__7BGM_full_clusters"
sql_table_2 = "linkedclaims_casuals_2018m04_v2_flat_20160101_with_income_36Vars__7BGM_full_clusters"
sql_tables_all = [sql_table_0, sql_table_1, sql_table_2] # list of all dataset provided

# Procedure

In [3]:
def read_data_from_sql(sql_table):
    # Connect to SQL
    params = urllib.parse.quote_plus(odbc_connection_string)
    engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
    conn = engine.connect().connection

    @event.listens_for(engine, 'before_cursor_execute')
    def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
        if executemany:
            cursor.fast_executemany = True

    df = pd.read_sql_table(sql_table, engine)
    conn.close()
    return df

for i in range(0, len(sql_tables_all)-1):
    data0 = read_data_from_sql(sql_tables_all[i])
    data0.rename(columns={'cluster':'from_cluster'}, inplace=True)
    
    stability_matrices = []
    for n in range(i+1, len(sql_tables_all)):
        data1 = read_data_from_sql(sql_tables_all[n])
        data1.rename(columns={'cluster':'to_cluster'}, inplace=True)

        data = pd.merge(data0, data1, on='ppsn', how='left')

        clusters = sorted(data0.from_cluster.unique())
        st_matrix = pd.DataFrame()
        for c in clusters:
            res = data.loc[data.from_cluster==c, 'to_cluster'].value_counts(dropna=False)
            res = pd.DataFrame(res)
            res = res.transpose()
            res.index = ['from_cluster_' + str(c)]
            st_matrix = st_matrix.append(res)

        row_sum = np.sum(st_matrix, axis = 1)
        for col in st_matrix.columns:
            st_matrix[col] = st_matrix[col]/row_sum
        st_matrix['Total'] = row_sum

        stability_matrices.append(st_matrix)
        
    for sm_index in range(len(stability_matrices)):
        print ('Comparison:')
        print ('\n from: %s' %(sql_tables_all[i]))
        print ('\n to: %s' %(sql_tables_all[sm_index+1+i]))
        display(stability_matrices[sm_index])
    

Comparison:

 from: linkedclaims_casuals_2018m04_v2_flat_20140101_with_income_with_edu_36Vars__7BGM_full_clusters

 to: linkedclaims_casuals_2018m04_v2_flat_20150101_with_income_36Vars__7BGM_full_clusters


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,nan,Total
from_cluster_0,0.655914,0.000574,0.016197,0.001228,0.05123,,0.00138,0.273478,125396.0
from_cluster_1,0.061312,0.002949,0.000177,0.010704,0.00895,0.467482,0.014761,0.433665,147148.0
from_cluster_2,0.014523,0.001218,0.000683,0.021472,0.133884,0.393217,0.015117,0.419887,33671.0
from_cluster_3,0.000102,0.004753,0.178557,0.046453,,0.000818,0.362837,0.40648,19568.0
from_cluster_4,0.018517,0.00358,0.054378,0.049734,0.005259,0.058797,0.345399,0.464336,17875.0
from_cluster_5,0.054821,0.466955,0.009321,0.002264,0.010796,0.037759,0.010954,0.40713,18989.0
from_cluster_6,0.022769,0.009343,0.438279,0.000765,0.003029,0.011926,0.197296,0.316592,31359.0


Comparison:

 from: linkedclaims_casuals_2018m04_v2_flat_20140101_with_income_with_edu_36Vars__7BGM_full_clusters

 to: linkedclaims_casuals_2018m04_v2_flat_20160101_with_income_36Vars__7BGM_full_clusters


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,nan,Total
from_cluster_0,0.469321,0.001045,0.02197,0.002392,0.097228,,0.005335,0.402708,125396.0
from_cluster_1,0.073246,0.003609,0.0021,0.010214,0.013578,0.30328,0.018974,0.574999,147148.0
from_cluster_2,0.034095,0.002138,0.001307,0.019186,0.004989,0.350628,0.022839,0.564818,33671.0
from_cluster_3,0.000204,0.007052,0.145135,0.021208,0.000102,0.000409,0.292263,0.533626,19568.0
from_cluster_4,0.040336,0.004196,0.043245,0.02372,0.011413,0.105119,0.172923,0.599049,17875.0
from_cluster_5,0.066776,0.267049,0.015219,0.002475,0.01701,0.05024,0.015061,0.56617,18989.0
from_cluster_6,0.05166,0.014318,0.309544,0.001945,0.016136,0.027074,0.121783,0.45754,31359.0


Comparison:

 from: linkedclaims_casuals_2018m04_v2_flat_20150101_with_income_36Vars__7BGM_full_clusters

 to: linkedclaims_casuals_2018m04_v2_flat_20160101_with_income_36Vars__7BGM_full_clusters


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,nan,Total
from_cluster_0,0.642571,0.000823,0.01208,0.000877,0.051123,9e-06,0.000779,0.291738,111751
from_cluster_1,0.040766,0.463652,0.011611,0.001355,0.008127,0.0318,0.01045,0.432239,15503
from_cluster_2,0.052586,0.008096,0.574358,0.003329,0.006621,0.000454,0.041804,0.312753,26433
from_cluster_3,0.024952,0.005194,0.018404,0.229649,0.013323,0.083324,0.134696,0.49046,8857
from_cluster_4,0.050751,0.001124,0.002772,0.013746,0.299674,0.191281,0.010712,0.429941,26699
from_cluster_5,0.045278,0.002247,0.000111,0.009463,0.007639,0.471843,0.012059,0.451359,134835
from_cluster_6,0.008907,0.005446,0.043572,0.006278,0.007486,0.053633,0.411676,0.463002,37272
