# Анализ таблиц JDWH - Прод vs Тест

## Часть II. Таблицы и партиции.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Raise pandas' display limits so the wide/long comparison tables are not truncated.
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

## 1. Загрузка и первичное преобразование данных
(Это подготовка. Отчеты идут с раздела 2.)

### Прод

In [3]:
# Load the prod listing of tables and partitions from the CSV export.
prod = pd.read_csv(filepath_or_buffer='./data/tables&parts_prod_14122020.csv')

In [4]:
# Preview the first three rows of the loaded prod listing.
prod.head(3)

Unnamed: 0,table_schema,table_name,base_name,parts_count,part
0,dwh_bckp,amara_corr_overdues,amara_corr_overdues,1,1
1,dwh_bckp,amara_rating_fitch,amara_rating_fitch,1,1
2,dwh_bckp,amara_rating_fitch_bckp,amara_rating_fitch_bckp,1,1


Сгруппируем данные по table_schema, base_name (группировка по партициям)

In [5]:
# One row per (schema, base table); parts_count is the number of table_name
# entries (i.e. physical tables/partitions) that share that base name.
prod_gr = (
    prod
    .groupby(['table_schema', 'base_name'])
    .agg(parts_count=('table_name', 'count'))
)

In [6]:
# Move the group keys from the index back into ordinary columns (mutates prod_gr).
prod_gr.reset_index(inplace=True, level=['table_schema', 'base_name'])
prod_gr.iloc[:3]

Unnamed: 0,table_schema,base_name,parts_count
0,dwh_bckp,amara_corr_overdues,1
1,dwh_bckp,amara_rating_fitch,1
2,dwh_bckp,amara_rating_fitch_bckp,1


### Тест

In [7]:
# Load the test listing of tables and partitions from the CSV export.
test = pd.read_csv(filepath_or_buffer='./data/tables&parts_test_14122020.csv')

In [8]:
# Preview the first three rows of the loaded test listing.
test.head(3)

Unnamed: 0,table_schema,table_name,base_name,parts_count,part
0,dwh_bckp,amara_daughter_companies_bckp,amara_daughter_companies_bckp,1,1
1,dwh_bckp,amara_rating_fitch,amara_rating_fitch,1,1
2,dwh_bckp,amara_rating_fitch_bckp,amara_rating_fitch_bckp,1,1


Сгруппируем данные по table_schema, base_name (группировка по партициям)

In [9]:
# One row per (schema, base table); parts_count is the number of table_name
# entries (i.e. physical tables/partitions) that share that base name.
test_gr = (
    test
    .groupby(['table_schema', 'base_name'])
    .agg(parts_count=('table_name', 'count'))
)

In [10]:
# Move the group keys from the index back into ordinary columns (mutates test_gr).
test_gr.reset_index(inplace=True, level=['table_schema', 'base_name'])
test_gr.iloc[:3]

Unnamed: 0,table_schema,base_name,parts_count
0,dwh_bckp,amara_daughter_companies_bckp,1
1,dwh_bckp,amara_rating_fitch,1
2,dwh_bckp,amara_rating_fitch_bckp,1


## 2. Количество таблиц по схемам

### Прод

Все

In [11]:
# Rows per schema on prod, counting every partition as its own table.
prod['table_schema'].value_counts()

dwh_stage      1872
dwh_dm          895
dwh_bckp        763
dwh_dds         501
dwh_polygon     138
dwh_draft        25
dwh_dq           10
dwh_private       4
Name: table_schema, dtype: int64

С группировкой по партициям

In [12]:
# Base tables per schema on prod (partitions collapsed).
prod_gr['table_schema'].value_counts()

dwh_stage      299
dwh_dds        123
dwh_polygon    105
dwh_bckp        29
dwh_draft       25
dwh_dm          22
dwh_dq          10
dwh_private      4
Name: table_schema, dtype: int64

### Тест

Все

In [13]:
# Rows per schema on test, counting every partition as its own table.
test['table_schema'].value_counts()

dwh_stage      2267
dwh_dm         2219
dwh_bckp        768
dwh_dds         658
dwh_dq          377
dwh_polygon     187
dwh_rb           37
dwh_draft        29
Name: table_schema, dtype: int64

С группировкой по партициям

In [14]:
# Base tables per schema on test (partitions collapsed).
test_gr['table_schema'].value_counts()

dwh_stage      303
dwh_polygon    187
dwh_dds        123
dwh_rb          37
dwh_bckp        34
dwh_dm          30
dwh_draft       29
dwh_dq          10
Name: table_schema, dtype: int64

### Соединяем таблицы методом full outer join

In [15]:
# Full outer join of the per-table partition counts. A NaN in parts_count_prd /
# parts_count_tst means the table had no row on that side of the join.
joined_gr = pd.merge(
    prod_gr,
    test_gr,
    how='outer',
    on=['table_schema', 'base_name'],
    suffixes=('_prd', '_tst'),
)

Есть и на Прод и на Тест

In [16]:
# Keep the joined rows that carry a partition count from both prod and test.
has_prd_count = joined_gr['parts_count_prd'].notna()
has_tst_count = joined_gr['parts_count_tst'].notna()
tables_on_both = joined_gr[has_prd_count & has_tst_count]
tables_on_both['table_schema'].value_counts()

dwh_stage      288
dwh_dds        114
dwh_polygon     46
dwh_bckp        27
dwh_dm          22
dwh_draft       20
dwh_dq          10
Name: table_schema, dtype: int64

Есть на Прод, но нет на Тест

In [17]:
# Per-schema count of joined rows that have a prod count but no test count.
joined_gr.loc[
    joined_gr['parts_count_prd'].notna() & joined_gr['parts_count_tst'].isna(),
    'table_schema',
].value_counts()

dwh_polygon    59
dwh_stage      11
dwh_dds         9
dwh_draft       5
dwh_private     4
dwh_bckp        2
Name: table_schema, dtype: int64

Есть на Тест, но нет на Прод

In [18]:
# Per-schema count of joined rows that have a test count but no prod count.
joined_gr.loc[
    joined_gr['parts_count_prd'].isna() & joined_gr['parts_count_tst'].notna(),
    'table_schema',
].value_counts()

dwh_polygon    141
dwh_rb          37
dwh_stage       15
dwh_dds          9
dwh_draft        9
dwh_dm           8
dwh_bckp         7
Name: table_schema, dtype: int64

Таблицы в схеме DDS, которые есть на Тест, но нет на Прод

In [19]:
# Joined rows with a test count but no prod count, then narrowed to dwh_dds.
missing_on_prod = joined_gr['parts_count_prd'].isna()
present_on_test = joined_gr['parts_count_tst'].notna()
test_no_prod = joined_gr[missing_on_prod & present_on_test]
test_no_prod.loc[test_no_prod['table_schema'] == 'dwh_dds']

Unnamed: 0,table_schema,base_name,parts_count_prd,parts_count_tst
624,dwh_dds,acc_account_balance_tmp_dar_1,,1.0
625,dwh_dds,acc_account_balance_tmp_dar_2,,1.0
626,dwh_dds,acc_account_balance_tmp_dar_3,,1.0
627,dwh_dds,acc_account_change_dprt,,1.0
628,dwh_dds,dict_contact_type,,1.0
629,dwh_dds,hlp_dict_kp_prefs,,1.0
630,dwh_dds,ref_deal_deposit_test,,1.0
631,dwh_dds,ref_sc_issue_sсhedule,,1.0
632,dwh_dds,ref_sc_lot_ai,,1.0


## 3. Сравнение таблиц по партициям

In [20]:
# Keep only prod rows whose (schema, base table) is present in tables_on_both,
# then remove the columns that are not used in the partition comparison.
prod_for_join = (
    prod
    .merge(tables_on_both, how='inner', on=['table_schema', 'base_name'])
    .drop(columns=['part', 'parts_count_prd', 'parts_count_tst'])
)
prod_for_join.head(3)

Unnamed: 0,table_schema,table_name,base_name,parts_count
0,dwh_bckp,amara_rating_fitch,amara_rating_fitch,1
1,dwh_bckp,amara_rating_fitch_bckp,amara_rating_fitch_bckp,1
2,dwh_bckp,amara_rating_fitch_forecast,amara_rating_fitch_forecast,1


In [21]:
# Number of prod rows that remain after restricting to shared tables.
len(prod_for_join)

4085

In [22]:
# Remaining prod rows per schema.
prod_for_join['table_schema'].value_counts()

dwh_stage      1861
dwh_dm          895
dwh_bckp        761
dwh_dds         492
dwh_polygon      46
dwh_draft        20
dwh_dq           10
Name: table_schema, dtype: int64

In [23]:
# Keep only test rows whose (schema, base table) is present in tables_on_both,
# then remove the columns that are not used in the partition comparison.
test_for_join = (
    test
    .merge(tables_on_both, how='inner', on=['table_schema', 'base_name'])
    .drop(columns=['part', 'parts_count_prd', 'parts_count_tst'])
)
test_for_join.head(3)

Unnamed: 0,table_schema,table_name,base_name,parts_count
0,dwh_bckp,amara_rating_fitch,amara_rating_fitch,1
1,dwh_bckp,amara_rating_fitch_bckp,amara_rating_fitch_bckp,1
2,dwh_bckp,amara_rating_fitch_forecast,amara_rating_fitch_forecast,1


In [24]:
# Number of test rows that remain after restricting to shared tables.
len(test_for_join)

4793

In [25]:
# Remaining test rows per schema.
test_for_join['table_schema'].value_counts()

dwh_stage      1861
dwh_dm         1079
dwh_bckp        761
dwh_dds         649
dwh_dq          377
dwh_polygon      46
dwh_draft        20
Name: table_schema, dtype: int64

DWH_DM

In [26]:
# Narrow both sides to the dwh_dm schema.
prod_for_join_dm = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_dm']
test_for_join_dm = test_for_join.loc[test_for_join['table_schema'] == 'dwh_dm']

Проверка

In [27]:
# Sanity check: per base table, take the largest parts_count reported on prod.
prod_dm_gr = (
    prod_for_join_dm
    .groupby(['table_schema', 'base_name'])
    .agg(parts_count=('parts_count', 'max'))
)
prod_dm_gr.reset_index(inplace=True, level=['table_schema', 'base_name'])
prod_dm_gr.iloc[:3]

Unnamed: 0,table_schema,base_name,parts_count
0,dwh_dm,dm_700h_rep,1
1,dwh_dm,dm_client_corp,1
2,dwh_dm,dm_doc_payment,158


In [28]:
# Number of dwh_dm base tables on the prod side.
len(prod_dm_gr)

22

In [29]:
# Sanity check: per base table, take the largest parts_count reported on test.
test_dm_gr = (
    test_for_join_dm
    .groupby(['table_schema', 'base_name'])
    .agg(parts_count=('parts_count', 'max'))
)
test_dm_gr.reset_index(inplace=True, level=['table_schema', 'base_name'])
test_dm_gr.iloc[:3]

Unnamed: 0,table_schema,base_name,parts_count
0,dwh_dm,dm_700h_rep,1
1,dwh_dm,dm_client_corp,1
2,dwh_dm,dm_doc_payment,158


In [30]:
# Number of dwh_dm base tables on the test side.
len(test_dm_gr)

22

Попробуем соединить таблицы по имени (партициям)

In [31]:
# Outer join on the physical table (partition) name; rows matched on one side
# only get NaN in the other side's base_name/parts_count columns.
join_prod_test = pd.merge(
    prod_for_join_dm,
    test_for_join_dm,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
516,dwh_dm,dm_z10_1_prt_191_2022_08_23,dm_z10,211.0,dm_z10,211.0
569,dwh_dm,dm_z10_1_prt_50_2019_12_10,dm_z10,211.0,dm_z10,211.0
485,dwh_dm,dm_z10_1_prt_162_2022_02_01,dm_z10,211.0,dm_z10,211.0


Есть на Прод, но нет на Тест

In [32]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
168,dwh_dm,dm_transaction_1_prt_1,dm_transaction,246.0,,


Есть на Тест, но нет на Прод

In [33]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
895,dwh_dm,dm_profit_loss_1_prt_10,,,dm_profit_loss,149.0
896,dwh_dm,dm_profit_loss_1_prt_100,,,dm_profit_loss,149.0
897,dwh_dm,dm_profit_loss_1_prt_101,,,dm_profit_loss,149.0
898,dwh_dm,dm_profit_loss_1_prt_102,,,dm_profit_loss,149.0
899,dwh_dm,dm_profit_loss_1_prt_103,,,dm_profit_loss,149.0
900,dwh_dm,dm_profit_loss_1_prt_104,,,dm_profit_loss,149.0
901,dwh_dm,dm_profit_loss_1_prt_105,,,dm_profit_loss,149.0
902,dwh_dm,dm_profit_loss_1_prt_106,,,dm_profit_loss,149.0
903,dwh_dm,dm_profit_loss_1_prt_107,,,dm_profit_loss,149.0
904,dwh_dm,dm_profit_loss_1_prt_108,,,dm_profit_loss,149.0


In [34]:
# Test-only rows counted per base table.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna(),
    'base_name_tst',
].value_counts()

dm_profit_loss    148
dm_stock_lot       36
dm_transaction      1
Name: base_name_tst, dtype: int64

DWH_DDS

In [35]:
# Narrow both sides to the dwh_dds schema.
prod_for_join_dds = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_dds']
test_for_join_dds = test_for_join.loc[test_for_join['table_schema'] == 'dwh_dds']

In [36]:
# Outer join of the dwh_dds rows on the physical table (partition) name.
join_prod_test = pd.merge(
    prod_for_join_dds,
    test_for_join_dds,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
551,dwh_dds,acc_doc_payment_1_prt_152,,,acc_doc_payment,158
74,dwh_dds,acc_transaction_1_prt_15_2019_02_12,acc_transaction,368.0,acc_transaction,368
230,dwh_dds,acc_transaction_1_prt_292_2021_05_23,acc_transaction,368.0,acc_transaction,368


Есть на Прод, но нет на Тест

In [37]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


Есть на Тест, но нет на Прод

In [38]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
492,dwh_dds,acc_doc_payment_1_prt_1,,,acc_doc_payment,158
493,dwh_dds,acc_doc_payment_1_prt_10,,,acc_doc_payment,158
494,dwh_dds,acc_doc_payment_1_prt_100,,,acc_doc_payment,158
495,dwh_dds,acc_doc_payment_1_prt_101,,,acc_doc_payment,158
496,dwh_dds,acc_doc_payment_1_prt_102,,,acc_doc_payment,158
497,dwh_dds,acc_doc_payment_1_prt_103,,,acc_doc_payment,158
498,dwh_dds,acc_doc_payment_1_prt_104,,,acc_doc_payment,158
499,dwh_dds,acc_doc_payment_1_prt_105,,,acc_doc_payment,158
500,dwh_dds,acc_doc_payment_1_prt_106,,,acc_doc_payment,158
501,dwh_dds,acc_doc_payment_1_prt_107,,,acc_doc_payment,158


In [39]:
# Test-only rows counted per base table.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna(),
    'base_name_tst',
].value_counts()

acc_doc_payment    157
Name: base_name_tst, dtype: int64

DWH_BCKP

In [40]:
# Narrow both sides to the dwh_bckp schema.
prod_for_join_bckp = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_bckp']
test_for_join_bckp = test_for_join.loc[test_for_join['table_schema'] == 'dwh_bckp']

In [41]:
# Outer join of the dwh_bckp rows on the physical table (partition) name.
join_prod_test = pd.merge(
    prod_for_join_bckp,
    test_for_join_bckp,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
703,dwh_bckp,s01_t_dea_bkp_1_prt_49_2019_05_25,s01_t_dea_bkp,368,s01_t_dea_bkp,368
548,dwh_bckp,s01_t_dea_bkp_1_prt_239_2020_12_15,s01_t_dea_bkp,368,s01_t_dea_bkp,368
365,dwh_bckp,s01_led_acc_det_bkp_1_prt_76_2019_08_14,s01_led_acc_det_bkp,368,s01_led_acc_det_bkp,368


Есть на Прод, но нет на Тест

In [42]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


Есть на Тест, но нет на Прод

In [43]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


Партиции на Прод и Тест одни и те же

DWH_STAGE

In [44]:
# Narrow both sides to the dwh_stage schema.
prod_for_join_stage = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_stage']
test_for_join_stage = test_for_join.loc[test_for_join['table_schema'] == 'dwh_stage']

In [45]:
# Outer join of the dwh_stage rows on the physical table (partition) name.
join_prod_test = pd.merge(
    prod_for_join_stage,
    test_for_join_stage,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
2131,dwh_stage,s01_t_trndtl_1_prt_342_2022_10_20,,,s01_t_trndtl,368.0
1933,dwh_stage,s01_t_trndtl_1_prt_164_2021_05_04,,,s01_t_trndtl,368.0
1782,dwh_stage,s02_gl_macro_1_prt_74_2019_08_08,s02_gl_macro,368.0,s02_gl_macro,368.0


Есть на Прод, но нет на Тест

In [46]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
1045,dwh_stage,s01_t_trndtl_1_prt_100_2019_10_25,s01_t_trndtl,368.0,,
1046,dwh_stage,s01_t_trndtl_1_prt_101_2019_10_28,s01_t_trndtl,368.0,,
1047,dwh_stage,s01_t_trndtl_1_prt_10_2019_01_28,s01_t_trndtl,368.0,,
1048,dwh_stage,s01_t_trndtl_1_prt_102_2019_10_31,s01_t_trndtl,368.0,,
1049,dwh_stage,s01_t_trndtl_1_prt_103_2019_11_03,s01_t_trndtl,368.0,,
1050,dwh_stage,s01_t_trndtl_1_prt_104_2019_11_06,s01_t_trndtl,368.0,,
1051,dwh_stage,s01_t_trndtl_1_prt_105_2019_11_09,s01_t_trndtl,368.0,,
1052,dwh_stage,s01_t_trndtl_1_prt_106_2019_11_12,s01_t_trndtl,368.0,,
1053,dwh_stage,s01_t_trndtl_1_prt_107_2019_11_15,s01_t_trndtl,368.0,,
1054,dwh_stage,s01_t_trndtl_1_prt_108_2019_11_18,s01_t_trndtl,368.0,,


In [47]:
# Prod-only rows counted per base table.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna(),
    'base_name_prd',
].value_counts()

s01_t_trndtl    366
Name: base_name_prd, dtype: int64

Есть на Тест, но нет на Прод

In [48]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
1861,dwh_stage,s01_t_trndtl_1_prt_100_2020_10_24,,,s01_t_trndtl,368.0
1862,dwh_stage,s01_t_trndtl_1_prt_101_2020_10_27,,,s01_t_trndtl,368.0
1863,dwh_stage,s01_t_trndtl_1_prt_10_2020_01_28,,,s01_t_trndtl,368.0
1864,dwh_stage,s01_t_trndtl_1_prt_102_2020_10_30,,,s01_t_trndtl,368.0
1865,dwh_stage,s01_t_trndtl_1_prt_103_2020_11_02,,,s01_t_trndtl,368.0
1866,dwh_stage,s01_t_trndtl_1_prt_104_2020_11_05,,,s01_t_trndtl,368.0
1867,dwh_stage,s01_t_trndtl_1_prt_105_2020_11_08,,,s01_t_trndtl,368.0
1868,dwh_stage,s01_t_trndtl_1_prt_106_2020_11_11,,,s01_t_trndtl,368.0
1869,dwh_stage,s01_t_trndtl_1_prt_107_2020_11_14,,,s01_t_trndtl,368.0
1870,dwh_stage,s01_t_trndtl_1_prt_108_2020_11_17,,,s01_t_trndtl,368.0


In [49]:
# Test-only rows counted per base table.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna(),
    'base_name_tst',
].value_counts()

s01_t_trndtl    366
Name: base_name_tst, dtype: int64

DWH_DQ

In [50]:
# Narrow both sides to the dwh_dq schema.
prod_for_join_dq = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_dq']
test_for_join_dq = test_for_join.loc[test_for_join['table_schema'] == 'dwh_dq']

In [51]:
# Outer join of the dwh_dq rows on the physical table (partition) name.
join_prod_test = pd.merge(
    prod_for_join_dq,
    test_for_join_dq,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
313,dwh_dq,colvir_balance_1_prt_42_2019_05_04,,,colvir_balance,368
202,dwh_dq,colvir_balance_1_prt_272_2021_03_24,,,colvir_balance,368
235,dwh_dq,colvir_balance_1_prt_302_2021_06_22,,,colvir_balance,368


Есть на Прод, но нет на Тест

In [52]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


Есть на Тест, но нет на Прод

In [53]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
10,dwh_dq,colvir_balance_1_prt_100_2019_10_25,,,colvir_balance,368
11,dwh_dq,colvir_balance_1_prt_101_2019_10_28,,,colvir_balance,368
12,dwh_dq,colvir_balance_1_prt_10_2019_01_28,,,colvir_balance,368
13,dwh_dq,colvir_balance_1_prt_102_2019_10_31,,,colvir_balance,368
14,dwh_dq,colvir_balance_1_prt_103_2019_11_03,,,colvir_balance,368
15,dwh_dq,colvir_balance_1_prt_104_2019_11_06,,,colvir_balance,368
16,dwh_dq,colvir_balance_1_prt_105_2019_11_09,,,colvir_balance,368
17,dwh_dq,colvir_balance_1_prt_106_2019_11_12,,,colvir_balance,368
18,dwh_dq,colvir_balance_1_prt_107_2019_11_15,,,colvir_balance,368
19,dwh_dq,colvir_balance_1_prt_108_2019_11_18,,,colvir_balance,368


In [54]:
# Test-only rows counted per base table.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna(),
    'base_name_tst',
].value_counts()

colvir_balance    367
Name: base_name_tst, dtype: int64

DWH_POLYGON

In [55]:
# Narrow both sides to the dwh_polygon schema.
prod_for_join_polygon = prod_for_join.loc[prod_for_join['table_schema'] == 'dwh_polygon']
test_for_join_polygon = test_for_join.loc[test_for_join['table_schema'] == 'dwh_polygon']

In [56]:
# Outer join of the dwh_polygon rows on the physical table (partition) name.
join_prod_test = pd.merge(
    prod_for_join_polygon,
    test_for_join_polygon,
    how='outer',
    on=['table_schema', 'table_name'],
    suffixes=('_prd', '_tst'),
)
join_prod_test.sample(3)

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst
33,dwh_polygon,ref_payment_card_dar,ref_payment_card_dar,1,ref_payment_card_dar,1
3,dwh_polygon,acc_doc_temp,acc_doc_temp,1,acc_doc_temp,1
41,dwh_polygon,s01_t_procmem_bck_160720,s01_t_procmem_bck_160720,1,s01_t_procmem_bck_160720,1


Есть на Прод, но нет на Тест

In [57]:
# Rows with a prod-side base name but no test-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].notna() & join_prod_test['base_name_tst'].isna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


Есть на Тест, но нет на Прод

In [58]:
# Rows with a test-side base name but no prod-side base name.
join_prod_test.loc[
    join_prod_test['base_name_prd'].isna() & join_prod_test['base_name_tst'].notna()
]

Unnamed: 0,table_schema,table_name,base_name_prd,parts_count_prd,base_name_tst,parts_count_tst


На Прод и Тест одинаково

## Часть III. Индексы.

## 1. Загрузка и первичное преобразование данных
### Здесь загрузка идет заново, только из таблиц, где есть индексы

### Прод

In [59]:
# Load the prod index listing. NOTE: this rebinds `prod`, replacing the
# tables/partitions frame loaded earlier in the notebook.
prod = pd.read_csv(filepath_or_buffer='./data/indexes_prod_14122020.csv')

In [60]:
# Preview the first three rows of the prod index listing.
prod.head(3)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef
0,dwh_bckp,amara_corr_overdues,amara_corr_overdues,pk_bckp_amara_corr_overdues,1,1,CREATE UNIQUE INDEX pk_bckp_amara_corr_overdue...
1,dwh_bckp,amara_rating_fitch,amara_rating_fitch,pk_bckp_amara_rating_fitch,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...
2,dwh_bckp,amara_rating_fitch_forecast,amara_rating_fitch_forecast,pk_bckp_amara_rating_fitch_forecast,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...


Добавим поле full_table_name

In [61]:
# Schema-qualified table name in the form "<schemaname>.<tablename>".
prod['full_table_name'] = prod['schemaname'].add('.').add(prod['tablename'])

Добавляем тип индекса и его короткое определение

In [62]:
# short_idx_def: the text that follows the first 'USING' in the index DDL,
# e.g. "btree (id, insert_date)".
prod['short_idx_def'] = prod['indexdef'].map(lambda ddl: ddl.split('USING')[1].strip())
# idx_type: whatever precedes the first "(" of the short definition.
prod['idx_type'] = prod['short_idx_def'].map(lambda short_def: short_def.split('(')[0].strip())
# ind_fields: text between the first "(" and the next ")".
# NOTE(review): a definition with nested parentheses would be cut at the inner "(".
prod['ind_fields'] = prod['short_idx_def'].map(
    lambda short_def: short_def.split('(')[1].split(')')[0].strip()
)

In [63]:
# Check the newly derived columns on the first three rows.
prod.head(3)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef,full_table_name,short_idx_def,idx_type,ind_fields
0,dwh_bckp,amara_corr_overdues,amara_corr_overdues,pk_bckp_amara_corr_overdues,1,1,CREATE UNIQUE INDEX pk_bckp_amara_corr_overdue...,dwh_bckp.amara_corr_overdues,"btree (id, insert_date)",btree,"id, insert_date"
1,dwh_bckp,amara_rating_fitch,amara_rating_fitch,pk_bckp_amara_rating_fitch,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...,dwh_bckp.amara_rating_fitch,"btree (id, insert_date)",btree,"id, insert_date"
2,dwh_bckp,amara_rating_fitch_forecast,amara_rating_fitch_forecast,pk_bckp_amara_rating_fitch_forecast,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...,dwh_bckp.amara_rating_fitch_forecast,"btree (id, insert_date)",btree,"id, insert_date"


Убираем партиции

In [64]:
# Keep only the index rows whose tablename equals base_name, i.e. drop the rows
# that describe partitions. The schema prefix that used to be concatenated onto
# both sides of the comparison was identical on each side, so the bare names
# are compared directly.
# .copy() detaches the result from the unfiltered frame so that later in-place
# modifications of `prod` do not trigger SettingWithCopyWarning.
prod = prod[prod['tablename'] == prod['base_name']].copy()
prod.sample(3)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef,full_table_name,short_idx_def,idx_type,ind_fields
3888,dwh_dds,ref_sc_identcode,ref_sc_identcode,ref_sc_identcode_tmp_v2_pk,6,1,CREATE UNIQUE INDEX ref_sc_identcode_tmp_v2_pk...,dwh_dds.ref_sc_identcode,btree (scidc_id),btree,scidc_id
7956,dwh_stage,s03_ekko,s03_ekko,s03_ekko_ebeln_idx,2,1,CREATE INDEX s03_ekko_ebeln_idx ON dwh_stage.s...,dwh_stage.s03_ekko,btree (ebeln),btree,ebeln
5315,dwh_stage,s01_i_agnrtg,s01_i_agnrtg,pk_s01_i_agnrtg,1,1,CREATE UNIQUE INDEX pk_s01_i_agnrtg ON dwh_sta...,dwh_stage.s01_i_agnrtg,btree (id),btree,id


==============================================================================================================

### Тест

In [65]:
# Load the test index listing. NOTE: this rebinds `test`, replacing the
# tables/partitions frame loaded earlier in the notebook.
test = pd.read_csv(filepath_or_buffer='./data/indexes_test_14122020.csv')

In [66]:
# Preview the first three rows of the test index listing.
test.head(3)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef
0,dwh_bckp,amara_rating_fitch,amara_rating_fitch,pk_bckp_amara_rating_fitch,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...
1,dwh_bckp,amara_rating_fitch_forecast,amara_rating_fitch_forecast,pk_bckp_amara_rating_fitch_forecast,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...
2,dwh_bckp,amara_rating_fitch_short,amara_rating_fitch_short,pk_bckp_amara_rating_fitch_short,1,1,CREATE UNIQUE INDEX pk_bckp_amara_rating_fitch...


Добавим поле full_table_name

In [67]:
# Schema-qualified table name in the form "<schemaname>.<tablename>".
test['full_table_name'] = test['schemaname'].add('.').add(test['tablename'])

Добавляем тип индекса и его короткое определение

In [68]:
# short_idx_def: the text that follows the first 'USING' in the index DDL,
# e.g. "btree (id, insert_date)".
test['short_idx_def'] = test['indexdef'].map(lambda ddl: ddl.split('USING')[1].strip())
# idx_type: whatever precedes the first "(" of the short definition.
test['idx_type'] = test['short_idx_def'].map(lambda short_def: short_def.split('(')[0].strip())
# ind_fields: text between the first "(" and the next ")".
# NOTE(review): a definition with nested parentheses would be cut at the inner "(".
test['ind_fields'] = test['short_idx_def'].map(
    lambda short_def: short_def.split('(')[1].split(')')[0].strip()
)

In [69]:
# Spot-check three random rows, including the newly derived columns.
test.sample(3)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef,full_table_name,short_idx_def,idx_type,ind_fields
4070,dwh_dds,dict_trn_operation,dict_trn_operation,dict_trn_operation_pk,1,1,CREATE UNIQUE INDEX dict_trn_operation_pk ON d...,dwh_dds.dict_trn_operation,btree (trnoprn_cd),btree,trnoprn_cd
4823,dwh_dm,dm_profit_loss,dm_profit_loss_1_prt_144,dm_profit_loss_1_prt_144_dm_pnl$report_date_idx,5,1,"CREATE INDEX ""dm_profit_loss_1_prt_144_dm_pnl$...",dwh_dm.dm_profit_loss_1_prt_144,"bitmap (""dm_pnl$report_date"")",bitmap,"""dm_pnl$report_date"""
4368,dwh_dm,dm_doc_payment,dm_doc_payment_1_prt_149,dm_doc_payment_1_prt_149_dm_docp$report_date_idx,2,1,"CREATE INDEX ""dm_doc_payment_1_prt_149_dm_docp...",dwh_dm.dm_doc_payment_1_prt_149,"bitmap (""dm_docp$report_date"")",bitmap,"""dm_docp$report_date"""


Убираем партиции

In [70]:
# Keep only the index rows whose tablename equals base_name, i.e. drop the rows
# that describe partitions. The schema prefix that used to be concatenated onto
# both sides of the comparison was identical on each side, so the bare names
# are compared directly.
# .copy() detaches the result from the unfiltered frame so that later in-place
# modifications of `test` do not trigger SettingWithCopyWarning.
test = test[test['tablename'] == test['base_name']].copy()
test.sample(10)

Unnamed: 0,schemaname,base_name,tablename,indexname,inds_count,ind,indexdef,full_table_name,short_idx_def,idx_type,ind_fields
4067,dwh_dds,dict_service_pack,dict_service_pack,dict_service_pack_pk,1,1,CREATE UNIQUE INDEX dict_service_pack_pk ON dw...,dwh_dds.dict_service_pack,btree (srvp_cd),btree,srvp_cd
6690,dwh_stage,s01_m_cshordexc,s01_m_cshordexc,pk_m_cshordexc,1,1,CREATE UNIQUE INDEX pk_m_cshordexc ON dwh_stag...,dwh_stage.s01_m_cshordexc,btree (id),btree,id
6650,dwh_stage,s01_g_identdocdsc_std,s01_g_identdocdsc_std,s01_g_identdocdsc_brn_name,3,3,CREATE INDEX s01_g_identdocdsc_brn_name ON dwh...,dwh_stage.s01_g_identdocdsc_std,"btree (brn_id, name)",btree,"brn_id, name"
4254,dwh_dm,dm_all_matrix,dm_all_matrix,dm_all_matrix_dm_allmtx$report_date_IDX,1,1,"CREATE INDEX ""dm_all_matrix_dm_allmtx$report_d...",dwh_dm.dm_all_matrix,"bitmap (""dm_allmtx$report_date"", dm_allmtx_rep...",bitmap,"""dm_allmtx$report_date"", dm_allmtx_report_name"
4113,dwh_dds,ref_deal_forex,ref_deal_forex,ref_deal_forex_dlfx_bonus_crncy_cd_idx,16,1,CREATE INDEX ref_deal_forex_dlfx_bonus_crncy_c...,dwh_dds.ref_deal_forex,btree (dlfx_bonus_crncy_cd),btree,dlfx_bonus_crncy_cd
6578,dwh_stage,s01_c_imefrmtyp_std,s01_c_imefrmtyp_std,pk_s01_c_imefrmtyp_std,1,1,CREATE UNIQUE INDEX pk_s01_c_imefrmtyp_std ON ...,dwh_stage.s01_c_imefrmtyp_std,btree (id),btree,id
6649,dwh_stage,s01_g_identdocdsc_std,s01_g_identdocdsc_std,s01_g_identdocdsc_brn_code,3,2,CREATE INDEX s01_g_identdocdsc_brn_code ON dwh...,dwh_stage.s01_g_identdocdsc_std,"btree (brn_id, code)",btree,"brn_id, code"
6760,dwh_stage,s01_t_arlclc,s01_t_arlclc,s01_t_arlclc_id_pk,2,2,CREATE UNIQUE INDEX s01_t_arlclc_id_pk ON dwh_...,dwh_stage.s01_t_arlclc,btree (id),btree,id
2233,dwh_dds,acc_doc_payment,acc_doc_payment,acc_doc_payment_docp_gid_idx,2,1,CREATE UNIQUE INDEX acc_doc_payment_docp_gid_i...,dwh_dds.acc_doc_payment,btree (docp_gid),btree,docp_gid
6544,dwh_draft,amara_rating_fitch_short,amara_rating_fitch_short,pk_amara_rating_fitch_short,1,1,CREATE UNIQUE INDEX pk_amara_rating_fitch_shor...,dwh_draft.amara_rating_fitch_short,btree (id),btree,id


In [71]:
# Row count, dtypes and null counts of the filtered prod index frame.
prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581 entries, 0 to 7959
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   schemaname       581 non-null    object
 1   base_name        581 non-null    object
 2   tablename        581 non-null    object
 3   indexname        581 non-null    object
 4   inds_count       581 non-null    int64 
 5   ind              581 non-null    int64 
 6   indexdef         581 non-null    object
 7   full_table_name  581 non-null    object
 8   short_idx_def    581 non-null    object
 9   idx_type         581 non-null    object
 10  ind_fields       581 non-null    object
dtypes: int64(2), object(9)
memory usage: 54.5+ KB


In [72]:
# Row count, dtypes and null counts of the filtered test index frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 594 entries, 0 to 11540
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   schemaname       594 non-null    object
 1   base_name        594 non-null    object
 2   tablename        594 non-null    object
 3   indexname        594 non-null    object
 4   inds_count       594 non-null    int64 
 5   ind              594 non-null    int64 
 6   indexdef         594 non-null    object
 7   full_table_name  594 non-null    object
 8   short_idx_def    594 non-null    object
 9   idx_type         594 non-null    object
 10  ind_fields       594 non-null    object
dtypes: int64(2), object(9)
memory usage: 55.7+ KB


### Сверяем только те таблицы, что есть и на Прод и на Тест

Удалим лишние столбцы

Прод

In [73]:
# Remove the helper columns that the comparison does not use. The name is
# rebound instead of using inplace=True: `prod` was produced by boolean
# filtering, and mutating such a slice in place makes pandas emit
# SettingWithCopyWarning.
prod = prod.drop(columns=['base_name', 'indexdef', 'short_idx_def'])
prod.sample(3)

Unnamed: 0,schemaname,tablename,indexname,inds_count,ind,full_table_name,idx_type,ind_fields
3891,dwh_dds,ref_sc_identcode,ref_sc_identcode_tmp_v2_scidc_sccd_cd_idx,6,4,dwh_dds.ref_sc_identcode,btree,scidc_sccd_cd
3862,dwh_dds,ref_gen_agreement,ref_gen_agreement_gnagr_usr_gid_idx,7,5,dwh_dds.ref_gen_agreement,btree,gnagr_usr_gid
5374,dwh_stage,s01_s_ordpay,s01_s_ordpay_dval_idx,2,1,dwh_stage.s01_s_ordpay,btree,dval


Тест

In [74]:
# Remove the helper columns that the comparison does not use. The name is
# rebound instead of using inplace=True: `test` was produced by boolean
# filtering, and mutating such a slice in place makes pandas emit
# SettingWithCopyWarning.
test = test.drop(columns=['base_name', 'indexdef', 'short_idx_def'])
test.sample(3)

Unnamed: 0,schemaname,tablename,indexname,inds_count,ind,full_table_name,idx_type,ind_fields
5318,dwh_dm,dm_sb7,dm_sb7$report_date_bmx,2,1,dwh_dm.dm_sb7,bitmap,report_date
6724,dwh_stage,s01_s_trfgrp,s01_s_trfgrp_pk,1,1,dwh_stage.s01_s_trfgrp,btree,id
4037,dwh_dds,dict_card_type,dict_crdtp_pk,1,1,dwh_dds.dict_card_type,btree,crdtp_cd


### Выравниваем Прод и Тест

In [75]:
# Single-column frame holding the qualified table name of every prod index row.
prod2 = prod.loc[:, ['full_table_name']]
prod2.head()

Unnamed: 0,full_table_name
0,dwh_bckp.amara_corr_overdues
1,dwh_bckp.amara_rating_fitch
2,dwh_bckp.amara_rating_fitch_forecast
3,dwh_bckp.amara_rating_fitch_short
4,dwh_bckp.amara_rating_hist


In [76]:
# Single-column frame holding the qualified table name of every test index row.
test2 = test.loc[:, ['full_table_name']]
test2.head()

Unnamed: 0,full_table_name
0,dwh_bckp.amara_rating_fitch
1,dwh_bckp.amara_rating_fitch_forecast
2,dwh_bckp.amara_rating_fitch_short
3,dwh_bckp.amara_rating_hist
4,dwh_bckp.amara_rating_moodys


In [77]:
# Row count and dtype of the single-column test frame.
test2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 594 entries, 0 to 11540
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   full_table_name  594 non-null    object
dtypes: object(1)
memory usage: 9.3+ KB


Готовим список нужных имен таблиц (которые есть и на Прод и на Тест)

In [78]:
# Single-column frame of prod table names (same selection as the earlier prod2 cell).
prod2 = prod.loc[:, ['full_table_name']]
prod2.head(2)

Unnamed: 0,full_table_name
0,dwh_bckp.amara_corr_overdues
1,dwh_bckp.amara_rating_fitch


In [79]:
# Single-column frame of test table names (same selection as the earlier test2 cell).
test2 = test.loc[:, ['full_table_name']]
test2.head(2)

Unnamed: 0,full_table_name
0,dwh_bckp.amara_rating_fitch
1,dwh_bckp.amara_rating_fitch_forecast


In [80]:
# Inner join on the qualified table name: only names present in both frames
# survive. Names that repeat on either side yield one row per matching pair.
prod2_test2_merge = pd.merge(prod2, test2, how='inner', on='full_table_name')
prod2_test2_merge.head(2)

Unnamed: 0,full_table_name
0,dwh_bckp.amara_rating_fitch
1,dwh_bckp.amara_rating_fitch_forecast


In [81]:
# These tables exist on both prod and test; all the others will be removed.
# value_counts() puts the most frequent names first in the resulting list.
not_to_delete = prod2_test2_merge['full_table_name'].value_counts().index.tolist()

In [82]:
not_to_delete[:10]  # first 10 elements

['dwh_dds.ref_sc_issue',
 'dwh_dds.ref_deal_forex',
 'dwh_dds.ref_sc_lot',
 'dwh_dds.ref_sc_ticket',
 'dwh_dds.ref_account',
 'dwh_dds.ref_deal_scissue',
 'dwh_dds.ref_gen_agreement',
 'dwh_dds.acc_sc_quote',
 'dwh_dds.ref_sc_rating',
 'dwh_dds.ref_pledge_object']

Теперь удаляем строки в Прод и Тест, не входящие в список по полю full_table_name

Прод

In [83]:
# Remove prod rows whose table is not in the shared-table list.
# `indices` (labels of the rows being removed) is kept as a name in case later
# cells read it. The frame is rebound rather than dropped in place, because
# `prod` originates from boolean filtering and an in-place drop on such a
# slice makes pandas emit SettingWithCopyWarning.
indices = prod.index[~prod['full_table_name'].isin(not_to_delete)]
prod = prod.drop(index=indices)

Тест

In [84]:
# Remove test rows whose table is not in the shared-table list.
# `indices` (labels of the rows being removed) is kept as a name in case later
# cells read it. The frame is rebound rather than dropped in place, because
# `test` originates from boolean filtering and an in-place drop on such a
# slice makes pandas emit SettingWithCopyWarning.
indices = test.index[~test['full_table_name'].isin(not_to_delete)]
test = test.drop(index=indices)

### Теперь наборы данных Прод и Тест содержат одинаковые таблицы и готовы к сравнению индексов

In [85]:
# Number of prod index rows left after keeping only shared tables.
len(prod)

573

In [86]:
# Number of test index rows left after keeping only shared tables.
len(test)

566

Сравнение по количеству

In [87]:
# Accumulator filled by check_indexex_quantity with the names of tables whose
# index counts differ; re-running this cell empties it.
table_names = set()

In [88]:
def check_indexex_quantity(row):
    """Record a table whose index count differs from the one seen on test.

    Selects the rows of the global ``test`` frame whose ``full_table_name``
    equals ``row['full_table_name']``, takes the largest ``inds_count`` among
    them, and adds the table name to the global ``table_names`` set when that
    value is not equal to ``row['inds_count']``.

    Returns None; the function is called only for its side effect.
    """
    full_name = row['full_table_name']
    same_table_on_test = test['full_table_name'] == full_name
    test_inds_count = test.loc[same_table_on_test, 'inds_count'].max()

    if row['inds_count'] != test_inds_count:
        table_names.add(full_name)

In [89]:
# Run the check once per prod row; the returned Series holds only None because
# the function works through its side effect on table_names.
prod.apply(check_indexex_quantity, axis=1)

1       None
2       None
3       None
4       None
5       None
        ... 
7955    None
7956    None
7957    None
7958    None
7959    None
Length: 573, dtype: object

In [90]:
# Tables for which the prod inds_count differed from the maximum on test.
table_names

{'dwh_dds.acc_doc_payment',
 'dwh_dds.ref_anl_account',
 'dwh_dds.ref_deal_scissue',
 'dwh_dds.ref_interbank_deals',
 'dwh_dds.ref_sc_issue',
 'dwh_dds.ref_sc_rating',
 'dwh_dds.ref_sc_ticket',
 'dwh_dm.dm_transaction',
 'dwh_stage.s01_ansignval',
 'dwh_stage.s01_g_accblnhst',
 'dwh_stage.s01_s_ordcash_add',
 'dwh_stage.s01_t_deashdhst',
 'dwh_stage.s01_t_operjrn'}