<a href="https://colab.research.google.com/github/flying-bear/2018-course-poster/blob/master/hierarchy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import sklearn
import scipy.stats
import time

from collections import Counter, Iterable
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from tqdm.notebook import tqdm

In [0]:
SEED = 42

In [0]:
tf_idf_lsa_50 = pd.read_csv('/content/drive/My Drive/job/Data/tf_idf_lsa_50000.csv', index_col=0)

In [0]:
tf_idf_lsa_50 = tf_idf_lsa_50.drop(['BIRTH_DATE'], axis=1)

In [0]:
tgt_50 = pd.read_csv('/content/drive/My Drive/job/Data/tgt_50000.csv', index_col=0)

In [0]:
diagnoses = pd.read_csv('/content/drive/My Drive/job/Data/data.csv', index_col=0, usecols=['ID_EMIAS', 'Основной диагноз']).dropna()

In [26]:
uniques = np.unique(diagnoses['Основной диагноз'])
uniques[:10]

array(['B95.7 - Другие стафилококки как причина болезней, классифицированных в других рубриках',
       'B95.8 - Неуточненные стафилококки как причина болезней, классифицированных в других рубриках',
       'E78.2 - Смешанная гиперлипидемия',
       'E78.5 - Гиперлипидемия неуточненная',
       'H60.0 - Абсцесс наружного уха', 'H60.1 - Целлюлит наружного уха',
       'H60.3 - Другие инфекционные наружные отиты',
       'H60.5 - Острый наружный отит неинфекционный',
       'H60.8 - Другие наружные отиты',
       'H60.9 - Наружный отит неуточненный'], dtype=object)

In [0]:
def process_code(string_code):
  """
  splits 'Основной диагноз' into a letter code, a number code, a digit code, 
                                 and a description
  :param str_code: str of format 'A12.3 - text text text'
  :return: pd.Series of
      :return letter: str, letter code, ICD-10 class
      :return number: int, number code, ICD-10 inter-class code
      :return digit: int, digit code, ICD-10 inter-disease code
      :return text: str, ICD-10 code name
  """
  string_code, text = string_code.split(' - ')
  letter, number = string_code[0], string_code[1:3]
  if '.' in string_code:
    digit = int(string_code.split('.')[-1])
  else:
    digit = np.nan
  return pd.Series([letter, number, digit, text])

In [28]:
print(uniques[0])
process_code(uniques[0])

B95.7 - Другие стафилококки как причина болезней, классифицированных в других рубриках


0                                                    B
1                                                   95
2                                                    7
3    Другие стафилококки как причина болезней, клас...
dtype: object

In [29]:
print(tgt_50.loc[16146942]['Основной диагноз'])
process_code(tgt_50.loc[16146942]['Основной диагноз'])

J00 - Острый назофарингит (насморк)


0                                J
1                               00
2                              NaN
3    Острый назофарингит (насморк)
dtype: object

In [30]:
tgt_50[['letter', 'number', 'digit', 'text']] = tgt_50['Основной диагноз'].apply(process_code)
tgt_50.tail()

Unnamed: 0_level_0,Основной диагноз,letter,number,digit,text
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16211073,I11.9 - Гипертензивная [гипертоническая] болез...,I,11,9.0,Гипертензивная [гипертоническая] болезнь с пре...
16211102,J04.1 - Острый трахеит,J,4,1.0,Острый трахеит
16211112,I11.9 - Гипертензивная [гипертоническая] болез...,I,11,9.0,Гипертензивная [гипертоническая] болезнь с пре...
16211116,J20.9 - Острый бронхит неуточненный,J,20,9.0,Острый бронхит неуточненный
16211117,J45.8 - Смешанная астма,J,45,8.0,Смешанная астма


In [0]:
def concat_code(xs):
  if len(xs) == 3:
    letter, number, digit = xs
  elif len(xs) == 2:
    letter, number = xs
    digit = np.nan
  if not pd.isnull(digit):
    digit = str(int(digit))
  else:
    digit = ''
  return int(''.join([str(letter), number, digit]))

In [32]:
le = LabelEncoder()
tgt_50['letter'] = le.fit_transform(tgt_50['letter'].values)
tgt_50['concat_code'] = tgt_50[['letter', 'number', 'digit']].apply(concat_code, axis=1)
tgt_50['concat_2'] = tgt_50[['letter', 'number']].apply(concat_code, axis=1)
tgt_50.tail()

Unnamed: 0_level_0,Основной диагноз,letter,number,digit,text,concat_code,concat_2
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16211073,I11.9 - Гипертензивная [гипертоническая] болез...,2,11,9.0,Гипертензивная [гипертоническая] болезнь с пре...,2119,211
16211102,J04.1 - Острый трахеит,3,4,1.0,Острый трахеит,3041,304
16211112,I11.9 - Гипертензивная [гипертоническая] болез...,2,11,9.0,Гипертензивная [гипертоническая] болезнь с пре...,2119,211
16211116,J20.9 - Острый бронхит неуточненный,3,20,9.0,Острый бронхит неуточненный,3209,320
16211117,J45.8 - Смешанная астма,3,45,8.0,Смешанная астма,3458,345


In [14]:
sorted_50000 = pd.concat([tgt_50[['letter', 'number', 'digit', 'concat_code', 'concat_2']], tf_idf_lsa_50], axis=1).sort_values('concat_code', ascending=False)
sorted_50000.tail()

Unnamed: 0_level_0,letter,number,digit,concat_code,concat_2,IS_MALE,Диагностический статус,Общее состояние,Нервно-психический статус,Уровень сознания,Наличие отеков,Степень отеков,Вес,Рост,ИМТ,Окружность талии,Температура,Пульс,Ритм,1ый тон,2ой тон,Шум в сердце,Сердечный толчок,Верхушечный толчок,Границы сердца,САД,ДАД,ЧДД,Носовое дыхание,Выделения из носа,Количество выделений из носа,Характер выделений из носа,Характер дыхания,Наличие хрипов,Аускультация легких,Влажность хрипов,Характеристика сухих крипов,Наличие крепитации,Шем трения плевры,Форма грудной клетки,...,Сопуствующие заболевания_tok_90,Сопуствующие заболевания_tok_91,Сопуствующие заболевания_tok_92,Сопуствующие заболевания_tok_93,Сопуствующие заболевания_tok_94,Сопуствующие заболевания_tok_95,Сопуствующие заболевания_tok_96,Сопуствующие заболевания_tok_97,Сопуствующие заболевания_tok_98,Сопуствующие заболевания_tok_99,Сопуствующие заболевания_tok_100,Сопуствующие заболевания_tok_101,Сопуствующие заболевания_tok_102,Сопуствующие заболевания_tok_103,Сопуствующие заболевания_tok_104,Сопуствующие заболевания_tok_105,Сопуствующие заболевания_tok_106,Сопуствующие заболевания_tok_107,Сопуствующие заболевания_tok_108,Сопуствующие заболевания_tok_109,Сопуствующие заболевания_tok_110,Сопуствующие заболевания_tok_111,Сопуствующие заболевания_tok_112,Сопуствующие заболевания_tok_113,Сопуствующие заболевания_tok_114,Сопуствующие заболевания_tok_115,Сопуствующие заболевания_tok_116,Сопуствующие заболевания_tok_117,Сопуствующие заболевания_tok_118,Сопуствующие заболевания_tok_119,Сопуствующие заболевания_tok_120,Сопуствующие заболевания_tok_121,Сопуствующие заболевания_tok_122,Сопуствующие заболевания_tok_123,Сопуствующие заболевания_tok_124,Сопуствующие заболевания_tok_125,Сопуствующие заболевания_tok_126,Сопуствующие заболевания_tok_127,Сопуствующие заболевания_tok_128,Сопуствующие заболевания_tok_129
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
16047412,2,10,,210,210,0,0,0,0,0,1,0,,,,,36.5,72.0,2,0,0,0,0,0,0,120.0,80.0,18.0,3,2,0,0,3,5,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15952909,2,10,,210,210,1,0,3,2,2,1,0,143.0,192.0,38.8,,36.6,80.0,2,0,0,2,0,0,0,180.0,130.0,16.0,3,2,0,0,3,5,0,0,0,2,3,3,...,0.000748,0.001988,0.002353,0.001542,-0.007723,-0.019279,-0.004143,0.004173,0.002288,-0.007629,-0.016567,-0.002275,-0.000241,-0.006125,-0.005418,0.001087,0.000671,-0.001132,-0.004499,0.000634,-0.003367,-0.002405,-0.005451,0.004602,0.000239,0.001758,0.001033,0.000247,0.001097,-0.000686,-0.000136,-0.00181,-0.002921,0.000693,0.00084,-0.002277,-0.000948,0.000169,-0.00138,0.000544
10280739,2,10,,210,210,0,0,3,2,2,0,0,90.0,160.0,35.2,,36.4,86.0,0,0,0,0,0,0,0,130.0,70.0,69.0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16059281,2,10,,210,210,1,0,0,0,0,0,0,,,,,,,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16066761,2,10,,210,210,0,0,3,0,0,0,0,,,,,,90.0,2,0,0,0,0,0,0,160.0,110.0,,0,0,0,0,3,5,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
sorted_50000.to_csv('/content/drive/My Drive/job/Data/sorted_diagnosis_50000.csv')

In [15]:
tgt_50.apply(lambda column: len(np.unique(column.values)), axis=0)

Основной диагноз     39
letter                4
number               18
digit               761
text                 39
concat_code          39
concat_2             19
dtype: int64

# Test

## test on vectorized 50k on the 1st level (4 classes)

In [0]:
target = tgt_50.letter.values

In [0]:
%time
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
tf_idf_lsa_50_median = imputer.fit_transform(tf_idf_lsa_50)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs


In [0]:
lsa_50_x_train, lsa_50_x_test, lsa_50_y_train, lsa_50_y_test = train_test_split(tf_idf_lsa_50_median, 
                                                                    target,
                                                                    test_size=0.2,
                                                                    random_state=SEED,
                                                                    stratify=target)

In [0]:
%%time
lsa_50_logreg_model = LogisticRegressionCV(Cs=[10, 1, 0.1], cv=3, penalty='l2', 
                             multi_class='multinomial', max_iter = 100,
                             scoring='f1_macro', n_jobs=-1, verbose=True, random_state=SEED)
lsa_50_logreg_model.fit(lsa_50_x_train, lsa_50_y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  7.3min finished


CPU times: user 2min 50s, sys: 5.32 s, total: 2min 55s
Wall time: 8min 44s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
def print_metrics(true, pred):
  acc = metrics.accuracy_score(true, pred)
  f1 = metrics.f1_score(true, pred, average='macro', zero_division=1)
  prec = metrics.precision_score(true, pred, average='macro', zero_division=1)
  rec = metrics.recall_score(true, pred, average='macro', zero_division=1)
  IoU = metrics.jaccard_score(true, pred, average='macro')
  print(f"accuracy:\t\t{acc}")
  print(f"f1-score macro:\t\t{f1}")
  print(f"precision macro:\t{prec}")
  print(f"recall macro:\t\t{rec}")
  print(f"jaccard macro:\t\t{IoU}")
  return acc, f1, prec, rec, IoU

In [0]:
lsa_50_y_pred = lsa_50_logreg_model.predict(lsa_50_x_test)
lsa_50_1st_res = print_metrics(lsa_50_y_test, lsa_50_y_pred)

accuracy:		0.9132264529058116
f1-score macro:		0.4145349321274522
precision macro:	0.7033169477569376
recall macro:		0.39292263959617824
jaccard macro:		0.3631683895097094


## test on vectorized 50k on the 2d level  (19 classes)

In [0]:
target = tgt_50.concat_2.values

In [0]:
lsa_50_x_train, lsa_50_x_test, lsa_50_y_train, lsa_50_y_test = train_test_split(tf_idf_lsa_50_median, 
                                                                    target,
                                                                    test_size=0.2,
                                                                    random_state=SEED,
                                                                    stratify=target)

In [0]:
%%time
lsa_50_logreg_model = LogisticRegressionCV(Cs=[10, 1, 0.1], cv=3, penalty='l2', 
                             multi_class='multinomial', max_iter = 100,
                             scoring='f1_macro', n_jobs=-1, verbose=True, random_state=SEED)
lsa_50_logreg_model.fit(lsa_50_x_train, lsa_50_y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 11.0min finished


CPU times: user 3min 54s, sys: 11.8 s, total: 4min 6s
Wall time: 13min 5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
lsa_50_y_pred = lsa_50_logreg_model.predict(lsa_50_x_test)
lsa_50_2d_res = print_metrics(lsa_50_y_test, lsa_50_y_pred)

accuracy:		0.6292585170340681
f1-score macro:		0.09960147951949605
precision macro:	0.7082666773889406
recall macro:		0.10083643006329301
jaccard macro:		0.06875737795521948


## test on 300k NA distribution 1st level (4 classes)

In [0]:
na_dist = pd.read_csv('/content/drive/My Drive/job/Data/na_dist.csv', index_col=0)
tgt_na = na_dist[['Основной диагноз']]
na_dist = na_dist.drop(['Основной диагноз'], axis=1)

In [0]:
start = time.time()
tgt_na[['letter', 'number', 'digit', 'text']] = tgt_na['Основной диагноз'].apply(process_code)
tgt_na = tgt_na[['letter', 'number', 'digit']]
print(f"elapsed: {time.time() - start}")
tgt_na.tail()

elapsed: 73.59701299667358


Unnamed: 0_level_0,letter,number,digit
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10155733,I,11,9
25156077,I,11,9
21367207,J,18,9
18992013,J,45,0
16993087,I,20,8


In [0]:
%time
le = LabelEncoder()
tgt_na['letter'] = le.fit_transform(tgt_na['letter'].values)
tgt_na['concat_code'] = tgt_na[['letter', 'number', 'digit']].apply(lambda xs: int(''.join([str(x) for x in xs])), axis=1)
tgt_na['concat_2'] = tgt_na[['letter', 'number']].apply(lambda xs: int(''.join([str(x) for x in xs])), axis=1)
tgt_na.tail()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,letter,number,digit,concat_code,concat_2
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10155733,2,11,9,2119,211
25156077,2,11,9,2119,211
21367207,3,18,9,3189,318
18992013,3,45,0,3450,345
16993087,2,20,8,2208,220


In [0]:
tgt_na.apply(lambda column: len(np.unique(column.values)), axis=0)

letter          4
number         19
digit           9
concat_code    69
concat_2       20
dtype: int64

In [0]:
target_na = tgt_na.letter.values

In [0]:
na_x_train, na_x_test, na_y_train, na_y_test = train_test_split(na_dist, 
                                                                target_na,
                                                                test_size=0.2,
                                                                random_state=SEED,
                                                                stratify=target_na)

In [0]:
%%time
na_logreg_model = LogisticRegressionCV(Cs=[10, 1, 0.1], cv=3, penalty='l2', 
                             multi_class='multinomial', max_iter = 100,
                             scoring='f1_macro', n_jobs=-1, verbose=True, random_state=SEED)
na_logreg_model.fit(na_x_train, na_y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished


CPU times: user 37.1 s, sys: 13.5 s, total: 50.7 s
Wall time: 2min 11s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
na_y_pred = na_logreg_model.predict(na_x_test)
na_1st_res = print_metrics(na_y_test, na_y_pred)

accuracy:		0.847983870967742
f1-score macro:		0.38853067931521945
precision macro:	0.9210724644095734
recall macro:		0.37458722095739605
jaccard macro:		0.32678575566699447


## test on 300k NA distribution 2d level (20 classes)

In [0]:
target_na = tgt_na.concat_2.values

In [0]:
na_x_train, na_x_test, na_y_train, na_y_test = train_test_split(na_dist, 
                                                                target_na,
                                                                test_size=0.2,
                                                                random_state=SEED,
                                                                stratify=target_na)

In [0]:
%%time
na_logreg_model = LogisticRegressionCV(Cs=[10, 1, 0.1], cv=3, penalty='l2', 
                             multi_class='multinomial', max_iter = 100,
                             scoring='f1_macro', n_jobs=-1, verbose=True, random_state=SEED)
na_logreg_model.fit(na_x_train, na_y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.2min finished


CPU times: user 1min 9s, sys: 15.8 s, total: 1min 25s
Wall time: 5min 8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
na_y_pred = na_logreg_model.predict(na_x_test)
na_2d_res = print_metrics(na_y_test, na_y_pred)

accuracy:		0.5911458333333334
f1-score macro:		0.12293372868992022
precision macro:	0.6663858872711726
recall macro:		0.1146623046980757
jaccard macro:		0.0811446596539939


# Organize hierarchical prediction


log reg probas

In [0]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)
clf.predict(X[:2, :])
# array([0, 0])
clf.predict_proba(X[:2, :])
# array([[9.8...e-01, 1.8...e-02, 1.4...e-08],
#        [9.7...e-01, 2.8...e-02, ...e-08]])
clf.predict_log_proba(X[:2, :])
clf.decision_function(X[:2, :])
clf.score(X, y)
# 0.97...
clf.classes_

1. **it's important to store data sorted by the 4-digit code so that the training chuncs are alwasy close together**

2. **it's important when fitting to train models by ordered keys in dict, not just by random keys, for the same reason**

##  architectural choices

- probas or log probas
- how do we treat NaNs in data
- how do we treat NaNs in tgt
  - modify and apply concat_code function
  - we can only have NaNs after some level and down, not ```level_1 = 1, level_2 = np.nan, level_3 = 3```
- **think how class imbalance influences probabilities of classes**
- how we define training hyperparameters

In [0]:
def concat_code(levels, concat_all=False):
  if type(levels) == pd.core.series.Series:
    index = levels.name
    levels = list(levels)
  if pd.isnull(levels[0]):   ## check that level_1 has no NaNs
    raise ValueError('The first level cannot be NaN.')
  str_levels = []
  for i, level in enumerate(levels):
    if type(level) == str and level.isdigit():
      str_levels.append(level)
    if type(level) == int:
      str_levels.append(str(level))
    if type(level) == float:
      if not pd.isnull(level):
        str_levels.append(str(int(level)))
      else:
      ## check that if level_x contains NaN for a given id, all level_x+m should also be NaN for this id
        if not all(pd.isnull(el) for el in levels[i:]):
          if index:
            index_str = f' The error occured at id {index}.'
          else:
            index_str = ''
          raise ValueError('A non-NaN value encounteres after the first NaN value. All values after a NaN level must be NaN.' + index_str)
        break
  if concat_all:
    return [int(''.join(str_levels))] * (len(levels) - len(str_levels)) + [int(''.join(str_levels[:i + 1])) for i in range(len(str_levels))][::-1]
  return int(''.join(str_levels))

In [329]:
concat_code(['1', 1, '00', 1.0, 32, np.nan, np.nan, np.nan])

1100132

In [330]:
concat_code(['1', 1, '00', 1.0, 32, np.nan, np.nan, np.nan], concat_all=True)

[1100132, 1100132, 1100132, 1100132, 11001, 1100, 11, 1]

In [331]:
tgt_50[['letter', 'number', 'digit']].apply(concat_code, axis=1, concat_all=True)

ID_EMIAS
217         [2119, 211, 2]
667         [2119, 211, 2]
858         [2251, 225, 2]
982         [2251, 225, 2]
1158        [3458, 345, 3]
                 ...      
16211073    [2119, 211, 2]
16211102    [3041, 304, 3]
16211112    [2119, 211, 2]
16211116    [3209, 320, 3]
16211117    [3458, 345, 3]
Length: 49899, dtype: object

In [0]:
def build_codes(target_df):
  """
  adds combinations of codes given codes df
  """
  ## check that level_1 has no NaNs
  levels = list(target_df.columns)
  first_level_na = target_df[levels[0]].isnull()
  if any(first_level_na):
    raise ValueError(f'The first level cannot be NaN, but found NaN at id(s) {np.where(first_level_na)[0]}')
  concat_levels = [f'concat_{i}' for i in range(len(levels), 0, -1)]
  target_df[concat_levels] = target_df.apply(lambda row: pd.Series((concat_code(row, concat_all=True))), axis=1)
  return target_df.sort_values(concat_levels[0])

In [333]:
ex_df = pd.DataFrame([[1, '00', "2", '3'], [1, 2, np.nan, np.nan], ['23', 2, 0, np.nan]], columns = [1, 2, 3, 4])
build_codes(ex_df) 


Unnamed: 0,1,2,3,4,concat_4,concat_3,concat_2,concat_1
1,1,2,,,12,12,12,1
2,23,2,0.0,,2320,2320,232,23
0,1,0,2.0,3.0,10023,1002,100,1


In [347]:
test_tgt_50 = tgt_50[['letter',	'number',	'digit'	]].copy()
test_tgt_50.columns = ['level_1', 'level_2', 'level_3']
test_tgt_50 = build_codes(test_tgt_50)
test_tgt_50.tail()

Unnamed: 0_level_0,level_1,level_2,level_3,concat_3,concat_2,concat_1
ID_EMIAS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10843572,3,45,9.0,3459,345,3
16025401,3,45,9.0,3459,345,3
16155629,3,45,9.0,3459,345,3
10716225,3,45,9.0,3459,345,3
10488557,3,45,9.0,3459,345,3


In [0]:
def build_ids(tgt):
  ids = {}
  levels = [col_name for col_name in tgt.columns if col_name.startswith('concat_')][::-1]  # ['concat_1', 'concat_2', 'concat_3']
  for level in levels:
    uniques = np.unique(tgt[level])
    for code in uniques:
      ids[code] = tgt[tgt[level] == code].index
  return ids

In [411]:
%time
ids_ex = build_ids(test_tgt_50)
ids_ex[0]

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


Int64Index([10731940, 10838738, 10285631, 10652887, 16010993, 10390350,
            10657081, 10574950, 10738506, 10704005,    70239, 10663850,
            10106928, 10338518, 10298224, 10248207, 10918193, 10830892,
            10445821, 10647324, 10772864, 10323754, 10106356, 10122611,
            10217439, 16064479, 16171534, 15943259, 10752059, 10183772,
            10497504, 16161631, 10478130, 10181113, 10866362, 10670411,
            16051349, 10155399, 10841955, 10235455, 10130747, 10188946,
            16118064, 16050437, 10209399, 10353687, 10409473, 10113472,
            10808593, 10181860, 10761020, 10874349, 10516810, 10571581,
            10458145, 10783243, 10381444, 10596811, 16074359, 10854260,
            10302724, 10133877, 10758286, 10306046, 16006712, 10881374,
            16120545, 10186494, 15978508, 10106680, 10170033, 13785430,
            10908115, 10836662],
           dtype='int64', name='ID_EMIAS')

In [412]:
ids_ex.keys()

dict_keys([0, 1, 2, 3, 78, 160, 165, 210, 211, 212, 213, 220, 225, 250, 300, 302, 304, 315, 318, 320, 335, 344, 345, 782, 785, 1608, 1609, 1650, 2110, 2119, 2120, 2139, 2201, 2208, 2209, 2250, 2251, 2252, 2258, 2259, 2500, 2509, 3029, 3041, 3042, 3159, 3180, 3181, 3189, 3206, 3208, 3209, 3350, 3441, 3448, 3449, 3450, 3451, 3458, 3459])

In [0]:
ex_f = lambda df: {'x': len(df), 'y': len(df) * 2}

In [0]:
def build_hypers(ids, data, f):
    hypers = {}
    for code in ids.keys():
      hypers[code] = f(data.loc[ids[code]])
    return hypers

In [420]:
start = time.time()
ex_hypers = build_hypers(ids_ex, tf_idf_lsa_50, ex_f)
print(time.time() - start)
print(ex_hypers[0], ex_hypers[2])

5.010737180709839
{'x': 74, 'y': 148} {'x': 41016, 'y': 82032}


In [0]:
class hierarchical_model():
  """
  creates a hierarchial model from data with hierarchical traget and model type

  attributes:
    - data: pd.DataFrame, of int and float, of shape (num_examples, num_features)
    - tgt: pd.DataFrame, of int, float, or digits str; of shape (num_examples, 2 * num_levels - 1)
          level_1, ..., level_n, concat_2, ..., concat_n
    - model_c: class 
            class on __init__ taking data, tgt, {hyperparameters}, 
                          having fit and predict with probs
    - hyper_f: pd.DataFrame -> dict {hyperparameters}
    - ids: dict
        keys: digit codes (num_levels)
        values: indicies in data of this code - inidicies of the data chunk
    - hypers: dict
          keys: digit codes (num_levels)
          values: hyperparams for this level based on func from the data chunk
    - models: dict
          keys: digit codes (num_levels)
          values:  model for this level

  methods:
    - concat_codes (static)
    - build_codes (static)
    - build_ids
    - build_hypers
    - build_models
    - fit
    - predict 
  """


  def __init___(self, data_df, target_df, model_class, data_to_hyper_function):  ## what do we do with NaNs? some models cant work with them, some can
    """
    :param data_df: pd.DataFrame, of int and float, of shape (num_examples, num_features)
                    - cannot have duplicate ids
    :param target_df:  pd.DataFrame, of int, float, or digits str; of shape (num_examples, num_levels)
                      - ids of target_df & data_df should be identical
                      - cannot have duplicate ids
                      - order & names of columns: level_1, ..., level_n
                      - level_1 cannot contain NaNs
                      - if level_x contains NaN for a given id, all level_x+m should also be NaN for this id
    :param model_class: class, on __init__ taking data, tgt, {hyperparameters}, 
                        having methods fit() and predict() or transform() with probabilities
    :param data_to_hyper_function: function, pd.DataFrame -> dict of hyperparameters
    """
    ## check data for duplicte ids
    ## check tgt for duplicte ids
    ## check thata tgt & data have the same ids
    ## check that tgt and data have the same shape[0] num_examples
    self.data = data_df  
    self.tgt = target_df  ## build combined keys with concat code (modify for n length)
    self.model_c = model_class
    self.hyper_f = data_to_hyper_function
    self.ids = {}
    self.hypers = {}
    self.models = {}

  
  @staticmethod
  def concat_code(levels, concat_all=False):
    """
    concatenates a list of digit str, int or float to an int, stops at first NaN

    :param levels: lsit of digit str, int or float, levels to convert to int
                   - fisrt level cannot be NaN
                   - if level_x contains NaN for a given id, all level_x+m should also be NaN for this id
    :param concat_all: bool; whether to return one long concat or concat on every level
    :return: if concat_all: list of int; from the longest code, to the shortest code concat
             else: int, concatenated code
    """
    if type(levels) == pd.core.series.Series:
      index = levels.name
      levels = list(levels)
    if pd.isnull(levels[0]):   ## check that level_1 has no NaNs
      raise ValueError('The first level cannot be NaN.')
    str_levels = []
    for i, level in enumerate(levels):
      if type(level) == str and level.isdigit():
        str_levels.append(level)
      if type(level) == int:
        str_levels.append(str(level))
      if type(level) == float:
        if not pd.isnull(level):
          str_levels.append(str(int(level)))
        else:
        ## check that if level_x contains NaN for a given id, all level_x+m should also be NaN for this id
          if not all(pd.isnull(el) for el in levels[i:]):
            if index:
              index_str = f' The error occured at id {index}.'
            else:
              index_str = ''
            raise ValueError('A non-NaN value encounteres after the first NaN value. All values after a NaN level must be NaN.' + index_str)
          break
    if concat_all:
      return [''.join(str_levels)] * (len(levels) - len(str_levels)) + [''.join(str_levels[:i + 1]) for i in range(len(str_levels))][::-1]
    return int(''.join(str_levels))


  @staticmethod
  def build_codes(target_df):
    """
    adds combinations of codes given codes df

    :param target_df: pd.DataFrame, of int, float, or digits str; of shape (num_examples, num_levels)
                      - cannot have duplicate ids
                      - order & names of columns: level_1, ..., level_n
                      - level_1 cannot contain NaNs
                      - if level_x contains NaN for a given id, all level_x+m should also be NaN for this id

    :return: pd.DataFrame, of int, float, or digits str; of shape (num_examples, 2 * num_levels - 1),
              adds concatenated codes, sorts by the longest code sequence
    """
    levels = list(target_df.columns)
    first_level_na = target_df[levels[0]].isnull()
    if any(first_level_na): ## check that level_1 has no NaNs
      raise ValueError(f'The first level cannot be NaN, but found NaN at id(s) {np.where(first_level_na)[0]}')
    concat_levels = [f'concat_{i}' for i in range(len(levels), 0, -1)]
    target_df[concat_levels] = target_df.apply(lambda row: pd.Series((concat_code(row, concat_all=True))), axis=1)
    return target_df.sort_values(concat_levels[0])

  
  def build_ids(self):
    """
    creates a dictionary of class codes to relevant data ids

    for each level of concatenation creates a dict entry:
      key - code (int)
      value: np.array of ids in the data that have that code combination as tgt
    
    :return: self
    """
    self.ids = {}
    levels = [col_name for col_name in self.tgt.columns if col_name.startswith('concat_')][::-1]  # ['concat_1', 'concat_2', 'concat_3']
    for level in levels:
      uniques = np.unique(self.tgt[level])
      for code in uniques:
        self.ids[code] = self.tgt[self.tgt[level] == code].index
    return self

  
  def build_hypers(self):
    """
    apply hyper_f to relevant data on every level to obtain hyperparameters for every level
    """
    for code in self.ids.keys():
      pass
    return self

  
  def build_models(self):
    """
    init models on every level with relevant hyperparameters
    """
    pass


  def fit(self): # how we define training hyperparameters?
    """
    fit models on every level with relevant data
    """
    pass


  def predict(self, X, beam_width=5, output='top n'):
    """
    predict a class for each classification level using beamsearch on class probabilities

    :param X: pd.DataFrame, of int and float, of shape (num_examples, num_features)
    :param beam_width: int, beam width for beam search,
                       optional, deafult 5
    :param oputput: str {'top n', 'class', 'prob'}:
                  - 'top n': return np.array of shape (num_examples, beam_width),
                             predict top n most probable classes
                  - 'class': return np.array of shape (num_examples, 1),
                             predict the most probable class
                  - 'prob': return np.array of shape (num_examples, beam_width, beam_width),
                             predict top n most probable classes and their probabililties
    """
    pass