In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Cells below rely on this to show several objects from a single cell.
InteractiveShell.ast_node_interactivity = 'all'

## Loading the data

In [3]:
def _read_split(csv_path):
    """Read one processed split from CSV and index it by its 'Unnamed: 0' column."""
    frame = pd.read_csv(csv_path)
    return frame.set_index('Unnamed: 0')


# Training split: load it and show it right away.
train_data = _read_split('./data/processed/train_set.txt')
train_data

# Validation and test splits: load both, then show them in that order.
valid_data = _read_split('./data/processed/valid_set.txt')
test_data = _read_split('./data/processed/test_set.txt')
valid_data
test_data

# Sparse feature matrix stored in scipy's .npz format
# (TF-IDF features, judging by the file name).
tfidf_features = load_npz('./data/processed/tfidf_features.npz')
tfidf_features

Unnamed: 0_level_0,seq_id,medline_ui,mesh_terms,publication_type,author,source,sensitive,relevance_status,text_cleaned
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
55655,54711,88006849,Adult; Aged; Attitude of Health Personnel/*; A...,JOURNAL ARTICLE.,Karlik BA; Yarcheski A.,Heart Lung 8801; 16(5):544-51,0,,Learning needs cardiac patients: partial repli...
277600,54711,91012901,Allergy and Immunology/*; Human; Hypersensitiv...,NEWS.,Cotton P.,"JAMA 9101; 264(17):2184, 2187",0,,"New allergy society, formed advocate vitro lab..."
172713,54711,89248820,"Administration, Oral; Adult; Aged; Chronic Dis...",JOURNAL ARTICLE.,Thirlwell MP; Sloan PA; Maroun JA; Boos GJ; Be...,Cancer 8909; 63(11 Suppl):2275-83,0,,Pharmacokinetics clinical efficacy oral morphi...
277300,54711,91011780,Case Report; Female; Hemangioma/*PA; Human; Le...,JOURNAL ARTICLE; REVIEW; REVIEW OF REPORTED CA...,Ragbeer MS; Stone J.,J Oral Maxillofac Surg 9101; 48(10):1113-7,0,,Vascular leiomyoma nasal cavity: report case r...
132901,54711,89372453,"Delirium/*/ET; Human; Surgery, Operative/*.",JOURNAL ARTICLE.,Golinger RC.,Am Surg 8912; 55(9):549-51,0,,Delirium surgical patient. Delirium common oft...
...,...,...,...,...,...,...,...,...,...
125350,54711,88298584,Acetazolamide/*PD; Acid-Base Equilibrium/*DE; ...,JOURNAL ARTICLE.,Bickler PE; Litt L; Banville DL; Severinghaus JW.,J Appl Physiol 8811; 65(1):422-7,0,,Effects acetazolamide cerebral acid-base balan...
271319,54711,90225099,"Animal; Antibodies, Monoclonal; Antigens, Diff...",JOURNAL ARTICLE.,Steiniger B; Schroder D; Luck R; Luciano L; va...,Am J Pathol 9007; 136(4):967-78,0,,Gamma interferon treatment vivo provokes accum...
137903,54711,89297385,Adult; Amnesia/*CI/ET; Case Report; Cerebral A...,JOURNAL ARTICLE.,Giang DW; Kido DK.,Radiology 8910; 172(1):195-6,0,,Transient global amnesia associated cerebral a...
153639,54711,89268911,"Amino Acids, Branched-Chain/*TU; Human; Parent...","JOURNAL ARTICLE; REVIEW; REVIEW, TUTORIAL.",Teasley KM; Buss RL.,DICP 8909; 23(5):411-6,0,,parenteral nutrition solutions high concentrat...


Unnamed: 0_level_0,seq_id,medline_ui,mesh_terms,publication_type,author,source,sensitive,relevance_status,text_cleaned
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
55514,54711,88005219,"Human; Infertility, Male/*ET/PP; Male; Semen/*...",LETTER.,Puri VN.,Fertil Steril 8801; 48(4):702-3,1,,Smoking seminal fluid index [letter] demonstra...
12580,1,87155935,Abdomen/*SU; Adult; Aged; Cholecystectomy; Com...,JOURNAL ARTICLE.,Sirinek KR; Burk RR; Brown M; Levine BA.,Arch Surg 8706; 122(3):271-3,0,,Improving survival patients cirrhosis undergoi...
26334,1,87275123,Adult; Arrhythmia/*ET; Bronchogenic Cyst/CO/*R...,JOURNAL ARTICLE.,Watson AJ; Chaudhary BA.,Chest 8711; 92(2):335-6,0,,Cardiac arrhythmias abnormal chest roentgenogr...
95859,54711,88282905,Aluminum/AE/*BL; Brain Diseases/*CI/DI; Compar...,JOURNAL ARTICLE.,Rovelli E; Luciani L; Pagani C; Albonico C; Co...,Clin Nephrol 8811; 29(6):294-8,0,,Correlation serum aluminum concentration signs...
212429,54711,90089954,"Adolescence; Adult; Anus/PP/*SU; Colitis, Ulce...",JOURNAL ARTICLE.,Williams NS; Marzouk DE; Hallan RI; Waldron DJ.,Br J Surg 9004; 76(11):1168-71,0,,Function ileal pouch stapled pouch-anal anasto...
...,...,...,...,...,...,...,...,...,...
279024,54711,91020534,Cadaver; Comparative Study; Cyclosporins/*AE/T...,CLINICAL TRIAL; JOURNAL ARTICLE.,Berardinelli L; Raiteri M; Beretta C; Costanti...,Transplant Proc 9101; 22(5):2343-4,0,,Extrarenal pathology cyclosporine: lasting cha...
80895,54711,88160770,"Adult; Alcohol Drinking/*; Alcohol, Ethyl/BL; ...",JOURNAL ARTICLE.,Persson J; Magnusson PH.,Acta Med Scand 8806; 223(2):101-9,0,,Comparison different methods detecting patient...
294331,54711,91093899,"Anemia, Sickle Cell/*IM; Antibody Formation; B...",JOURNAL ARTICLE.,Marcinak JF; Frank AL; Labotka RL; Rao S; Draw...,J Pediatr 9104; 118(1):69-71,0,,Immunogenicity Haemophilus influenzae type b p...
129046,54711,89340252,Airway Resistance; Animal; Biguanides/*PD; Cap...,JOURNAL ARTICLE.,Haxhiu MA; Deal EC; Cherniack NS.,J Appl Physiol 8911; 67(1):203-9,0,,Influence respiratory drive airway responses e...


Unnamed: 0_level_0,seq_id,medline_ui,mesh_terms,publication_type,author,source,sensitive,relevance_status,text_cleaned
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
125,1,87049457,"Adolescence; Anesthesia, Inhalation/*AE; Child...",JOURNAL ARTICLE.,Wark H; O'Halloran M; Overton J.,Br J Anaesth 8703; 58(11):1224-8,0,not_relevant,Prospective study liver function children foll...
153,1,87049559,Female; Human; Pregnancy; Propranolol/TU; Puer...,JOURNAL ARTICLE.,Ramsay I.,Br J Obstet Gynaecol 8703; 93(11):1121-3,1,not_relevant,Postpartum thyroiditis--an underdiagnosed dise...
222,1,87050621,Actinomycosis/*CO; Adult; Case Report; Human; ...,JOURNAL ARTICLE.,McGibney D; Clarke PB.,Br J Urol 8703; 58(5):566,1,not_relevant,Primary renal actinomycosis presence horseshoe...
282,1,87051243,"Adult; Aged; Antineoplastic Agents, Combined/*...",JOURNAL ARTICLE.,Scheid V; Buzdar AU; Smith TL; Hortobagyi GN.,Cancer 8703; 58(12):2589-93,0,relevant,Clinical course breast cancer patients osseous...
299,1,87051269,Adult; Aged; Autopsy; Echocardiography; Electr...,JOURNAL ARTICLE.,Wadler S; Chahinian P; Slater W; Goldman M; Me...,Cancer 8703; 58(12):2744-50,0,not_relevant,Cardiac abnormalities patients diffuse maligna...
...,...,...,...,...,...,...,...,...,...
350107,54711,91368351,Female; Human; Isoantibodies/AN; Pregnancy; Rh...,LETTER.,Stroup M.,Transfusion 9112; 31(7):677-8,0,relevant,Controversies transfusion medicine: Du testing...
350205,54711,91374975,Antibodies/*AN; Antigenic Determinants; Blood ...,JOURNAL ARTICLE.,Frampton G; Hicks J; Cameron JS.,Kidney Int 9112; 39(6):1225-31,1,relevant,Significance anti-phospholipid antibodies pati...
350206,54711,91374975,Antibodies/*AN; Antigenic Determinants; Blood ...,JOURNAL ARTICLE.,Frampton G; Hicks J; Cameron JS.,Kidney Int 9112; 39(6):1225-31,1,relevant,Significance anti-phospholipid antibodies pati...
350240,54711,91376240,"Child Abuse; Child Day Care Centers/*; Child, ...",JOURNAL ARTICLE.,Bassoff BZ; Willis WO.,Public Health Rep 9112; 106(5):523-9,0,not_relevant,Requiring formal training preventive health pr...


<350274x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 21662129 stored elements in Compressed Sparse Row format>

## Exploring the data

In [5]:
# Inspect the internals of the sparse feature matrix; each expression below
# is echoed as its own output.

# Column index of every stored value (CSR layout, see `.format` below).
tfidf_features.indices
# Row pointers: the stored values of row i sit at positions indptr[i]:indptr[i+1].
tfidf_features.indptr
# Name of the sparse storage format.
tfidf_features.format
# (n_rows, n_columns) of the matrix.
tfidf_features.shape
# The stored values themselves, in the same order as `indices`.
tfidf_features.data

array([2790, 2328, 3239, ..., 3632,  667,  974])

array([       0,       54,      122, ..., 21661957, 21662021, 21662129])

'csr'

(350274, 5000)

array([0.2235799 , 0.07474592, 0.15994515, ..., 0.11581553, 0.06717378,
       0.04023658])

In [6]:
# List the distinct values found in the 'relevance_status' column of the training split.
train_data['relevance_status'].unique()

array([nan])

## Converting 5000 features into a DataFrame

In [8]:
# Densify the sparse matrix into a DataFrame. The default RangeIndex and the
# integer column labels are simply the row / column positions of the matrix.
#
# The matrix is cast to float32 *before* .toarray(): the dense array is the
# expensive object here, and building it in float64 would need twice the
# memory only for the values to be cast to float32 when the splits are joined
# later in the notebook. The resulting float32 values are identical either way.
features = pd.DataFrame(tfidf_features.astype('float32').toarray())
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.126639,0.0,0.169046,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350269,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350270,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350271,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350272,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Joining labels with features and casting to float32 to reduce memory usage

In [10]:
def _with_features(split):
    """Left-join the split's 'sensitive' column with `features` on the index, cast to float32."""
    target_only = split[['sensitive']]
    merged = target_only.join(features, how='left')
    return merged.astype('float32')


train_full = _with_features(train_data)
print('Train is joined')
valid_full = _with_features(valid_data)
print('Valid is joined')
test_full = _with_features(test_data)
print('Test is joined')

train_full

Train is joined
Valid is joined
Test is joined


Unnamed: 0_level_0,sensitive,0,1,2,3,4,5,6,7,8,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55655,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
277600,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.254654,0.0,0.0,0.0,0.0,0.0,0.0
172713,0.0,0.0,0.0,0.0,0.03247,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
277300,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
132901,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125350,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
271319,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
137903,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
153639,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


## Trying different hyperparams

In [12]:
# Logistic regression model
# Separate the 'sensitive' target from the feature columns for both splits.
y_train = train_full['sensitive']
X_train = train_full.drop('sensitive', axis=1)
y_valid = valid_full['sensitive']
X_valid = valid_full.drop('sensitive', axis=1)

# Checking the memory usage
X_train.info()

# Fit one model per iteration cap and report its F-1 on the validation split.
score_message = 'F-1 score for {} max_iters is {}'
for max_it in tqdm([100, 500]):
    estimator = LogisticRegression(max_iter=max_it).fit(X_train, y_train)
    valid_pred = estimator.predict(X_valid)
    valid_f1 = fbeta_score(y_valid, valid_pred, beta=1)
    print(score_message.format(max_it, valid_f1))

<class 'pandas.core.frame.DataFrame'>
Index: 284013 entries, 55655 to 127517
Columns: 5000 entries, 0 to 4999
dtypes: float32(5000)
memory usage: 5.3 GB


  0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 1/2 [02:19<02:19, 139.24s/it]

F-1 score for 100 max_iters is 0.47124047124047125


100%|██████████| 2/2 [04:35<00:00, 137.61s/it]

F-1 score for 500 max_iters is 0.47124047124047125





Since the validation F-1 score is identical for both settings (the solver has most likely converged within the first 100 iterations), we keep the cheaper configuration with `max_iter=100`.

## Testing the model on test data and calculating all the metrics

In [15]:
# Separate the 'sensitive' target from the feature columns of the test split.
X_test = test_full.drop(columns=['sensitive'])
y_test = test_full['sensitive']

# Refit on the training split with max_iter=100. Chaining .fit() into the
# assignment avoids a bare `estimator.fit(...)` expression, which would echo
# the estimator repr because ast_node_interactivity is set to 'all'.
estimator = LogisticRegression(max_iter=100).fit(X_train, y_train)
pred = estimator.predict(X_test)

# Build the one-row metrics table in a single step. Filling an empty frame
# cell by cell with .loc can leave the columns as object dtype; constructing
# it from a dict gives numeric columns with the same labels and order.
results = pd.DataFrame(
    {
        'Precision': precision_score(y_test, pred),
        'Recall': recall_score(y_test, pred),
        'F-1': fbeta_score(y_test, pred, beta=1),
        'F-2': fbeta_score(y_test, pred, beta=2),
        'Accuracy': accuracy_score(y_test, pred),
    },
    index=[0],
)

print('Resulting model performance:')
results

Resulting model performance:


Unnamed: 0,Precision,Recall,F-1,F-2,Accuracy
0,0.834225,0.398366,0.539233,0.44485,0.91741
