In [2]:
import numpy as np
import os
import sys
import json
import pickle
import re
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

### Read Stopwords

In [3]:
# Load the stopword list: one entry per line of the file, with trailing
# whitespace (including the newline) removed from each entry.
stopwords = []
with open('../../data/meta/stopwords','r') as stopword_file:
    stopwords.extend(entry.rstrip() for entry in stopword_file)

### Find Files

In [4]:
def find_text_files(root_dir):
    """Collect every file with a ``.txt`` extension below ``root_dir``.

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively (bottom-up).

    Returns
    -------
    (paths, names) : tuple of two parallel lists
        ``paths[i]`` is the full path of a matching file and ``names[i]`` is
        its file name without the ``.txt`` extension.
    """
    paths = []
    names = []
    for root, dirs, files in os.walk(root_dir, topdown=False):
        for name in files:
            # Compare the real extension instead of testing whether '.txt'
            # occurs anywhere in the name; a substring test would also accept
            # files such as 'notes.txt.bak' and derive a wrong label from them.
            stem, extension = os.path.splitext(name)
            if extension == '.txt':
                names.append(stem)
                paths.append(os.path.join(root, name))
    return paths, names


# myfiles: paths of the article files; classes: the label of each file,
# taken from its file name without the extension.
myfiles, classes = find_text_files("../../data/cancer_corpi/articles/")

### Read Files

In [5]:
def extract_records(lines):
    """Pull ``(title, authors, abstract)`` tuples out of an iterable of lines.

    The text is split into blocks separated by blank lines. Whenever a block
    longer than five lines ends, the five most recent blocks (the one that
    just ended included) are searched for a line equal to
    'Author information:'. For every such line, the block two positions
    before it is taken as the title, the block one position before it as the
    author list, and the block that just ended as the abstract.

    Parameters
    ----------
    lines : iterable of str
        Text lines, e.g. an open text file.

    Returns
    -------
    list of (str, str, str)
        One ``(title, authors, abstract)`` tuple per match, in input order.
    """
    found = []
    block = []
    prev_blocks = []
    for line in lines:
        line = line.rstrip()
        if line:
            block.append(line)
            continue

        # Blank line: the current block is complete. Keep a sliding window
        # of the five most recent blocks.
        prev_blocks.append(block)
        prev_blocks = prev_blocks[-5:]

        if len(block) > 5:
            abstract = ' '.join(block)
            for p, prev_block in enumerate(prev_blocks):
                # The title and author blocks sit two and one positions
                # before the marker block. For p < 2 they are outside the
                # window, and p-2 / p-1 would be negative indices that wrap
                # around to the END of the window (i.e. the abstract itself),
                # producing a garbage record. Skip those positions.
                if p < 2:
                    continue
                # NOTE(review): when the marker block itself is longer than
                # five lines it is also recorded as an "abstract" here;
                # confirm whether that is intended.
                for prev_line in prev_block:
                    if prev_line == 'Author information:':
                        title = ' '.join(prev_blocks[p-2])
                        authors = ' '.join(prev_blocks[p-1])
                        found.append((title, authors, abstract))
        block = []
    return found


# records: (title, authors, abstract) tuples from all files;
# labels: the class of the file each record came from (parallel to records).
records = []
labels = []
for myfile, file_class in zip(myfiles, classes):
    with open(myfile,'r') as f:
        file_records = extract_records(f)
    records.extend(file_records)
    labels.extend([file_class] * len(file_records))

### Clean

In [6]:
def clean_text(text, stopword_set):
    """Normalise free text to lower-case, space-separated, non-stopword words.

    Steps: drop "'s" and remaining apostrophes, replace every character that
    is not an ASCII letter by a space, lower-case, split on whitespace, and
    remove the words contained in ``stopword_set``.

    Parameters
    ----------
    text : str
        Raw title or abstract.
    stopword_set : container of str
        Words to remove (compared after lower-casing the text).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = text.replace("'s","").replace("'","")
    text = re.sub(r'[^a-zA-Z]',' ', text).lower()
    # Only letters and spaces are left, so a plain whitespace split yields
    # exactly the non-empty words.
    return ' '.join(word for word in text.split() if word not in stopword_set)


def clean_authors(authors):
    """Split a raw author string into a list of lower-case author names.

    Parenthesised parts and periods are removed, the string is lower-cased
    and split on commas. Each name is stripped of surrounding whitespace so
    that the same author does not appear both as 'roe r' and ' roe r'
    depending on the position in the list; empty entries are dropped.

    Parameters
    ----------
    authors : str
        Raw, comma-separated author line.

    Returns
    -------
    list of str
    """
    authors = re.sub(r'\([^)]*\)', '', authors)
    authors = authors.replace('.','').lower()
    return [name.strip() for name in authors.split(',') if name.strip()]


# Build the set once: membership tests against a list would be repeated for
# every word of every title and abstract.
stopword_set = set(stopwords)

# records_clean: (title, abstract, authors, label) tuples with cleaned text.
# count: number of kept records whose abstract contains 'leukaemia'
# (case-sensitive, checked after removing "'s").
records_clean = []
count = 0
for (title, authors, abstract), label in tqdm(zip(records, labels), total=len(records)):
    # Records whose title contains '[Article' are skipped entirely
    # (presumably bracketed placeholder titles -- TODO confirm).
    if '[Article' in title:
        continue

    if 'leukaemia' in abstract.replace("'s",""):
        count = count + 1

    records_clean.append((
        clean_text(title, stopword_set),
        clean_text(abstract, stopword_set),
        clean_authors(authors),
        label,
    ))

  3%|███▉                                                                                                              | 1361/39787 [00:00<00:11, 3290.41it/s]

This paper reviews the Twenty-fourth Annual San Antonio Breast Cancer Symposium. The preliminary results of the ATAC study have shown that Arimidex is superior to tamoxifen in postmenopausal women with ER-positive early breast cancer in terms of DFS, adverse effects and prevention of contralateral breast cancer. However, longer follow up is required to assess the drug safety regarding bone mineral density and cognitive function. Letrozole seems to be superior to tamoxifen as a first-line therapy in ER-positive advanced breast cancer in postmenopausal women. Although the incidence of acute myeloid leukaemia is significantly increased (cumulative incidence at 5 years = 1.1%) in breast cancer patients receiving cyclophosphamide and anthracyclines, the risk of this complication is easily outweighed by the benefits of chemotherapy. Adjuvant clodronate was found to be associated with a significant reduction in the incidence of bone metastases during the treatment period. A randomised trial c

  8%|█████████▎                                                                                                        | 3252/39787 [00:01<00:12, 2888.97it/s]

Breast cancer histologies show important differences in their incidence pattern, method of detection and management. Aggregation of breast cancer occurs also in families diagnosed for cancer at sites different from the breast. Therefore, the familial association of histology specific breast cancers with cancers at other sites is of great interest. The nationwide Swedish Family-Cancer Database was used to calculate standardised incidence ratios (SIRs) for breast cancer when parents or sibling were diagnosed with cancer at the most common sites. Significant SIRs were found when parents had breast, ovarian, laryngeal, endometrial, prostate, lung and colon cancers. If women were diagnosed before the age of 50 years, the SIRs were significant when parents were diagnosed with breast, ovarian, and prostate cancers, and leukaemia, and when siblings were diagnosed with squamous cell skin, pancreatic, breast and endometrial cancers. If mothers were diagnosed with breast cancer, histology-specifi

 13%|██████████████▎                                                                                                   | 5012/39787 [00:01<00:12, 2881.68it/s]

Breast cancer survivors have increased risks of developing second primary cancers due to shared etiology, life style factors but also to primary breast cancer treatment. Among 53 418 patients registered by the population based Danish Breast Cancer Cooperative Group (DBCG) during 1977-2001, 31 818 patients were treated and followed according to guidelines of DBCG. In addition to surgery 23% received tamoxifen, 23% chemotherapy and 35% radiotherapy as treatment for primary breast cancer. Second primary cancers were identified by linkage to the population based Danish Cancer Register. Cancer incidence rates of the Danish population were used for calculation of standardized incidence ratios (SIRs). Time at risk was from diagnosis of breast cancer+1 year until death or through 2002. Risk for all second primary cancers combined was increased, SIR=1.04 (95% confidence interval 0.99-1.08). Sites with significantly elevated risks included corpus uteri (SIR=1.23), ovary (1.39), soft tissues (2.2

 16%|█████████████████▋                                                                                                | 6167/39787 [00:02<00:11, 2825.39it/s]

OBJECTIVE: To update a study of risks for leukaemia, brain cancer and breast cancer in a Danish nationwide, population-based cohort of utility employees. METHODS: A multivariate statistical model including information on age, duration of employment, date of first employment and level of occupational exposure to electromagnetic fields was applied. RESULTS: No increased risk for these cancers was seen among 28,224 subjects with more than 3 months of employment in whom cancer had not been diagnosed before first employment. CONCLUSION: The results do not support the hypothesis of an association between occupational exposure to magnetic fields in the electric utility industry and risks for leukaemia, brain cancer and breast cancer.
Chronic lymphocytic leukaemia (CLL) is the most common leukaemia among the adults in the Western World. CLL (and the corresponding nodal entity small lymphocytic lymphoma, SLL) is classified as a lymphoproliferative disorder characterised by the relentless accumu

 17%|███████████████████▎                                                                                              | 6730/39787 [00:02<00:12, 2713.70it/s]

Haemorrhagic diathesis is the commonest cause of morbidity and mortality in acute leukaemias (AL). It is most commonly due to thrombocytopenia resulting from bone marrow failure. However, in a significant number of cases, disseminated intravascular coagulation (DIC) plays an important part. Previously it was thought that this mechanism was mainly confined to acute promyelocytic leukaemia (APL), but recently it has also been reported to occur in other subtypes of acute leukaemia. We report the results of a study carried out to find the incidence of DIC in various types of AL at the time of first diagnosis and in the absence of other recognisable causes. DIC was observed in 14(13.4%) cases out of 104 cases of AL studied. Nine out of 49(18.4%) cases of AML and 5 out of 55(9.1%) cases of acute lymphoblastic leukaemia (ALL) showed coagulation abnormalities consistent with DIC. Out of the 9 cases of AML showing DIC, 63 (66.67%) belonged to APL (FAB ME) subtype. Three (60%) out of 5 cases of 

 18%|████████████████████▊                                                                                             | 7275/39787 [00:02<00:12, 2678.27it/s]

Over the last 15 years, infrared (IR) spectroscopy has developed into a novel and powerful biomedical tool that has multiple applications in the field of haematology. By revealing subtle alterations in both the conformation and concentration of key macromolecules, such as DNA, protein and lipids, IR spectroscopy has been employed to investigate multiple aspects of leucocyte physiology. IR spectroscopy has been used, for example, to diagnose and prognose leukaemia; to characterise differentiation and apoptotic processes; to predict drug sensitivity and resistance in leukaemic patients undergoing chemotherapy; to monitor the response of leucocytes to chemotherapy and to perform human leucocyte antigen matching for bone marrow transplant patients. Such studies have provided insight into pathogenic mechanisms underlying specific leucocyte disorders, especially leukaemia. While it is likely to be some considerable time before IR spectroscopy is sufficiently developed to displace the establi

 20%|██████████████████████▍                                                                                           | 7849/39787 [00:02<00:11, 2777.24it/s]

STUDY OBJECTIVE: The aim was to test a large set of childhood leukaemia and lymphoma registrations for the presence of clusters in space and in time. DESIGN: The study was a space-time cluster analysis. SETTING: England, Wales and Scotland. PATIENTS: All registrations for leukaemia and lymphoma between 1966 and 1983 in children aged 0 to 14 years were examined. The records included date and age of registration, sex, diagnosis, and the map reference of the postcode of residence. Of the 9411 registrations, 8888 were suitable for inclusion. MAIN RESULTS: There was a statistically significant excess of case pairs occurring jointly within 0.5 km and 60 d of each other: 68 pairs compared with 50.0 expected. The excess was detectable in central England, in the north of England and Scotland, but not in the south west of England. It was concentrated within the age band 4 to 7 years and among the lymphatic leukaemias. Several potential artefacts were considered and excluded, but the possibility 

 21%|████████████████████████                                                                                          | 8404/39787 [00:02<00:11, 2746.14it/s]

There has been no documented increase in childhood leukaemia following the Chernobyl accident. However, different forms of childhood leukaemia may not be equally susceptible to radiation carcinogenesis. Infant leukaemia is a distinct form associated with a specific genetic abnormality. Outside the former Soviet Union, contamination resulting from the Chernobyl accident has been highest in Greece and Austria and high also in the Scandinavian countries. All childhood leukaemia cases diagnosed throughout Greece since 1 January 1980 have been recorded. Here we report that infants exposed in utero to ionizing radiation from the Chernobyl accident had 2.6 times the incidence of leukaemia compared to unexposed children (95% confidence interval, 1.4 to 5.1; P approximately 0.003), and those born to mothers residing in regions with high radioactive fallout were at higher risk of developing infant leukaemia. No significant difference in leukaemia incidence was found among children aged 12 to 47 

 23%|██████████████████████████▌                                                                                       | 9250/39787 [00:03<00:11, 2769.90it/s]

Mercapturic acid pathway metabolites of phenylethyl isothiocyanate inhibited the growth of human leukaemia 60 (HL60) cells in vitro. The adduct with L-cysteine, S-(N-phenylethylthiocarbamoyl)cysteine, was the most potent with strong antileukaemic activity: the median growth inhibitory concentration (GC50) value was 336 +/- 1 nM (N = 18) compared with GC50 values of the precursor formed from dietary glucosinolates, phenylethyl isothiocyanate, 1.49 +/- 0.01 microM (N = 8), and the initial mercapturic acid pathway metabolite S-(N-phenylethylthiocarbamoyl)glutathione 5.46 +/- 0.36 microM (N = 18). S-(N-Benzylthiocarbamoyl)cysteine and S-(N-phenylpropylthiocarbamoyl)cysteine also had antiproliferative activity but S-(N-phenylethylthiocarbamoyl)cysteine was the most potent compound studied. The latter induced DNA fragmentation in HL60 cells but DNA laddering characteristic of apoptosis was not observed. It had low toxicity to corresponding differentiated cells, neutrophils, in culture, and t

 25%|████████████████████████████                                                                                      | 9802/39787 [00:03<00:11, 2711.52it/s]

Peripheral blood specimens, obtained from 71 patients with newly-diagnosed acute non-lymphoblastic leukaemia (ANLL) prior to the initiation of therapy, were assayed for the presence of a myeloid leukaemia-associated cell surface antigen identified by monoclonal antibody YB5.B8. The antibody bound to cells from 22 patients, and these patients had a poorer overall survival rate than those whose cells failed to bind the antibody (p less than 0.025). Fifty patients were treated with daunorubicin/cytosine arabinoside/6-thioguanine (DAT) according to a standard protocol and survived at least to the end of the induction phase (7 days). Of the 34 patients whose cells were YB5.B8 negative, 28 obtained a complete remission. In contrast, only four of the 16 patients whose cells expressed YB5.B8 antigen obtained complete remission (p less than 0.001). Expression of the YB5.B8 antigen in ANLL appears to be a strong prognostic indicator which is independent of other known prognostic factors such as 

 26%|█████████████████████████████▍                                                                                   | 10347/39787 [00:03<00:10, 2684.49it/s]

We report three pregnancies with successful outcomes in two women following allogeneic bone marrow transplantation (BMT) for acute leukaemia using high dose melphalan alone as conditioning therapy. The increasing application and success of BMT together with the instigation of conditioning regimens that do not include total body irradiation should increase such cases. These and previous cases document that a normal outcome of pregnancy is likely in these patients.
We studied the kinetics of EBV-transformed B-cell lines from patients with chronic myeloid leukaemia (CML) using RT-PCR for BCR-ABL transcripts and immunoglobulin heavy chain (IgH) gene rearrangements. Peripheral blood mononuclear cells, obtained from four patients with CML in chronic phase and from one in accelerated phase, were incubated with supernatant from the B95-8 EBV producing cell line. In 11/25 (44%) B-cell cultures established we demonstrated the presence of BCR-ABL transcripts at intervals ranging from 32 to 125 d 

 44%|█████████████████████████████████████████████████▎                                                               | 17378/39787 [00:06<00:08, 2533.07it/s]

BACKGROUND: Oncogenic activation of the PI3K signalling pathway plays a pivotal role in the development of glioblastoma multiforme (GBM). A central node in PI3K downstream signalling is controlled by the serine-threonine kinase AKT1. A somatic mutation affecting residue E17 of the AKT1 gene has recently been identified in breast and colon cancer. The E17K change results in constitutive AKT1 activation, induces leukaemia in mice, and accordingly, may be therapeutically exploited to target the PI3K pathway. Assessing whether AKT1 is activated by somatic mutations in GBM is relevant to establish its role in this aggressive disease. METHODOLOGY/PRINCIPAL FINDINGS: We performed a systematic mutational analysis of the complete coding sequence of the AKT1 gene in a panel of 109 tumor GBM samples and nine high grade astrocytoma cell lines. However, no somatic mutations were detected in the coding region of AKT1. CONCLUSIONS/SIGNIFICANCE: Our data indicate that in GBM oncogenic deregulation of 

 50%|████████████████████████████████████████████████████████▍                                                        | 19867/39787 [00:07<00:07, 2668.66it/s]


KeyboardInterrupt: 

### Store

In [35]:
# Persist the cleaned records as a pickled dict with the single key 'records'.
# NOTE(review): the saved output of the cleaning cell shows a
# KeyboardInterrupt; make sure that cell has run to completion before
# executing this one, otherwise only a partial result is stored.
output_path = "../../data/cancer_corpi/articles/pickle/processed.p"

# Create the target directory if needed so the dump does not fail after the
# processing above has already been done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

data = {}
data['records'] = records_clean

# Use a context manager so the file is flushed and closed deterministically;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open(output_path, "wb") as pickle_file:
    pickle.dump(data, pickle_file)