# Preprocessing for NLP Breast Cancer Dataset

In [1]:
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK stopword corpus; the stopword-removal step reads it via stopwords.words('english').
nltk.download('stopwords')
# Fetch the Punkt tokenizer models.
# NOTE(review): no visible cell calls a Punkt-based tokenizer (all splitting uses str.split) — confirm this download is still needed.
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Read the cleaned supplementary table into a DataFrame
data = pd.read_excel(
    io='drive/MyDrive/June/Supplementary2_V2_clean.xlsx',
)
# Summary of column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10045 entries, 0 to 10044
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DB_ID              10045 non-null  object 
 1   DISEASE            10042 non-null  object 
 2   DIS_CLASS          10045 non-null  object 
 3   GENE               10045 non-null  object 
 4   PUBMED.ID          10045 non-null  int64  
 5   LACKASSO           10045 non-null  object 
 6   TITLE              2676 non-null   object 
 7   YEAR               10045 non-null  int64  
 8   CONCLUSION         8502 non-null   object 
 9   REF_SENTENCE       10045 non-null  object 
 10  ASSOCIATION_CLASS  10045 non-null  object 
 11  REF_GENE           10045 non-null  object 
 12  GENE_NEW           10045 non-null  object 
 13  WEIGHT             10045 non-null  float64
dtypes: float64(1), int64(2), object(11)
memory usage: 1.1+ MB


## Concatenate sentence + gene columns

In [3]:
# Append the gene name to its reference sentence, separated by a single space,
# pairing the two columns row by row.
data['REF_SENTENCE_GENE'] = [
    ' '.join(sentence_and_gene)
    for sentence_and_gene in zip(data['REF_SENTENCE'], data['REF_GENE'])
]
# Show the combined text of the first row
data['REF_SENTENCE_GENE'].iloc[0]

'No differences in the patterns of L-myc RFLP were found between breast cancer patients and healthy individuals. L-myc'

## Preprocessing steps:

### 1 Removing punctuation

In [4]:
# Strip punctuation from every row.
# The translation table deletes each character in string.punctuation; it is
# built once and reused for all rows.
punct_deletion_table = str.maketrans('', '', string.punctuation)
data['REF_SENTENCE_GENE_nopunct'] = data['REF_SENTENCE_GENE'].map(
    lambda sentence: sentence.translate(punct_deletion_table)
)
# Row 0 before and after punctuation removal
for shown_column in ('REF_SENTENCE_GENE', 'REF_SENTENCE_GENE_nopunct'):
    print(data[shown_column].values[0])

No differences in the patterns of L-myc RFLP were found between breast cancer patients and healthy individuals. L-myc
No differences in the patterns of Lmyc RFLP were found between breast cancer patients and healthy individuals Lmyc


### 2 Removing numbers

In [5]:
# Removing numbers
# Delete stand-alone digit tokens together with the one whitespace character
# that follows them, or the end of the text when the number is the last token.
# The previous pattern (r'\b\d+\b\s') required trailing whitespace, so a number
# at the very end of a text was left in place.
# Digits inside alphanumeric tokens (e.g. DPF3) are untouched because \b does
# not match between two word characters.
standalone_number = re.compile(r'\b\d+\b(?:\s|$)')
data['REF_SENTENCE_GENE_nopunct_nonum'] = [standalone_number.sub('', str(x)) for x in data['REF_SENTENCE_GENE_nopunct']]
# Row 7: original text, after punctuation removal, and after number removal
print(data['REF_SENTENCE_GENE'].values[7])
print(data['REF_SENTENCE_GENE_nopunct'].values[7])
print(data['REF_SENTENCE_GENE_nopunct_nonum'].values[7])

Polymorphisms in the 5' region of DPF3 were associated with increased risk of breast cancer development, lymph node metastases, age of onset, and tumor size in women of European ancestry. DPF3
Polymorphisms in the 5 region of DPF3 were associated with increased risk of breast cancer development lymph node metastases age of onset and tumor size in women of European ancestry DPF3
Polymorphisms in the region of DPF3 were associated with increased risk of breast cancer development lymph node metastases age of onset and tumor size in women of European ancestry DPF3


### 3 Lower casing

In [6]:
# Convert the number-free text of every row to lower case
cased_text = data['REF_SENTENCE_GENE_nopunct_nonum']
data['REF_SENTENCE_GENE_nopunct_nonum_lower'] = cased_text.str.lower()
# Row 7 before and after lower casing
for shown_column in (
    'REF_SENTENCE_GENE_nopunct_nonum',
    'REF_SENTENCE_GENE_nopunct_nonum_lower',
):
    print(data[shown_column].values[7])

Polymorphisms in the region of DPF3 were associated with increased risk of breast cancer development lymph node metastases age of onset and tumor size in women of European ancestry DPF3
polymorphisms in the region of dpf3 were associated with increased risk of breast cancer development lymph node metastases age of onset and tumor size in women of european ancestry dpf3


### 4 Removing stopwords

In [7]:
# Getting list of english stopwords
ENG_STOPWORDS = stopwords.words('english')
# Stopwords that are deliberately left in the text.
# NOTE(review): punctuation (including apostrophes) was stripped in an earlier
# step, so the apostrophe forms listed here cannot occur in the text any more —
# confirm whether the apostrophe-free forms need handling.
stopwords_to_keep = set(['any', 'are', 'aren\'t', 'both', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'don\'t', 'had', 'hadn\'t', 'has', 'hasn\'t',
                         'have', 'haven\'t', 'is', 'isn\'t', 'no', 'nor', 'not', 'some', 'was', 'wasn\'t', 'were', 'weren\'t'])

# Effective removal set, built once. Previously this difference was recomputed
# (and converted to a list) for every single word of every row, and membership
# was tested by linear search.
stopwords_to_remove = frozenset(ENG_STOPWORDS) - stopwords_to_keep

# Removing stopwords from all rows
data['REF_SENTENCE_GENE_nopunct_nonum_lower_nostop'] = data['REF_SENTENCE_GENE_nopunct_nonum_lower'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopwords_to_remove)
)

# Row 7 with and w/o stopwords
print(data['REF_SENTENCE_GENE_nopunct_nonum_lower'].values[7])
print(data['REF_SENTENCE_GENE_nopunct_nonum_lower_nostop'].values[7])

polymorphisms in the region of dpf3 were associated with increased risk of breast cancer development lymph node metastases age of onset and tumor size in women of european ancestry dpf3
polymorphisms region dpf3 were associated increased risk breast cancer development lymph node metastases age onset tumor size women european ancestry dpf3


### 5 Stemming

In [8]:
# Porter stemmer instance shared by all rows
stemmer = PorterStemmer()


def stem_alpha_tokens(text):
    """Stem the purely alphabetic tokens of a whitespace-separated text.

    Tokens for which str.isalpha() is False (e.g. ones containing digits) are
    passed through unchanged. Tokens are re-joined with single spaces.
    """
    stemmed_tokens = []
    for token in text.split():
        if token.isalpha():
            stemmed_tokens.append(stemmer.stem(token))
        else:
            stemmed_tokens.append(token)
    return ' '.join(stemmed_tokens)


# Apply stemming to every row
data['REF_SENTENCE_GENE_nopunct_nonum_lower_nostop_stem'] = data['REF_SENTENCE_GENE_nopunct_nonum_lower_nostop'].apply(stem_alpha_tokens)

# Row 7 before and after stemming
for shown_column in (
    'REF_SENTENCE_GENE_nopunct_nonum_lower_nostop',
    'REF_SENTENCE_GENE_nopunct_nonum_lower_nostop_stem',
):
    print(data[shown_column].values[7])

polymorphisms region dpf3 were associated increased risk breast cancer development lymph node metastases age onset tumor size women european ancestry dpf3
polymorph region dpf3 were associ increas risk breast cancer develop lymph node metastas age onset tumor size women european ancestri dpf3


## Take a look at the data in vectorized form (bag-of-words)

In [None]:
# Bag-of-words counts for the fully preprocessed sentence + gene text
preprocessed_texts = data['REF_SENTENCE_GENE_nopunct_nonum_lower_nostop_stem']
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(preprocessed_texts)
# Dense view of the document-term counts for display
vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# (documents, vocabulary size). Read the shape from the sparse matrix directly
# instead of materialising the full dense array just to inspect its dimensions.
vectorized.shape

(10045, 10123)

In [None]:
# The word lmyc should appear twice in the first row (once in sentence, once in gene).
# Index the sparse matrix directly; converting the whole matrix to a dense
# array to read a single cell is unnecessary.
vectorized[0, vectorizer.vocabulary_['lmyc']]

2

In [None]:
# It should appear only once when we only vectorize the sentence (not the gene).
# A separate vectorizer is fitted here: re-fitting the shared `vectorizer`
# would overwrite its vocabulary_, which would then no longer correspond to
# the columns of `vectorized` built above.
# The raw sentence still contains punctuation ("L-myc"), so the lookup uses the
# key 'myc' rather than 'lmyc'.
sentence_vectorizer = CountVectorizer()
vectorized2 = sentence_vectorizer.fit_transform(data['REF_SENTENCE'])
vectorized2[0, sentence_vectorizer.vocabulary_['myc']]

1

## Save preprocessed data for later model use

In [None]:
# Side-by-side view of the text after each preprocessing stage
data.loc[:, [
    'REF_SENTENCE_GENE',
    'REF_SENTENCE_GENE_nopunct',
    'REF_SENTENCE_GENE_nopunct_nonum',
    'REF_SENTENCE_GENE_nopunct_nonum_lower',
    'REF_SENTENCE_GENE_nopunct_nonum_lower_nostop',
    'REF_SENTENCE_GENE_nopunct_nonum_lower_nostop_stem',
]]

Unnamed: 0,REF_SENTENCE_GENE,REF_SENTENCE_GENE_nopunct,REF_SENTENCE_GENE_nopunct_nonum,REF_SENTENCE_GENE_nopunct_nonum_lower,REF_SENTENCE_GENE_nopunct_nonum_lower_nostop,REF_SENTENCE_GENE_nopunct_nonum_lower_nostop_stem
0,No differences in the patterns of L-myc RFLP w...,No differences in the patterns of Lmyc RFLP we...,No differences in the patterns of Lmyc RFLP we...,no differences in the patterns of lmyc rflp we...,no differences patterns lmyc rflp were found b...,no differ pattern lmyc rflp were found breast ...
1,CYP1A1 is a gene of the cytochrome P-450 famil...,CYP1A1 is a gene of the cytochrome P450 family...,CYP1A1 is a gene of the cytochrome P450 family...,cyp1a1 is a gene of the cytochrome p450 family...,cyp1a1 is gene cytochrome p450 family has prop...,cyp1a1 is gene cytochrom p450 famili ha propos...
2,"However, mutational analysis of E-cadherin in ...",However mutational analysis of Ecadherin in mu...,However mutational analysis of Ecadherin in mu...,however mutational analysis of ecadherin in mu...,however mutational analysis ecadherin multiple...,howev mutat analysi ecadherin multipl foci car...
3,The findings provide no evidence for a role of...,The findings provide no evidence for a role of...,The findings provide no evidence for a role of...,the findings provide no evidence for a role of...,findings provide no evidence role comt val58me...,find provid no evid role comt val58met cyp1a12...
4,The findings provide no evidence for a role of...,The findings provide no evidence for a role of...,The findings provide no evidence for a role of...,the findings provide no evidence for a role of...,findings provide no evidence role comt val58me...,find provid no evid role comt val58met cyp1a12...
...,...,...,...,...,...,...
10040,The purpose of this study was to determine whe...,The purpose of this study was to determine whe...,The purpose of this study was to determine whe...,the purpose of this study was to determine whe...,purpose study was determine whether sipa1 c po...,purpos studi wa determin whether sipa1 c polym...
10041,"In this study, SIPA1 545 C > T polymorphism wa...",In this study SIPA1 545 C T polymorphism was ...,In this study SIPA1 C T polymorphism was dete...,in this study sipa1 c t polymorphism was dete...,study sipa1 c polymorphism was detected metast...,studi sipa1 c polymorph wa detect metastat bre...
10042,We found that SIPA1 545 C > T polymorphism was...,We found that SIPA1 545 C T polymorphism was ...,We found that SIPA1 C T polymorphism was sign...,we found that sipa1 c t polymorphism was sign...,found sipa1 c polymorphism was significantly a...,found sipa1 c polymorph wa significantli assoc...
10043,Our findings indicate that metastatic breast c...,Our findings indicate that metastatic breast c...,Our findings indicate that metastatic breast c...,our findings indicate that metastatic breast c...,findings indicate metastatic breast cancer pat...,find indic metastat breast cancer patient sipa...


In [None]:
# Write the preprocessed table as a real Excel workbook. The previous call used
# to_csv with this .xlsx path, producing a CSV text file with an Excel
# extension that pd.read_excel cannot open.
# NOTE(review): any downstream code that loaded this file with pd.read_csv must
# switch to pd.read_excel.
data.to_excel('drive/MyDrive/June/Supplementary2_V2_preprocessed.xlsx', index=False)