In [1]:
import os
import pandas as pd

import nltk
# Fetch the NLTK resources used below. Each call is only needed the first time the
# notebook runs on a machine; afterwards NLTK just reports the package as up-to-date.
nltk.download('punkt_tab')  # presumably the tokenizer data word_tokenize relies on — confirm for your NLTK version
nltk.download('stopwords') # stopword lists, read later through nltk.corpus.stopwords

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lopezgg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data loading

In [2]:
# Directory containing the dataset CSV files (relative path, so it depends on the working directory)
data_dir = "../../data/cancer_type"

In [3]:
# Load the training CSV; it is comma-separated and its first row holds the column names.
train_csv_path = os.path.join(data_dir, "train_tcga_reports_cancer_type.csv")
df_train = pd.read_csv(train_csv_path, sep=',', header=0)

In [4]:
# Dimensions of the training frame as (rows, columns)
df_train.shape

(4761, 4)

In [5]:
# Pull the "text" column out as a plain Python list, one entry per row of df_train
arr_train_corpus = df_train["text"].tolist()

In [6]:
# Number of documents in the training corpus
len(arr_train_corpus)

4761

# Text analysis

In [7]:
# Use the first training document as a running example for the tokenization demos
example_text = arr_train_corpus[0]

In [8]:
# Length of the example document in characters
len(example_text)

2399

In [9]:
# Show the raw example document before any tokenization
print(example_text)

Date of Recelpt: Clinical Diagnosis & History: Incidental 3 cm left upper pole renal mass. Specimens Submitted: 1: Kidney, Left Upper Pole; Partial Nephrectomy. DIAGNOSIS: 1. Kidney, Left Upper Pole; Partial Nephrectomy: Tumor Type: Renal cell carcinoma - Conventional (clear cell) type. Fuhrman Nuclear Grade: Nuclear grade II/IV. Tumor Size: Greatest diameter is 2.4 cm. Local Invasion (for renal cortical types): Not Identified. Renal Vein Invasion: Not identified. Surgical Margins: Free of tumor. Non-Neoplastic Kidney: shows focal chronic inflammation and focal superficial glomerulosclerosis. Adrenal Gland: Not identified. Lymph Nodes: Not identified. Staging for renal cell carcinoma/oncocytoma: pT1 Tumor <= 7.0 cm in greatest dimension limited to the kidney. JATTEST THAT THE ABOVE DIAGNOSIS IS BASED UPON MY PERSONAL EXAMINATION OF THE SLIDES (AND/OR OTHER MATERIAL), AND THAT IHAVE. REVIEWED AND APPROVED THIS REPORT. Gross Description: 1). The specimen Is received fresh for frozen sect

In [10]:
# Baseline tokenizer: str.split() breaks on whitespace only, so punctuation stays
# attached to the neighbouring word (e.g. 'mass.' and 'Kidney,' in the output)
print(example_text.split())

['Date', 'of', 'Recelpt:', 'Clinical', 'Diagnosis', '&', 'History:', 'Incidental', '3', 'cm', 'left', 'upper', 'pole', 'renal', 'mass.', 'Specimens', 'Submitted:', '1:', 'Kidney,', 'Left', 'Upper', 'Pole;', 'Partial', 'Nephrectomy.', 'DIAGNOSIS:', '1.', 'Kidney,', 'Left', 'Upper', 'Pole;', 'Partial', 'Nephrectomy:', 'Tumor', 'Type:', 'Renal', 'cell', 'carcinoma', '-', 'Conventional', '(clear', 'cell)', 'type.', 'Fuhrman', 'Nuclear', 'Grade:', 'Nuclear', 'grade', 'II/IV.', 'Tumor', 'Size:', 'Greatest', 'diameter', 'is', '2.4', 'cm.', 'Local', 'Invasion', '(for', 'renal', 'cortical', 'types):', 'Not', 'Identified.', 'Renal', 'Vein', 'Invasion:', 'Not', 'identified.', 'Surgical', 'Margins:', 'Free', 'of', 'tumor.', 'Non-Neoplastic', 'Kidney:', 'shows', 'focal', 'chronic', 'inflammation', 'and', 'focal', 'superficial', 'glomerulosclerosis.', 'Adrenal', 'Gland:', 'Not', 'identified.', 'Lymph', 'Nodes:', 'Not', 'identified.', 'Staging', 'for', 'renal', 'cell', 'carcinoma/oncocytoma:', 'pT1',

In [11]:
# NLTK's word_tokenize splits punctuation into separate tokens ('mass', '.') while
# keeping tokens such as '2.4' and 'II/IV' in one piece
print(word_tokenize(example_text))

['Date', 'of', 'Recelpt', ':', 'Clinical', 'Diagnosis', '&', 'History', ':', 'Incidental', '3', 'cm', 'left', 'upper', 'pole', 'renal', 'mass', '.', 'Specimens', 'Submitted', ':', '1', ':', 'Kidney', ',', 'Left', 'Upper', 'Pole', ';', 'Partial', 'Nephrectomy', '.', 'DIAGNOSIS', ':', '1', '.', 'Kidney', ',', 'Left', 'Upper', 'Pole', ';', 'Partial', 'Nephrectomy', ':', 'Tumor', 'Type', ':', 'Renal', 'cell', 'carcinoma', '-', 'Conventional', '(', 'clear', 'cell', ')', 'type', '.', 'Fuhrman', 'Nuclear', 'Grade', ':', 'Nuclear', 'grade', 'II/IV', '.', 'Tumor', 'Size', ':', 'Greatest', 'diameter', 'is', '2.4', 'cm', '.', 'Local', 'Invasion', '(', 'for', 'renal', 'cortical', 'types', ')', ':', 'Not', 'Identified', '.', 'Renal', 'Vein', 'Invasion', ':', 'Not', 'identified', '.', 'Surgical', 'Margins', ':', 'Free', 'of', 'tumor', '.', 'Non-Neoplastic', 'Kidney', ':', 'shows', 'focal', 'chronic', 'inflammation', 'and', 'focal', 'superficial', 'glomerulosclerosis', '.', 'Adrenal', 'Gland', ':', '

In [12]:
# Concatenate all documents, separated by a single space, into one string that
# represents the entire corpus
corpus = ' '.join(arr_train_corpus)

In [13]:
# Size of the whole corpus in characters
len(corpus)

17375130

In [14]:
# Tokenize the whole corpus in a single call; the original casing is kept.
# NOTE(review): "coprus" is a typo for "corpus"; the name is reused in later cells,
# so any rename has to be applied to all of them together.
arr_tok_coprus = word_tokenize(corpus)

In [15]:
# Total number of tokens in the corpus (punctuation tokens are counted too)
len(arr_tok_coprus)

3479265

In [16]:
# Top-20 most frequent tokens in the corpus. Counting is case-sensitive ('the' and
# 'The' are separate entries) and no tokens have been filtered out yet.
pd.Series(arr_tok_coprus).value_counts().head(20)

.        426200
:        129098
,        117620
of        64418
the       57700
and       56069
is        49493
)         47540
(         43714
cm        43426
in        38111
The       35766
x         32805
-         28938
lymph     26174
with      24164
to        23388
a         23353
tumor     21413
1         20650
Name: count, dtype: int64

In [17]:
# Vocabulary size (number of unique tokens in the corpus), case-sensitive and unfiltered
len(set(arr_tok_coprus))

55499

# Bag-of-Words (BoW)

In [18]:
# NLTK's English stopword list; tokenizer() filters tokens against it
arr_stopwords = nltk.corpus.stopwords.words('english')

In [19]:
# Inspect the stopword list
print(arr_stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [20]:
def tokenizer(text):
    """
    Tokenize a document into lowercase, purely alphabetic content words.

    The text is lowercased and split with NLTK's word_tokenize. A token is kept only if
    it is longer than one character, is not listed in ``arr_stopwords``, and consists
    solely of alphabetic characters. The last condition drops punctuation, numbers, and
    mixed tokens such as "pt1" or "non-neoplastic".

    Args:
        text (str): The input text to tokenize.

    Returns:
        list: The kept tokens in their original order (duplicates are preserved).
    """
    # Set membership is O(1) per token, whereas scanning the stopword list for every
    # token is linear in the list length; the resulting tokens are the same.
    stopwords = frozenset(arr_stopwords)
    # Single pass instead of rebuilding the token list once per filter.
    return [
        token
        for token in word_tokenize(text.lower())
        if len(token) > 1 and token.isalpha() and token not in stopwords
    ]

In [21]:
# Bag-of-words vectorizer that delegates all text processing to tokenizer()
word_vectorizer = CountVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,  # not used when a tokenizer is supplied; presumably None to avoid scikit-learn's warning — confirm
    lowercase=False,     # tokenizer() already lowercases the text
    stop_words=None      # tokenizer() already removes stopwords
)

In [22]:
# Build the vocabulary from the training corpus using the vectorizer's custom tokenizer
word_vectorizer.fit(arr_train_corpus)

In [23]:
# Vocabulary size learned by the vectorizer, i.e. after tokenizer() lowercasing and filtering
len(word_vectorizer.vocabulary_)

23818

In [24]:
# Encode every training document as token counts over the fitted vocabulary.
# NOTE(review): fit() has already tokenized this corpus once; fit_transform() would
# likely avoid the second tokenization pass — confirm before changing.
arr_train_bow = word_vectorizer.transform(arr_train_corpus)

In [25]:
# Dimensions of the bag-of-words matrix as (documents, vocabulary size)
arr_train_bow.shape

(4761, 23818)