In [1]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the training CSV (path is relative to the notebook's working directory)
# and preview the first rows.
data_df = pd.read_csv("./data/input/train.csv")
data_df.head()

Unnamed: 0,uuid,title,author,abstract,Keywords,label
0,0,Accessible Visual Artworks for Blind and Visua...,"Quero, Luis Cavazos; Bartolome, Jorge Iranzo; ...",Despite the use of tactile graphics and audio ...,accessibility technology; multimodal interacti...,0
1,1,Seizure Detection and Prediction by Parallel M...,"Li, Chenqi; Lammie, Corey; Dong, Xuening; Amir...","During the past two decades, epileptic seizure...",CNN; Seizure Detection; Seizure Prediction; EE...,1
2,2,Fast ScanNet: Fast and Dense Analysis of Multi...,"Lin, Huangjing; Chen, Hao; Graham, Simon; Dou,...",Lymph node metastasis is one of the most impor...,Histopathology image analysis; computational p...,1
3,3,Long-Term Effectiveness of Antiretroviral Ther...,"Huang, Peng; Tan, Jingguang; Ma, Wenzhe; Zheng...",In order to assess the effectiveness of the Ch...,HIV; ART; mortality; observational cohort stud...,0
4,4,Real-Time Facial Affective Computing on Mobile...,"Guo, Yuanyuan; Xia, Yifan; Wang, Jing; Yu, Hui...",Convolutional Neural Networks (CNNs) have beco...,facial affective computing; convolutional neur...,0


In [3]:
# Keep only the columns that remain after dropping the metadata fields below.
# Re-binding instead of `inplace=True`, together with `errors='ignore'`, makes
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that are already gone. `axis` is not needed when `columns=` is given.
data_df = data_df.drop(columns=['uuid', 'title', 'author', 'Keywords'], errors='ignore')
print(data_df.shape)

(6000, 2)


In [4]:
# Shared lemmatizer instance; custom_tokenize uses it when do_lemmatize is True.
lemmatizer = WordNetLemmatizer()

In [5]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category. Tags starting with
    anything other than J, N, V or R fall back to ``wordnet.NOUN``.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to NOUN.
    return wordnet.NOUN

In [6]:
# English stop words from NLTK, held in a set for fast membership checks in
# custom_tokenize. NOTE(review): presumably requires the NLTK 'stopwords'
# corpus to be available locally — confirm the environment setup.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [7]:
import re
import numpy as np

# Preprocessing switches read by custom_tokenize (one per numbered step there).
do_remove_punct = True   # replace non-word, non-space characters with spaces
do_lemmatize = False     # lemmatize tokens with WordNet (disabled)
do_lowercase = True      # lowercase the text
do_remove_stop = True    # drop tokens found in `stopwords`
do_remove_nums = True    # blank out standalone digit runs

def custom_tokenize(row):
    """Tokenize ``row['abstract']`` and store the result in ``row['custom_tokenized']``.

    The steps run in this order, each one gated by its module-level ``do_*``
    flag: punctuation removal, lemmatization, lowercasing, stop-word removal
    and removal of standalone numbers.

    A missing or non-string abstract (for example the NaN pandas produces for
    an empty CSV field) yields an empty token list instead of raising
    ``TypeError`` inside ``re.sub``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the key ``'abstract'``.

    Returns
    -------
    The same ``row`` object, with ``'custom_tokenized'`` set to a list of str.
    """
    text = row['abstract']
    if not isinstance(text, str):
        text = ''

    # 01 - Punctuation: every non-word, non-whitespace character becomes a space
    if do_remove_punct:
        text = re.sub(r'[^\w\s]', ' ', text)

    # 02 - Lemmatization (runs before lowercasing, on the original casing)
    if do_lemmatize:
        text = ' '.join(
            lemmatizer.lemmatize(tok, get_wordnet_pos(tok))
            for tok in nltk.word_tokenize(text)
        )

    # 03 - Lowercasing
    if do_lowercase:
        text = text.lower()

    # 04 - Stop words (grammar words that add nothing to the main topic)
    if do_remove_stop:
        text = ' '.join(tok for tok in text.split() if tok not in stopwords)

    # 05 - Standalone numbers
    if do_remove_nums:
        text = re.sub(r'\b[0-9]+\b', ' ', text)

    # split() with no argument already collapses whitespace runs, so no
    # separate space-normalisation pass is needed.
    row['custom_tokenized'] = text.split()
    return row

# Pre-create the target column with one fresh empty list per row, then let
# custom_tokenize fill it in row by row.
data_df['custom_tokenized'] = [[] for _ in range(len(data_df))]
data_df = data_df.apply(custom_tokenize, axis='columns')

data_df.head(20)

Unnamed: 0,abstract,label,custom_tokenized
0,Despite the use of tactile graphics and audio ...,0,"[despite, use, tactile, graphics, audio, guide..."
1,"During the past two decades, epileptic seizure...",1,"[past, two, decades, epileptic, seizure, detec..."
2,Lymph node metastasis is one of the most impor...,1,"[lymph, node, metastasis, one, important, indi..."
3,In order to assess the effectiveness of the Ch...,0,"[order, assess, effectiveness, chinese, govern..."
4,Convolutional Neural Networks (CNNs) have beco...,0,"[convolutional, neural, networks, cnns, become..."
5,Previously we showed the generation of a prote...,1,"[previously, showed, generation, protein, trap..."
6,Facial emotion recognition (FER) is a field of...,0,"[facial, emotion, recognition, fer, field, res..."
7,This paper proposes a machine learning model b...,0,"[paper, proposes, machine, learning, model, ba..."
8,Most current state-of-the-art blind image qual...,0,"[current, state, art, blind, image, quality, a..."
9,Surgical workflow recognition has numerous pot...,1,"[surgical, workflow, recognition, numerous, po..."


# Data Split

In [8]:
from sklearn.model_selection import train_test_split

# Features keep both the raw abstract (for inspection) and its tokens;
# the label stays a one-column DataFrame.
X = data_df.loc[:, ['abstract', 'custom_tokenized']]
Y = data_df.loc[:, ['label']]

# 80/20 shuffled split with a fixed seed.
X_train_text, X_val_text, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)

# The unsplit frames are not needed past this point.
del X, Y

In [9]:
# Plain Python lists of token lists, in the row order of each split.
corpus_docs_train = list(X_train_text['custom_tokenized'])
corpus_docs_val = list(X_val_text['custom_tokenized'])

# Embeddings

### Training fresh Doc2Vec

In [10]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tag every training document with its position in corpus_docs_train, so a
# tag returned by the model can be used as a positional index into the split.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(corpus_docs_train)
]

# NOTE(review): no seed is set and workers=4, so results may differ between runs.
model = Doc2Vec(
    documents=documents,
    vector_size=50,
    window=5,
    min_count=1,
    workers=4,
)

### Checking vector quality
### By inspecting the most similar documents

In [11]:
# Infer a vector for the first validation document with the trained Doc2Vec
# model and display it.
v1 = model.infer_vector(corpus_docs_val[0])
v1

array([ 0.26839852,  0.12887974,  0.0630647 ,  0.03264263,  0.3089021 ,
        0.01702114,  0.7274753 , -0.30822697, -0.6050099 ,  0.15227705,
        0.35476813, -0.43927732,  0.09672307, -0.52816296,  0.04128575,
        0.11355636,  0.46911326,  0.12250529, -0.70135015, -0.44843572,
       -0.5267816 ,  0.05371866,  0.23688714,  0.13769387,  0.20576397,
        0.08601218,  0.36132756, -0.1707585 , -0.11482327, -0.21785106,
       -0.15644243,  0.1730717 ,  0.17125547,  0.08009847, -0.33285233,
        0.36002773, -0.49833554, -0.36968857,  0.4414048 , -0.3518874 ,
       -0.09212486, -0.230916  , -0.52695817, -0.28784436,  0.36132815,
        0.18354204, -0.03444148, -0.34004754, -0.12779355,  0.38095567],
      dtype=float32)

In [12]:
# Nearest training documents to the inferred vector, as (tag, similarity) pairs.
# `model.dv` is the gensim 4 name for the document vectors; the old
# `model.docvecs` alias is deprecated and emits a warning on every call.
similar_doc = model.dv.most_similar([v1])
similar_doc

  similar_doc = model.docvecs.most_similar([v1])


[(1864, 0.744713544845581),
 (241, 0.7161760330200195),
 (595, 0.7109609842300415),
 (946, 0.7107124328613281),
 (99, 0.7017166614532471),
 (2979, 0.6986605525016785),
 (1817, 0.697917640209198),
 (2981, 0.6946759223937988),
 (893, 0.6944105625152588),
 (1716, 0.6930670738220215)]

In [13]:
# The query: the first validation abstract.
print(X_val_text.iloc[0]['abstract'], '\n')

# Its five closest training abstracts, each followed by its similarity score.
# Tags are positions in the training split, so `.iloc` is the right accessor.
for train_position, similarity in similar_doc[:5]:
    print(X_train_text.iloc[train_position]['abstract'], similarity, '\n')

Colonoscopy is tool of choice for preventing Colorectal Cancer, by detecting and removing polyps before they become cancerous. However, colonoscopy is hampered by the fact that endoscopists routinely miss 22-28% of polyps. While some of these missed polyps appear in the endoscopist's field of view, others are missed simply because of substandard coverage of the procedure, i.e. not all of the colon is seen. This paper attempts to rectify the problem of substandard coverage in colonoscopy through the introduction of the C2D2 (Colonoscopy Coverage Deficiency via Depth) algorithm which detects deficient coverage, and can thereby alert the endoscopist to revisit a given area. More specifically, C2D2 consists of two separate algorithms: the first performs depth estimation of the colon given an ordinary RGB video stream; while the second computes coverage given these depth estimates. Rather than compute coverage for the entire colon, our algorithm computes coverage locally, on a segment-by-se

## Inferring Doc2Vec vectors for both the training and validation document corpora

In [14]:
# One inferred Doc2Vec vector per document, in the row order of each split.
X_train_vec = list(map(model.infer_vector, X_train_text['custom_tokenized']))
X_val_vec = list(map(model.infer_vector, X_val_text['custom_tokenized']))

In [15]:
# Inspect one tagged training document (its tokens plus its integer tag).
documents[3]

TaggedDocument(words=['dermoid', 'cyst', 'also', 'called', 'mature', 'teratoma', 'benign', 'tumor', 'ovary', 'derived', 'pluripotent', 'germ', 'cells', 'often', 'asymptomatic', 'however', 'expressed', 'several', 'complications', 'including', 'infection', 'adnexal', 'torsion', 'rupture', 'rarely', 'ovarian', 'dermoid', 'cysts', 'also', 'transform', 'malignant', 'degeneration', 'ruptured', 'teratoma', 'rare', 'life', 'threatening', 'complication', 'may', 'arise', 'spontaneously', 'however', 'cystic', 'rupture', 'often', 'secondary', 'surgical', 'procedures', 'ovarian', 'cystectomy', 'leading', 'acute', 'peritonitis', 'surgical', 'emergency', 'herein', 'report', 'case', 'acute', 'peritonitis', 'female', 'resulting', 'ovarian', 'dermoid', 'cyst', 'spillage', 'clinical', 'picture', 'radiological', 'imaging', 'consistent', 'ruptured', 'ovarian', 'cyst', 'leading', 'chemical', 'peritonitis', 'histopathological', 'examination', 'confirmed', 'ovarian', 'dermoid', 'cyst'], tags=[3])

# ML Training

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the inferred document vectors.
logreg = LogisticRegression(n_jobs=1, C=1e5)

# Y_train is a one-column DataFrame; scikit-learn expects labels of shape
# (n_samples,), so flatten it to avoid the DataConversionWarning from
# column_or_1d.
logreg.fit(X_train_vec, Y_train.to_numpy().ravel())
Y_pred = logreg.predict(X_val_vec)

  y = column_or_1d(y, warn=True)


In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Report accuracy and the weighted F1 score on the validation split.
val_accuracy = accuracy_score(Y_val, Y_pred)
val_f1 = f1_score(Y_val, Y_pred, average='weighted')

print('Validation accuracy {}'.format(val_accuracy))
print('Validation F1 score: %s' % val_f1)

Validation accuracy 0.8391666666666666
Validation F1 score: 0.8388459599946492


In [18]:
# NOTE(review): `codestop` is not defined anywhere in the visible notebook, so
# this cell raises NameError. Presumably this is a deliberate tripwire that
# halts "Run All" before the unexecuted cells below — confirm, and consider
# moving those cells to a separate notebook instead.
codestop

NameError: name 'codestop' is not defined

### Transfer Learning

In [None]:
import gensim.downloader
from pprint import pprint

# List the names of the models that gensim.downloader reports as available.
pprint(list(gensim.downloader.info()['models'].keys()))

In [None]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim.downloader.
# NOTE(review): presumably downloads the model on first use — confirm size/time cost.
w2v_pt_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Persist the loaded vectors to "word2vec.model" in the current working directory.
w2v_pt_vectors.save("word2vec.model")

In [None]:
# documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus_docs_train)]
# model_d2v_tl = w2v_pt_vectors.wv(documents, vector_size=300, window=5, min_count=1, workers=4)

In [None]:
# Look up the pre-trained vector for the word 'diagnosis'.
w2v_pt_vectors['diagnosis']

In [None]:
from gensim.models.word2vec import Word2Vec
# Blank Word2Vec model: no corpus is passed here, so it has no vocabulary yet.
# NOTE(review): vector_size=300 presumably matches the pre-trained
# 'word2vec-google-news-300' vectors that are copied in later — confirm.
w2v_custom_model = Word2Vec(vector_size=300, min_count=1, window=5, workers=4)

In [None]:
# Probe the blank model: no vocabulary has been built at this point, so the
# lookup is expected to end in the KeyError branch.
try:
    diagnosis_vec = w2v_custom_model.wv['diagnosis']
except KeyError as missing_key:
    print(missing_key)
else:
    print(diagnosis_vec)

#### Build vocabulary

In [None]:
# Build the model vocabulary from the tokenized training documents.
w2v_custom_model.build_vocab(corpus_docs_train)

### Injecting pre-trained vectors into blank custom vectors

In [None]:
# Snapshot of the custom model's vocabulary keys, in index order.
vocab_custom_model = list(w2v_custom_model.wv.index_to_key)

In [None]:
# Snapshot of the pre-trained vectors' vocabulary keys, in index order.
vocab_pt_model = list(w2v_pt_vectors.index_to_key)

In [None]:
# Copy the pre-trained vector for every word the two vocabularies share.
# Corpus-specific words that the pre-trained vocabulary lacks keep their
# initial vector. An explicit membership test replaces the former bare
# `except:`, which silently hid every other failure as well (for example a
# shape mismatch during assignment).
for key in vocab_custom_model:
    if key in w2v_pt_vectors:
        w2v_custom_model.wv[key] = w2v_pt_vectors[key]

In [None]:
# Probe again after the pre-trained vectors were copied in; a missing word is
# reported by printing the KeyError rather than failing the cell.
try:
    diagnosis_vec = w2v_custom_model.wv['diagnosis']
except KeyError as missing_key:
    print(missing_key)
else:
    print(diagnosis_vec)

#### Instruct the model to update all vectors during training

In [None]:
# One lock factor per vocabulary entry; 1.0 means the vector is fully
# updatable during training. gensim keeps its lock factors as float32, so the
# dtype is set explicitly instead of relying on np.ones' float64 default.
w2v_custom_model.wv.vectors_lockf = np.ones(len(w2v_custom_model.wv), dtype=np.float32)

In [None]:
# Probe after resetting the lock factors (the vector itself should be unchanged
# by that step); a missing word prints the KeyError.
try:
    diagnosis_vec = w2v_custom_model.wv['diagnosis']
except KeyError as missing_key:
    print(missing_key)
else:
    print(diagnosis_vec)

In [None]:
# Train on the tokenized training documents for 10 epochs; total_examples is
# the number of training documents.
w2v_custom_model.train(corpus_docs_train, total_examples=len(corpus_docs_train), epochs=10)

In [None]:
# Probe once more after training, to compare with the earlier printouts;
# a missing word prints the KeyError.
try:
    diagnosis_vec = w2v_custom_model.wv['diagnosis']
except KeyError as missing_key:
    print(missing_key)
else:
    print(diagnosis_vec)

In [None]:
# Keep only the tokens of the first validation document that the custom model
# has a vector for (validation words unseen in training are dropped).
test_doc = [tok for tok in corpus_docs_val[0] if tok in w2v_custom_model.wv]

# Indexing with a list of tokens returns one vector per token; average them so
# v1 is a single document vector that can serve as a similarity query.
# An all-unknown document falls back to the zero vector.
if test_doc:
    v1 = w2v_custom_model.wv[test_doc].mean(axis=0)
else:
    v1 = np.zeros(w2v_custom_model.wv.vector_size, dtype=np.float32)
v1

In [None]:
# The original cell queried the Doc2Vec model (`model`, 50-d document vectors)
# with word2vec vectors (300-d), which cannot work: the dimensions differ and
# the two embedding spaces are unrelated. The search is done in the word2vec
# space instead, with each document represented by the mean of its token vectors.
wv = w2v_custom_model.wv

# Accepts either a single document vector or a per-token matrix for v1.
query_vec = np.atleast_2d(v1).mean(axis=0)

train_doc_vecs = np.zeros((len(corpus_docs_train), wv.vector_size), dtype=np.float32)
for doc_idx, doc in enumerate(corpus_docs_train):
    known = [tok for tok in doc if tok in wv]
    if known:  # documents with no known token keep the zero vector
        train_doc_vecs[doc_idx] = wv[known].mean(axis=0)

# Cosine similarity; zero-norm rows get similarity 0 instead of dividing by 0.
norms = np.linalg.norm(train_doc_vecs, axis=1) * np.linalg.norm(query_vec)
cosine = (train_doc_vecs @ query_vec) / np.where(norms == 0, 1.0, norms)

# Top 10 as (position in corpus_docs_train, similarity) pairs, the same shape
# of result that most_similar returns.
top_idx = np.argsort(-cosine)[:10]
similar_doc = [(int(i), float(cosine[i])) for i in top_idx]
similar_doc

In [None]:
# Show the key stored at index 0 of the custom model's vocabulary.
w2v_custom_model.wv.index_to_key[0]

In [None]:
# Show the key stored at index 0 of the pre-trained vocabulary, for comparison.
w2v_pt_vectors.index_to_key[0]