In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import gc
tqdm.pandas()


def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float16')[:1]
    
    if file == '../input/quoratextemb/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o)>100)
    elif file == '../input/quoratextemb/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin':
        embeddings_index = KeyedVectors.load_word2vec_format(file, binary=True)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def build_vocab(texts):
    sentences = texts.progress_apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        if word in embeddings_index:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        elif word.capitalize() in embeddings_index:
            known_words[word] = embeddings_index[word.capitalize()]
            nb_known_words += vocab[word]
        elif word.lower() in embeddings_index:
            known_words[word] = embeddings_index[word.lower()]
            nb_known_words += vocab[word]
        elif word.upper() in embeddings_index:
            known_words[word] = embeddings_index[word.upper()]
            nb_known_words += vocab[word]
        else:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

def vocab_check_coverage(train, test):
    df = pd.concat([train, test]).reset_index(drop=True)
    
    vocab = build_vocab(df['text'])
    print("Glove : ")
    oov_glove = check_coverage(vocab, embed_glove)
    oov_glove = {"oov_rate": len(oov_glove) / len(vocab), 'oov_words': oov_glove}
    print("Paragram : ")
    oov_paragram = check_coverage(vocab, embed_paragram)
    oov_paragram = {"oov_rate": len(oov_paragram) / len(vocab), 'oov_words': oov_paragram}
    print("FastText : ")
    oov_fasttext = check_coverage(vocab, embed_fasttext)
    oov_fasttext = {"oov_rate": len(oov_fasttext) / len(vocab), 'oov_words': oov_fasttext}
#     print("Google : ")
#     oov_google = check_coverage(vocab, embed_google)
#     oov_google = {"oov_rate": len(oov_google) / len(vocab), 'oov_words': oov_google}
    
    return oov_glove, oov_paragram, oov_fasttext

In [4]:
glove = '../input/quoratextemb/embeddings/glove.840B.300d/glove.840B.300d.txt'
paragram =  '../input/quoratextemb/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
wiki_news = '../input/quoratextemb/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
google_path = '../input/quoratextemb/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

print("Extracting GloVe embedding")
embed_glove = load_embed(glove)
print("Extracting Paragram embedding")
embed_paragram = load_embed(paragram)
print("Extracting FastText embedding")
embed_fasttext = load_embed(wiki_news)
# print("Extracting GoogleNews embedding")
# embed_google = load_embed(google_path)

Extracting GloVe embedding
Extracting Paragram embedding
Extracting FastText embedding


In [5]:
import gc
gc.collect()

11

In [6]:
train_df = pd.read_csv("../input/innoplexusav/train.csv")
test_df  = pd.read_csv("../input/innoplexusav/test.csv")

In [7]:
print(len(test_df))
print(len(train_df))

2924
5279


In [8]:
train = train_df[['drug','text']]
test = test_df[['drug','text']]

In [9]:
train['text'] = train['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
test['text'] = test['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

In [10]:
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

train['text'] = train['text'].str.replace(r"\(.*?\)","")
test['text'] = test['text'].str.replace(r"\(.*?\)","")

In [11]:
contraction_mapping = {
    "Trump's" : 'trump is',"'cause": 'because','â€™': "'",',cause': 'because',';cause': 'because',"ain't": 'am not','ain,t': 'am not',
    'ain;t': 'am not','ain´t': 'am not','ain’t': 'am not',"aren't": 'are not','â€“': '-','â€œ':'"',
    'aren,t': 'are not','aren;t': 'are not','aren´t': 'are not','aren’t': 'are not',"can't": 'cannot',"can't've": 'cannot have','can,t': 'cannot','can,t,ve': 'cannot have',
    'can;t': 'cannot','can;t;ve': 'cannot have',
    'can´t': 'cannot','can´t´ve': 'cannot have','can’t': 'cannot','can’t’ve': 'cannot have',
    "could've": 'could have','could,ve': 'could have','could;ve': 'could have',"couldn't": 'could not',"couldn't've": 'could not have','couldn,t': 'could not','couldn,t,ve': 'could not have','couldn;t': 'could not',
    'couldn;t;ve': 'could not have','couldn´t': 'could not',
    'couldn´t´ve': 'could not have','couldn’t': 'could not','couldn’t’ve': 'could not have','could´ve': 'could have',
    'could’ve': 'could have',"didn't": 'did not','didn,t': 'did not','didn;t': 'did not','didn´t': 'did not',
    'didn’t': 'did not',"doesn't": 'does not','doesn,t': 'does not','doesn;t': 'does not','doesn´t': 'does not',
    'doesn’t': 'does not',"don't": 'do not','don,t': 'do not','don;t': 'do not','don´t': 'do not','don’t': 'do not',
    "hadn't": 'had not',"hadn't've": 'had not have','hadn,t': 'had not','hadn,t,ve': 'had not have','hadn;t': 'had not',
    'hadn;t;ve': 'had not have','hadn´t': 'had not','hadn´t´ve': 'had not have','hadn’t': 'had not','hadn’t’ve': 'had not have',"hasn't": 'has not','hasn,t': 'has not','hasn;t': 'has not','hasn´t': 'has not','hasn’t': 'has not',
    "haven't": 'have not','haven,t': 'have not','haven;t': 'have not','haven´t': 'have not','haven’t': 'have not',"he'd": 'he would',
    "he'd've": 'he would have',"he'll": 'he will',
    "he's": 'he is','he,d': 'he would','he,d,ve': 'he would have','he,ll': 'he will','he,s': 'he is','he;d': 'he would',
    'he;d;ve': 'he would have','he;ll': 'he will','he;s': 'he is','he´d': 'he would','he´d´ve': 'he would have','he´ll': 'he will',
    'he´s': 'he is','he’d': 'he would','he’d’ve': 'he would have','he’ll': 'he will','he’s': 'he is',"how'd": 'how did',"how'll": 'how will',
    "how's": 'how is','how,d': 'how did','how,ll': 'how will','how,s': 'how is','how;d': 'how did','how;ll': 'how will',
    'how;s': 'how is','how´d': 'how did','how´ll': 'how will','how´s': 'how is','how’d': 'how did','how’ll': 'how will',
    'how’s': 'how is',"i'd": 'i would',"i'll": 'i will',"i'm": 'i am',"i've": 'i have','i,d': 'i would','i,ll': 'i will',
    'i,m': 'i am','i,ve': 'i have','i;d': 'i would','i;ll': 'i will','i;m': 'i am','i;ve': 'i have',"isn't": 'is not',
    'isn,t': 'is not','isn;t': 'is not','isn´t': 'is not','isn’t': 'is not',"it'd": 'it would',"it'll": 'it will',"It's":'it is',
    "it's": 'it is','it,d': 'it would','it,ll': 'it will','it,s': 'it is','it;d': 'it would','it;ll': 'it will','it;s': 'it is','it´d': 'it would','it´ll': 'it will','it´s': 'it is',
    'it’d': 'it would','it’ll': 'it will','it’s': 'it is',
    'i´d': 'i would','i´ll': 'i will','i´m': 'i am','i´ve': 'i have','i’d': 'i would','i’ll': 'i will','i’m': 'i am',
    'i’ve': 'i have',"let's": 'let us','let,s': 'let us','let;s': 'let us','let´s': 'let us',
    'let’s': 'let us',"ma'am": 'madam','ma,am': 'madam','ma;am': 'madam',"mayn't": 'may not','mayn,t': 'may not','mayn;t': 'may not',
    'mayn´t': 'may not','mayn’t': 'may not','ma´am': 'madam','ma’am': 'madam',"might've": 'might have','might,ve': 'might have','might;ve': 'might have',"mightn't": 'might not','mightn,t': 'might not','mightn;t': 'might not','mightn´t': 'might not',
    'mightn’t': 'might not','might´ve': 'might have','might’ve': 'might have',"must've": 'must have','must,ve': 'must have','must;ve': 'must have',
    "mustn't": 'must not','mustn,t': 'must not','mustn;t': 'must not','mustn´t': 'must not','mustn’t': 'must not','must´ve': 'must have',
    'must’ve': 'must have',"needn't": 'need not','needn,t': 'need not','needn;t': 'need not','needn´t': 'need not','needn’t': 'need not',"oughtn't": 'ought not','oughtn,t': 'ought not','oughtn;t': 'ought not',
    'oughtn´t': 'ought not','oughtn’t': 'ought not',"sha'n't": 'shall not','sha,n,t': 'shall not','sha;n;t': 'shall not',"shan't": 'shall not',
    'shan,t': 'shall not','shan;t': 'shall not','shan´t': 'shall not','shan’t': 'shall not','sha´n´t': 'shall not','sha’n’t': 'shall not',
    "she'd": 'she would',"she'll": 'she will',"she's": 'she is','she,d': 'she would','she,ll': 'she will',
    'she,s': 'she is','she;d': 'she would','she;ll': 'she will','she;s': 'she is','she´d': 'she would','she´ll': 'she will',
    'she´s': 'she is','she’d': 'she would','she’ll': 'she will','she’s': 'she is',"should've": 'should have','should,ve': 'should have','should;ve': 'should have',
    "shouldn't": 'should not','shouldn,t': 'should not','shouldn;t': 'should not','shouldn´t': 'should not','shouldn’t': 'should not','should´ve': 'should have',
    'should’ve': 'should have',"that'd": 'that would',"that's": 'that is','that,d': 'that would','that,s': 'that is','that;d': 'that would',
    'that;s': 'that is','that´d': 'that would','that´s': 'that is','that’d': 'that would','that’s': 'that is',"there'd": 'there had',
    "there's": 'there is','there,d': 'there had','there,s': 'there is','there;d': 'there had','there;s': 'there is',
    'there´d': 'there had','there´s': 'there is','there’d': 'there had','there’s': 'there is',
    "they'd": 'they would',"they'll": 'they will',"they're": 'they are',"they've": 'they have',
    'they,d': 'they would','they,ll': 'they will','they,re': 'they are','they,ve': 'they have','they;d': 'they would','they;ll': 'they will','they;re': 'they are',
    'they;ve': 'they have','they´d': 'they would','they´ll': 'they will','they´re': 'they are','they´ve': 'they have','they’d': 'they would','they’ll': 'they will',
    'they’re': 'they are','they’ve': 'they have',"wasn't": 'was not','wasn,t': 'was not','wasn;t': 'was not','wasn´t': 'was not',
    'wasn’t': 'was not',"we'd": 'we would',"we'll": 'we will',"we're": 'we are',"we've": 'we have','we,d': 'we would','we,ll': 'we will',
    'we,re': 'we are','we,ve': 'we have','we;d': 'we would','we;ll': 'we will','we;re': 'we are','we;ve': 'we have',
    "weren't": 'were not','weren,t': 'were not','weren;t': 'were not','weren´t': 'were not','weren’t': 'were not','we´d': 'we would','we´ll': 'we will',
    'we´re': 'we are','we´ve': 'we have','we’d': 'we would','we’ll': 'we will','we’re': 'we are','we’ve': 'we have',"what'll": 'what will',"what're": 'what are',"what's": 'what is',
    "what've": 'what have','what,ll': 'what will','what,re': 'what are','what,s': 'what is','what,ve': 'what have','what;ll': 'what will','what;re': 'what are',
    'what;s': 'what is','what;ve': 'what have','what´ll': 'what will',
    'what´re': 'what are','what´s': 'what is','what´ve': 'what have','what’ll': 'what will','what’re': 'what are','what’s': 'what is',
    'what’ve': 'what have',"where'd": 'where did',"where's": 'where is','where,d': 'where did','where,s': 'where is','where;d': 'where did',
    'where;s': 'where is','where´d': 'where did','where´s': 'where is','where’d': 'where did','where’s': 'where is',
    "who'll": 'who will',"who's": 'who is','who,ll': 'who will','who,s': 'who is','who;ll': 'who will','who;s': 'who is',
    'who´ll': 'who will','who´s': 'who is','who’ll': 'who will','who’s': 'who is',"won't": 'will not','won,t': 'will not','won;t': 'will not',
    'won´t': 'will not','won’t': 'will not',"wouldn't": 'would not','wouldn,t': 'would not','wouldn;t': 'would not','wouldn´t': 'would not',
    'wouldn’t': 'would not',"you'd": 'you would',"you'll": 'you will',"you're": 'you are','you,d': 'you would','you,ll': 'you will',
    'you,re': 'you are','you;d': 'you would','you;ll': 'you will',
    'you;re': 'you are','you´d': 'you would','you´ll': 'you will','you´re': 'you are','you’d': 'you would','you’ll': 'you will','you’re': 'you are',
    '´cause': 'because','’cause': 'because',"you've": "you have","could'nt": 'could not',
    "havn't": 'have not',"here’s": "here is",'i""m': 'i am',"i'am": 'i am',"i'l": "i will","i'v": 'i have',"wan't": 'want',"was'nt": "was not","who'd": "who would",
    "who're": "who are","who've": "who have","why'd": "why would","would've": "would have","y'all": "you all","y'know": "you know","you.i": "you i",
    "your'e": "you are","arn't": "are not","agains't": "against","c'mon": "common","doens't": "does not",'don""t': "do not","dosen't": "does not",
    "dosn't": "does not","shoudn't": "should not","that'll": "that will","there'll": "there will","there're": "there are",
    "this'll": "this all","u're": "you are", "ya'll": "you all","you'r": "you are","you’ve": "you have","d'int": "did not","did'nt": "did not","din't": "did not","dont't": "do not","gov't": "government",
    "i'ma": "i am","is'nt": "is not","‘I":'I',
    'ᴀɴᴅ':'and','ᴛʜᴇ':'the','ʜᴏᴍᴇ':'home','ᴜᴘ':'up','ʙʏ':'by','ᴀᴛ':'at','…and':'and','civilbeat':'civil beat',\
    'TrumpCare':'Trump care','Trumpcare':'Trump care', 'OBAMAcare':'Obama care','ᴄʜᴇᴄᴋ':'check','ғᴏʀ':'for','ᴛʜɪs':'this','ᴄᴏᴍᴘᴜᴛᴇʀ':'computer',\
    'ᴍᴏɴᴛʜ':'month','ᴡᴏʀᴋɪɴɢ':'working','ᴊᴏʙ':'job','ғʀᴏᴍ':'from','Sᴛᴀʀᴛ':'start','gubmit':'submit','CO₂':'carbon dioxide','ғɪʀsᴛ':'first',\
    'ᴇɴᴅ':'end','ᴄᴀɴ':'can','ʜᴀᴠᴇ':'have','ᴛᴏ':'to','ʟɪɴᴋ':'link','ᴏғ':'of','ʜᴏᴜʀʟʏ':'hourly','ᴡᴇᴇᴋ':'week','ᴇɴᴅ':'end','ᴇxᴛʀᴀ':'extra',\
    'Gʀᴇᴀᴛ':'great','sᴛᴜᴅᴇɴᴛs':'student','sᴛᴀʏ':'stay','ᴍᴏᴍs':'mother','ᴏʀ':'or','ᴀɴʏᴏɴᴇ':'anyone','ɴᴇᴇᴅɪɴɢ':'needing','ᴀɴ':'an','ɪɴᴄᴏᴍᴇ':'income',\
    'ʀᴇʟɪᴀʙʟᴇ':'reliable','ғɪʀsᴛ':'first','ʏᴏᴜʀ':'your','sɪɢɴɪɴɢ':'signing','ʙᴏᴛᴛᴏᴍ':'bottom','ғᴏʟʟᴏᴡɪɴɢ':'following','Mᴀᴋᴇ':'make',\
    'ᴄᴏɴɴᴇᴄᴛɪᴏɴ':'connection','ɪɴᴛᴇʀɴᴇᴛ':'internet','financialpost':'financial post', 'ʜaᴠᴇ':' have ', 'ᴄaɴ':' can ', 'Maᴋᴇ':' make ', 'ʀᴇʟɪaʙʟᴇ':' reliable ', 'ɴᴇᴇᴅ':' need ',
    'ᴏɴʟʏ':' only ', 'ᴇxᴛʀa':' extra ', 'aɴ':' an ', 'aɴʏᴏɴᴇ':' anyone ', 'sᴛaʏ':' stay ', 'Sᴛaʀᴛ':' start', 'SHOPO':'shop',
    }

In [12]:
bad_case_words = {'tkis' : 'Tyrosine kinase inhibitors','blogthis':'blog','tnfα':'tumor necrosis factor',
                  'jimc':'johannesburg international mail centre','iraes':'immune related adverse events',' ive ':' i have ',
                  'pdl1':'programmed death ligand','pd1':'programmed death ligand','5asas':'aminosalicylates',' doc ':' doctor ','🙂':'good',
                 ' cannot ': ' can not ',' spms ': ' secondary progressive multiple sclerosis ',' aes ': ' adverse events ',
                 'saes':'serious adverse events','anti-cd20':'monoclonal antibodies',' mri ': ' magnetic resonance imaging ',
                 ' re ':' are ',' dmt ': ' drug ',' jcv ': ' virus ',' chemo ': ' chemotherapy ',
                 ' 75mgs ': ' 75 miligrams ','α4β7':'integrin',' pharmgkb ': ' pharmacogenomics knowledgebase ','\x80\x99l':'"',
                ' spms ':' secondary progressive multiple sclerosis ','azd9291':'osimertinib',' mymsteam ': 'multiple sclerosis support ',
                 ' the ':' ',' an ':' ',' amultiple ':' multiple ',' nsclc ': ' non small cell lung cancer ',' meds ': ' medicines ',
                 'thepowerofpoop':'the power of poop',' asas ':' aspirin ','egfr':'estimated glomerular filtration rate',
                  ' alk ': ' lung cancer ',' kras mutation ':' mutation ',' imrt ': ' intensity modulated radiation therapy ',
                ' ms ': ' multiple sclerosis ',' c797s ':' mutation cells ',' nhs ': ' national health service',
                  'politicalspeak': 'political speak','newsspeak':'news speak','clinicaltrials':'clinical trials',
                  'stillstannding':'still standing','naturalreader':'natural reader'}

In [13]:
def correct_contraction(x, dic):
    for word in dic.keys():
        if word in x:
            x = x.replace(word, dic[word])
    return x

In [14]:
train['text'] = train['text'].progress_apply(lambda x: correct_contraction(x, contraction_mapping))
test['text']  = test['text'].progress_apply(lambda x: correct_contraction(x, contraction_mapping))

100%|██████████| 5279/5279 [00:04<00:00, 1216.88it/s]
100%|██████████| 2924/2924 [00:02<00:00, 1037.42it/s]


In [15]:
import os,operator

In [16]:
extra_punct = [
    ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

In [17]:
import string
my_punct = list(string.punctuation)
all_punct = list(set(my_punct + extra_punct))

In [18]:
#all_punct.remove('-')
#all_punct.remove('.')

In [19]:
special_punc_mappings = {"—": "-", "–": "-", "_": "-", '”': '"', "″": '"', '“': '"', '•': '.', '−': '-',
                         "’": "'", "‘": "'", "´": "'", "`": "'", '\u200b': ' ', '\xa0': ' ','،':'','„':'',
                         '…': ' ... ', '\ufeff': ''}

In [20]:
def spacing_punctuation(text):
    """
    add space before and after punctuation and symbols
    """
    for punc in all_punct:
        if punc in text:
            text = text.replace(punc, f' {punc} ')
    return text

def clean_special_punctuations(text):
    for punc in special_punc_mappings:
        if punc in text:
            text = text.replace(punc, special_punc_mappings[punc])
    # remove_diacritics don´t' ->  'don t'
    #text = remove_diacritics(text)
    return text

def spacing_some_connect_words(text):
    """
    'Whyare' -> 'Why are'
    """
    ori = text
    for error in mis_spell_mapping:
        if error in text:
            text = text.replace(error, mis_spell_mapping[error])

    text = re.sub(r" (W|w)hat+(s)*[A|a]*(p)+ ", " WhatsApp ", text)
    text = re.sub(r" (W|w)hat\S ", " What ", text)
    text = re.sub(r" \S(W|w)hat ", " What ", text)
    text = re.sub(r" (W|w)hy\S ", " Why ", text)
    text = re.sub(r" \S(W|w)hy ", " Why ", text)
    text = re.sub(r" (H|h)ow\S ", " How ", text)
    text = re.sub(r" \S(H|h)ow ", " How ", text)
    text = re.sub(r" (W|w)hich\S ", " Which ", text)
    text = re.sub(r" \S(W|w)hich ", " Which ", text)
    text = re.sub(r" (W|w)here\S ", " Where ", text)
    text = re.sub(r" \S(W|w)here ", " Where ", text)
    text = mis_connect_re.sub(r" \1 ", text)
    text = text.replace("What sApp", ' WhatsApp ')
    text = remove_space(text)
    
    return text

def clean_bad_case_words(text):
    for bad_word in bad_case_words:
        if bad_word in text:
            text = text.replace(bad_word, bad_case_words[bad_word])
    return text

In [21]:
def preprocess(text):
    #text = remove_space(text)
    text = clean_bad_case_words(text)
    #text = spacing_some_connect_words(text)
    text = spacing_punctuation(text)
    text = clean_special_punctuations(text)
    return text

In [22]:
train["text"] = train["text"].apply(preprocess)
test["text"] = test["text"].apply(preprocess)

In [23]:
train['text'] = train['text'].str.replace(r'\b\w\b','').str.replace(r'\s+', ' ')
test['text'] = test['text'].str.replace(r'\b\w\b','').str.replace(r'\s+', ' ')

In [24]:
train['text'].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
test['text'].replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

In [25]:
train['text'].replace({'  ':' '}, regex=True, inplace=True)
test['text'].replace({'  ':' '}, regex=True, inplace=True)

In [26]:
print(len(test_df))
print(len(train_df))

2924
5279


In [27]:
oov_glove, oov_paragram, oov_fasttext = vocab_check_coverage(train, test)

100%|██████████| 8203/8203 [00:00<00:00, 18028.91it/s]


Glove : 
Found embeddings for 85.82% of vocab
Found embeddings for  99.06% of all text
Paragram : 
Found embeddings for 86.18% of vocab
Found embeddings for  99.08% of all text
FastText : 
Found embeddings for 82.03% of vocab
Found embeddings for  98.92% of all text


In [28]:
oov_glove['oov_words'][:50]

[('ocrevus', 1854),
 ('entyvio', 1197),
 ('keytruda', 974),
 ('opdivo', 910),
 ('tagrisso', 761),
 ('pembrolizumab', 627),
 ('nivolumab', 620),
 ('tecfidera', 540),
 ('osimertinib', 440),
 ('ros1', 402),
 ('vedolizumab', 269),
 ('alectinib', 241),
 ('durvalumab', 214),
 ('atezolizumab', 208),
 ('dabrafenib', 206),
 ('uceris', 195),
 ('trametinib', 191),
 ('siponimod', 189),
 ('inflectra', 169),
 ('proctosigmoiditis', 160),
 ('mavenclad', 156),
 ('ceritinib', 144),
 ('tecentriq', 144),
 ('upadacitinib', 116),
 ('ibdsuperheroes', 114),
 ('brigatinib', 109),
 ('dcvax', 108),
 ('imfinzi', 94),
 ('ileorectal', 86),
 ('qbtx', 79),
 ('ozanimod', 79),
 ('catdander', 78),
 ('renflexis', 76),
 ('risankizumab', 70),
 ('baf312', 67),
 ('ipoop', 65),
 ('msers', 62),
 ('mekinist', 62),
 ('tafinlar', 60),
 ('filgotinib', 59),
 ('delzicol', 56),
 ('lorlatinib', 55),
 ('rociletinib', 54),
 ('zykadia', 53),
 ('gilotrif', 52),
 ('necitumumab', 51),
 ('jpouch', 49),
 ('serviceengland', 49),
 ('actrims', 4

In [29]:
train['text'][6]

'reply posted for jesszidek . hi jess sorry to read about challenges you are having with your health . you mentioned lot in your post . just want to share some info on few of points . first , know you said that you are scared of humira . humira and other biologics are very successful in reducing symptoms and inducing and maintain disease remission . to reduce your level of fear it can help to learn more about your treatment option . you can learn more about some of your treatment options . to learn more view our understanding ibd medication brochure at : . if you would like to talk , contact help center at 888 - 694 - 8872 or at info @ crohnscolitisfoundation . org'

In [30]:
import re

In [31]:
train['text'] = train['text'].str.replace('# ','')
train['text'] = train['text'].str.replace(' - ','')
train['text'] = train['text'].str.replace(' : ','')

test['text'] = test['text'].str.replace('#','')
test['text'] = test['text'].str.replace(' - ','')
test['text'] = test['text'].str.replace(' : ','')

In [32]:
print(len(test_df))
print(len(train_df))

2924
5279


In [33]:
def splitting(data):
    new=[]
    for sentences in data:
        yes = sentences.split(". ")
        new.append(yes) 
        
    return new
    
train['new_text'] = splitting(train['text'])
test['new_text'] = splitting(test['text'])

In [34]:
print(len(test_df))
print(len(train_df))

2924
5279


In [35]:
def cleaning1(data):
    new_text=[]
    for sentences in data:
        matching = [s for s in sentences if 'reply posted' not in s]
        new_text.append(matching)
        
    return new_text

train['new_text'] = cleaning1(train['new_text'])
test['new_text'] = cleaning1(test['new_text'])

In [36]:
print(len(test_df))
print(len(train_df))

2924
5279


In [37]:
def cleaning2(data):
    new_text=[]
    for sentences in data:
        matching = [s for s in sentences if 'help center' not in s]
        new_text.append(matching)
        
    return new_text

train['new_text'] = cleaning2(train['new_text'])
test['new_text'] = cleaning2(test['new_text'])

In [38]:
print(len(test_df))
print(len(train_df))

2924
5279


In [39]:
def cleaning3(data):
    new_text=[]
    
    for sentences in data:
        if(len(sentences)>1):
            matching = [s for s in sentences if len(s) >= 15]
            new_text.append(matching)
            
        else:
            matching = [s for s in sentences if len(s) >= 2]
            new_text.append(matching)
        
    return new_text

train['new_text'] = cleaning3(train['new_text'])
test['new_text'] = cleaning3(test['new_text'])

In [40]:
print(len(test_df))
print(len(train_df))

2924
5279


In [41]:
def don(d):
    n=[]
    for s in d:
        res = ".".join(s)
        n.append(res)
    return n       

In [42]:
train['text'] = don(train['new_text'])
test['text'] = don(test['new_text'])

In [43]:
train.drop(['drug','new_text'],axis=1,inplace=True)
test.drop(['drug','new_text'],axis=1,inplace=True)

In [44]:
print(len(test_df))
print(len(train_df))

2924
5279


In [45]:
#train["drug"] = train["drug"].apply(preprocess)
#drug_list = train['drug'].to_dict()

In [46]:
train_df.drop(['text'],axis=1,inplace=True)
test_df.drop(['text'],axis=1,inplace=True)

In [47]:
train = pd.merge(train_df,train,left_index=True, right_index=True,how='inner')
test = pd.merge(test_df,test,left_index=True, right_index=True,how='inner')

In [48]:
train.to_csv('train_2.csv',index=False)
test.to_csv('test_2.csv',index=False)