In [1]:
!pip install googletrans

from googletrans import Translator

from dask import bag, diagnostics

import numpy as np 

import pandas as pd 

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

import random

import re

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 456 kB/s 
[?25hCollecting hstspreload
  Downloading hstspreload-2020.8.18-py3-none-any.whl (938 kB)
[K     |████████████████████████████████| 938 kB 1.1 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.1.0-py3-none-any.whl (4.5 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 520 kB/s 
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.4.0-py2.py3-none-any.whl (31 kB)
Collecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 953 kB/s 
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 1.3 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Down

In [2]:
# Load the raw train / test splits of the research-topic-tags dataset.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/researchtopictags/train.csv')
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/researchtopictags/test.csv')

In [3]:
def applyRegexps(text, listRegExp):
    """Run a sequence of regex substitutions over ``text``, one after another.

    Parameters
    ----------
    text : str
        The text to rewrite.
    listRegExp : iterable of dict
        Substitution rules.  Each rule holds the pattern under ``'left'``
        and the replacement template under ``'right'``.

    Returns
    -------
    str
        ``text`` after every rule has been applied, in list order (each
        rule sees the output of the previous one).
    """
    for rule in listRegExp:
        pattern, replacement = rule['left'], rule['right']
        text = re.sub(pattern, replacement, text)
    return text


def detex(latexText):
    r"""Transform a LaTeX text into a simple plain text.

    The conversion is an ordered list of regular-expression substitutions
    handed to ``applyRegexps``.  In order, it:

    1. drops everything up to and including the first ``\begin{document}``;
    2. removes ``%`` comments (a ``%`` preceded by a backslash or a digit
       is kept);
    3. replaces formatting commands such as ``\emph{...}`` by their content;
    4. rewrites sectioning commands as ``#--content--#`` and ``\title``,
       ``\cite``, ``\ref`` ... as ``[content]``;
    5. removes structural commands (``\begin``, ``\label``, ``\left`` ...)
       together with their options and arguments;
    6. maps common symbols and accented letters to plain characters;
    7. strips ``$`` delimiters, tidies spacing, removes lonely braces and
       collapses blank lines.

    Parameters
    ----------
    latexText : str
        Text possibly containing LaTeX markup.

    Returns
    -------
    str
        The text with the markup listed above removed or replaced.
    """
    text = latexText

    # Remove the header, i.e. everything up to and including the first
    # occurrence of "\begin{document}".  A plain substring search gives the
    # same result as a lazy ".*?" regex without any regex backtracking.
    marker = r'\begin{document}'
    marker_at = text.find(marker)
    if marker_at != -1:
        text = text[marker_at + len(marker):]

    # A command name ends where the letters end.  Without this guard a rule
    # for "\right" also fires on "\rightarrow" and "\le" on "\lemma".
    end_of_name = r'(?![A-Za-z])'

    regexps = []

    # remove comments (keep "\%" and "<digit>%")
    regexps.append({'left': r'([^\\\d])%.*', 'right': r'\1'})

    # - replace some LaTeX commands by the contents inside curly brackets
    to_reduce = [r'\\emph', r'\\textbf', r'\\textit', r'\\text',
                 r'\\IEEEauthorblockA', r'\\IEEEauthorblockN', r'\\author',
                 r'\\caption', r'\\author', r'\\thanks']
    for tag in to_reduce:
        regexps.append({'left': tag + r'\{([^\}\{]*)\}', 'right': r'\1'})

    # - replace some LaTeX commands by the contents inside curly brackets
    #   and highlight these contents; highlightment pattern: #--content--#
    to_highlight = [r'\\part[\*]*', r'\\chapter[\*]*', r'\\section[\*]*',
                    r'\\subsection[\*]*', r'\\subsubsection[\*]*',
                    r'\\paragraph[\*]*']
    for tag in to_highlight:
        regexps.append({'left': tag + r'\{([^\}\{]*)\}',
                        'right': r'\n#--\1--#\n'})

    # highlightment pattern: [content]
    to_highlight = [r'\\title', r'\\author', r'\\thanks', r'\\cite', r'\\ref']
    for tag in to_highlight:
        regexps.append({'left': tag + r'\{([^\}\{]*)\}', 'right': r'[\1]'})

    # remove LaTeX tags
    # - remove completely some LaTeX commands that take arguments: the tag
    #   with its options and arguments is replaced by a single space
    to_remove = [r'\\maketitle', r'\\footnote', r'\\centering',
                 r'\\IEEEpeerreviewmaketitle', r'\\includegraphics',
                 r'\\IEEEauthorrefmark', r'\\label', r'\\begin', r'\\end',
                 r'\\big', r'\\right', r'\\left', r'\\documentclass',
                 r'\\usepackage', r'\\bibliographystyle', r'\\bibliography',
                 r'\\cline', r'\\multicolumn']
    for tag in to_remove:
        regexps.append({
            'left': tag + end_of_name + r'(\[[^\]]*\])*(\{[^\}\{]*\})*',
            'right': r' ',
        })

    # replace some symbols by their ascii equivalent
    # - common symbols
    regexps.append({'left': r'\\eg(\{\})* *', 'right': r'e.g., '})
    regexps.append({'left': r'\\ldots', 'right': r'...'})
    regexps.append({'left': r'\\Rightarrow', 'right': r'=>'})
    regexps.append({'left': r'\\rightarrow', 'right': r'->'})
    # "\le"/"\leq" and "\ge"/"\geq" are the non-strict inequalities
    regexps.append({'left': r'\\leq?' + end_of_name, 'right': r'<='})
    regexps.append({'left': r'\\geq?' + end_of_name, 'right': r'>='})
    regexps.append({'left': r'\\_', 'right': r'_'})
    regexps.append({'left': r'\\\\', 'right': r'\n'})
    regexps.append({'left': r'~', 'right': r' '})
    # Turn bare "&" column separators into tabs BEFORE unescaping "\&";
    # in the opposite order a literal ampersand would become a tab too.
    regexps.append({'left': r'([^\\])&', 'right': r'\1\t'})
    regexps.append({'left': r'\\&', 'right': r'&'})
    regexps.append({'left': r'\\%', 'right': r'%'})
    regexps.append({'left': r'\\item', 'right': r'\t- '})
    regexps.append({'left': r'\\hline[ \t]*\\hline',
                    'right': r'============================================='})
    regexps.append({'left': r'[ \t]*\\hline',
                    'right': r'_____________________________________________'})

    # - special letters
    regexps.append({'left': r'\\\'{?\{e\}}?', 'right': r'é'})
    regexps.append({'left': r'\\`{?\{a\}}?', 'right': r'à'})
    regexps.append({'left': r'\\\'{?\{o\}}?', 'right': r'ó'})
    regexps.append({'left': r'\\\'{?\{a\}}?', 'right': r'á'})

    # keep untouched the contents of the equations
    regexps.append({'left': r'\$(.)\$', 'right': r'\1'})
    regexps.append({'left': r'\$([^\$]*)\$', 'right': r'\1'})

    # remove the equation symbols ($)
    regexps.append({'left': r'([^\\])\$', 'right': r'\1'})

    # correct spacing problems
    # (replacement templates are plain characters: in a template "\)" or
    # "\." is an unknown escape that re.sub copies verbatim, backslash
    # included)
    regexps.append({'left': r' +,', 'right': r','})
    regexps.append({'left': r' +', 'right': r' '})
    regexps.append({'left': r' +\)', 'right': r')'})
    regexps.append({'left': r'\( +', 'right': r'('})
    regexps.append({'left': r' +\.', 'right': r'.'})

    # remove lonely curly brackets, then unescape "\{" and "\}"
    regexps.append({'left': r'^([^\{]*)\}', 'right': r'\1'})
    regexps.append({'left': r'([^\\])\{([^\}]*)\}', 'right': r'\1\2'})
    regexps.append({'left': r'\\\{', 'right': r'{'})
    regexps.append({'left': r'\\\}', 'right': r'}'})

    # strip white space characters at end of line
    regexps.append({'left': r'[ \t]*\n', 'right': r'\n'})

    # remove consecutive blank lines
    regexps.append({'left': r'([ \t]*\n){3,}', 'right': r'\n'})

    # apply all those regexps, in the order they were registered
    return applyRegexps(text, regexps)

In [4]:
%%time
# Strip LaTeX markup from both text columns of the training set, then
# flatten what is left: newlines and backslashes become spaces.
for column in ('TITLE', 'ABSTRACT'):
    train_data[column] = train_data[column].apply(
        lambda raw: detex(raw).replace("\n", " ").replace("\\", " ")
    )

CPU times: user 9min 47s, sys: 278 ms, total: 9min 48s
Wall time: 9min 48s


In [5]:
%%time
# Strip LaTeX markup from both text columns of the test set, then
# flatten what is left: newlines and backslashes become spaces.
for column in ('TITLE', 'ABSTRACT'):
    test_data[column] = test_data[column].apply(
        lambda raw: detex(raw).replace("\n", " ").replace("\\", " ")
    )

CPU times: user 3min 49s, sys: 106 ms, total: 3min 49s
Wall time: 3min 49s


In [6]:
# Persist the de-LaTeXed splits (row index not written).
train_data.to_csv(path_or_buf='train_without_latex_tags.csv', index=False)
test_data.to_csv(path_or_buf='test_without_latex_tags.csv', index=False)

In [7]:
# Re-read the raw CSVs: `train_data` / `test_data` are rebound to the
# original, unprocessed text before the translation cells run.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/researchtopictags/train.csv')
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/researchtopictags/test.csv')

In [8]:
def translate(words, dest):
    """Translate ``words`` with googletrans and return the translated text.

    Parameters
    ----------
    words : str
        Text to translate.
    dest : str or None
        Target language code.  When falsy, one of ``'it'``, ``'fr'``,
        ``'es'``, ``'de'`` is drawn with ``np.random.choice``.

    Returns
    -------
    str
        The ``.text`` attribute of the googletrans translation result.
    """
    fallback_languages = ['it', 'fr', 'es', 'de']

    # Only draw a random language when the caller did not request one.
    target = dest if dest else np.random.choice(fallback_languages)

    # A fresh Translator is created for every call.
    result = Translator().translate(words, dest=target)
    return result.text


def trans_parallel(df, dest):
    """Translate the TITLE and ABSTRACT columns of ``df`` through dask bags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``TITLE`` and ``ABSTRACT`` columns.  It is modified in
        place.
    dest : str or None
        Target language forwarded to ``translate`` for every element.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns overwritten by their
        translations.
    """
    # Build the two lazy bags first (titles, then abstracts) ...
    pending = [
        bag.from_sequence(df[column].tolist()).map(translate, dest)
        for column in ('TITLE', 'ABSTRACT')
    ]

    # ... then evaluate them one after the other under a progress bar.
    with diagnostics.ProgressBar():
        titles, abstracts = [job.compute() for job in pending]

    df[['TITLE', 'ABSTRACT']] = list(zip(titles, abstracts))
    return df

    
# Back-translation: send a copy of each split to a randomly chosen language
# (dest=None), then translate the result back to English.
# trans_parallel overwrites the frame it receives and returns it, so each
# decode_* name refers to the same object as the matching encode_* name.
encode_train = trans_parallel(train_data.copy(), dest=None)
decode_train = trans_parallel(encode_train, dest='en')

encode_test = trans_parallel(test_data.copy(), dest=None)
decode_test = trans_parallel(encode_test, dest='en')


[########################################] | 100% Completed | 29min 50.5s
[########################################] | 100% Completed | 31min 19.4s
[########################################] | 100% Completed | 28min 53.1s
[########################################] | 100% Completed | 30min 18.2s
[########################################] | 100% Completed | 12min 18.7s
[########################################] | 100% Completed | 13min 21.1s
[########################################] | 100% Completed | 12min 19.2s
[########################################] | 100% Completed | 13min 11.6s


In [9]:
# Persist the back-translated splits before LaTeX removal (row index not written).
decode_train.to_csv(path_or_buf='train_aug_with_latex_tags.csv', index=False)
decode_test.to_csv(path_or_buf='test_aug_with_latex_tags.csv', index=False)

In [10]:
%%time
# Strip LaTeX markup from both text columns of the back-translated training
# set, then flatten what is left: newlines and backslashes become spaces.
for column in ('TITLE', 'ABSTRACT'):
    decode_train[column] = decode_train[column].apply(
        lambda raw: detex(raw).replace("\n", " ").replace("\\", " ")
    )

CPU times: user 10min, sys: 0 ns, total: 10min
Wall time: 10min


In [11]:
%%time
# Strip LaTeX markup from both text columns of the back-translated test
# set, then flatten what is left: newlines and backslashes become spaces.
for column in ('TITLE', 'ABSTRACT'):
    decode_test[column] = decode_test[column].apply(
        lambda raw: detex(raw).replace("\n", " ").replace("\\", " ")
    )

CPU times: user 3min 56s, sys: 0 ns, total: 3min 56s
Wall time: 3min 56s


In [12]:
# Persist the back-translated, de-LaTeXed splits (row index not written).
decode_train.to_csv(path_or_buf='train_aug_without_latex_tags.csv', index=False)
decode_test.to_csv(path_or_buf='test_aug_without_latex_tags.csv', index=False)