In [1]:
import os
import pickle
import pandas as pd
from tokenizer import NLTKTokenizer

# The data directory is resolved against the current working directory, so the
# notebook has to be run from the folder that contains `data/`.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, 'data')


In [3]:
def Remove_Redundant_Columns(dataset):
    '''
    Remove the Title, Categories, Created Date and Authors columns in place

    The frame is modified directly and nothing is returned. Columns that are
    already absent are skipped, so calling this again on the same frame
    (e.g. when a notebook cell is re-run) does not raise KeyError.

    Args:
        dataset(DataFrame) : original dataset, mutated in place
    '''
    redundant_columns = ['Title', 'Categories', 'Created Date', 'Authors']
    # errors='ignore' is what makes the call repeatable on an already-cleaned frame
    dataset.drop(columns=redundant_columns, inplace=True, errors='ignore')


## Read in `task1_trainset.csv` and remove redundant columns

In [4]:
# dtype = str loads every column as text, so nothing is parsed as a number or date.
dataset = pd.read_csv( os.path.join( DATA_PATH, 'task1_trainset.csv' ), dtype = str )

In [5]:
# Preview the first rows of the training set as loaded, with all original columns.
dataset.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 1
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


In [6]:
# Drops the metadata columns from `dataset` in place; the call returns nothing.
Remove_Redundant_Columns(dataset)

In [7]:
# Preview the frame again to confirm the four metadata columns are gone.
dataset.head()

Unnamed: 0,Id,Abstract,Task 1
0,D00001,Rapid popularity of Internet of Things (IoT) a...,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,"In this paper, we address the problem of compu...",OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,High quality upsampling of sparse 3D point clo...,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,Internet is the main source of information now...,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,Automated Facial Expression Recognition (FER) ...,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


## Split each abstract into sentences on the `$$$` delimiter

In [8]:
def SplitSent(doc, delimiter='$$$'):
    '''
    Split one abstract into its sentences

    Args:
        doc(str) : abstract text whose sentences are joined by `delimiter`
        delimiter(str) : sentence separator, '$$$' by default

    Returns:
        list of str : the sentences in their original order; text that does
        not contain the delimiter comes back as a one-element list
    '''
    return doc.split(delimiter)

In [9]:
# Only split values that are still raw strings: re-running this cell after the
# column already holds sentence lists is then a no-op instead of an
# AttributeError (lists have no .split).
dataset['Abstract'] = dataset['Abstract'].apply(
    lambda doc: SplitSent(doc) if isinstance(doc, str) else doc
)

In [10]:
# Show the sentence list stored for the row labelled 0.
dataset.loc[0, 'Abstract']

['Rapid popularity of Internet of Things (IoT) and cloud computing permits neuroscientists to collect multilevel and multichannel brain data to better understand brain functions, diagnose diseases, and devise treatments.',
 'To ensure secure and reliable data communication between end-to-end (E2E) devices supported by current IoT and cloud infrastructure, trust management is needed at the IoT and user ends.',
 'This paper introduces a Neuro-Fuzzy based Brain-inspired trust management model (TMM) to secure IoT devices and relay nodes, and to ensure data reliability.',
 'The proposed TMM utilizes node behavioral trust and data trust estimated using Adaptive Neuro-Fuzzy Inference System and weighted-additive methods respectively to assess the nodes trustworthiness.',
 'In contrast to the existing fuzzy based TMMs, the NS2 simulation results confirm the robustness and accuracy of the proposed TMM in identifying malicious nodes in the communication network.',
 'With the growing usage of clo

In [11]:
# Create the tokenizer (class imported from the `tokenizer` module) with its default arguments.
tokenizer = NLTKTokenizer()

In [13]:
# Hand each abstract's sentence list to the tokenizer, one abstract per call.
for sentences in dataset['Abstract']:
    tokenizer.build_dict(sentences)

## Test Vocabulary

In [16]:
# Encode the sentences of the first abstract; the printed result is a nested
# list with one list of integer ids per sentence.
# NOTE(review): every id list in the recorded output ends with 3 -- presumably an
# end-of-sentence marker added by the tokenizer; confirm against NLTKTokenizer.
a = tokenizer.encode(dataset['Abstract'][0])
print(a)

[[39124, 41958, 42450, 42391, 42450, 42311, 42440, 42313, 42442, 42476, 42249, 42244, 41286, 17694, 42446, 42315, 39770, 42476, 12868, 42447, 42529, 42446, 42539, 42432, 42447, 42186, 42459, 42335, 41717, 42459, 42476, 42341, 42251, 42455, 3], [42309, 41831, 42398, 42476, 42382, 42529, 42298, 42544, 42397, 42440, 30509, 42442, 42454, 40321, 42494, 42487, 42313, 42476, 42249, 42364, 42459, 40618, 42051, 42444, 42466, 42507, 42508, 42313, 42476, 42438, 39992, 42455, 3], [42487, 42493, 41920, 42457, 58, 42502, 60, 40618, 42051, 42511, 42440, 12490, 42442, 42446, 42398, 42313, 42454, 42476, 42453, 42454, 42459, 42476, 42446, 41831, 42529, 42552, 42455, 3], [42526, 42522, 12490, 42307, 42482, 41791, 40618, 42476, 42529, 40618, 42160, 42470, 41905, 58, 37732, 41938, 42476, 76, 42448, 42402, 42446, 42507, 42508, 42454, 36617, 42455, 3], [42456, 42393, 42446, 42508, 42518, 41759, 42502, 85, 42459, 42508, 30307, 42448, 42527, 42341, 42508, 42079, 42476, 42458, 42450, 42508, 42522, 12490, 42491,

In [17]:
# Round-trip check: map the encoded ids back to text.
# NOTE(review): the recorded output does not reproduce the source abstract
# (sentence 1 decodes to "present Data-driven of and of Things ( IoT ) ..." instead of
# "Rapid popularity of Internet of Things (IoT) ..."), so the encode and decode
# mappings of NLTKTokenizer appear to disagree -- investigate before relying on the ids.
tokenizer.decode(a)

['present Data-driven of and of Things ( IoT ) and important and this of to systems We and Vehicles increase that to compare the increase We , Buffalo Recent , and contribution in . ',
 'Internet The Data and of that order between Big ( as ) channels machine we be IoT and important are , demanded , is messages Because the IoT and plays Blondie . ',
 'be paper routing a Neuro-Fuzzy based Brain-inspired demanded , channel ( the ) to Data IoT channels and relay channels , and to The that reliability . ',
 'The also the this remain of demanded and that demanded with or present Neuro-Fuzzy We We and weighted-additive bandwidth effects to Because the channels discussions . ',
 'In Transportation to the obtain TT based TMMs , the resource bandwidth results contribution the rectangular and scheme of the also the in designed of channels in the order LDPC . ',
 'deep the term mechanics of important based IoT Dense in and two-way , metadata the also the other the obtain are will detection Data an