In [88]:
import csv
import operator
import nltk
import string
import pandas as pd
import sklearn
from tqdm import tqdm
import logging
import numpy as np
from gensim.models import word2vec, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize, sent_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from numpy import asarray, zeros
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import SimpleRNN, LSTM, GRU, Embedding, Dense, Dropout, CuDNNGRU, CuDNNLSTM, Bidirectional, Convolution1D, concatenate
from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Flatten, Activation, SpatialDropout1D, BatchNormalization, MaxPool1D, MaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [91]:
def read_file(filename):
    """Load a CSV file from the parent directory into a DataFrame of strings.

    The first non-blank row supplies the column names and every following
    non-blank row becomes one record. Values are kept exactly as the csv
    parser yields them (strings); no type inference is performed.

    Parameters
    ----------
    filename : str
        File name, opened as ``'../' + filename`` (i.e. relative to the
        parent of the current working directory), decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record, columns named after the header row.

    Raises
    ------
    ValueError
        If the file contains no rows at all (not even a header).
    """
    # newline='' hands line splitting to the csv module, so a quoted field
    # that contains a line break is parsed as a single field.
    with open(r'../' + filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, quotechar='"', delimiter=',',
                            quoting=csv.QUOTE_ALL, skipinitialspace=True)
        # Blank lines are returned as empty lists; skip them rather than
        # assuming that exactly the last line of the file is empty (which
        # would drop a real record when there is no trailing newline).
        rows = [row for row in reader if row]

    if not rows:
        raise ValueError('No rows found in CSV file: ' + filename)

    header, records = rows[0], rows[1:]
    return pd.DataFrame(records, columns=header)

# Lazily-built cache of the NLTK English stop words, shared by every call to
# preprocess() so the word list is loaded only once per kernel session.
_ENGLISH_STOPWORDS = None


def preprocess(text):
    """Lower-case ``text`` and remove English stop words.

    The text is stripped of surrounding whitespace, lower-cased and split on
    single spaces; tokens found in NLTK's English stop-word list are dropped
    and the remaining tokens are re-joined with single spaces. Because the
    split is on a single space, runs of several spaces yield empty tokens
    that are kept as-is.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text without stop words.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the corpus is comparatively slow; do it once instead of
        # once per row when this function is used with Series.apply.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    words = text.strip().lower().split(' ')
    return " ".join(word for word in words if word not in _ENGLISH_STOPWORDS)

def categorical_label(df):
    """Add one 0/1 indicator column per category to ``df``, in place.

    For each of 'EMPIRICAL', 'ENGINEERING', 'THEORETICAL' and 'OTHERS' a
    column of that name is created whose value is 1 when the name occurs as
    a substring of the row's 'Task 2' value, and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Task 2' column of strings. It is modified in place.

    Returns
    -------
    None
    """
    task_values = df['Task 2']
    for label in ('EMPIRICAL', 'ENGINEERING', 'THEORETICAL', 'OTHERS'):
        # Iterate over the values themselves rather than looking rows up by
        # position with .loc, so frames whose index is not 0..n-1 (e.g. after
        # filtering or a train/test split) are handled correctly.
        df[label] = [1 if label in value else 0 for value in task_values]
    
def _tokenize_column(column):
    """Tokenize, encode and pad one text column of train_df and test_df.

    A single Keras ``Tokenizer`` is fitted on the train and test texts
    together, so both splits share one vocabulary. Every text is padded to
    the length of the longest tokenized text across both splits.

    Parameters
    ----------
    column : str
        Name of the text column, present in both the module-level
        ``train_df`` and ``test_df``.

    Returns
    -------
    tuple
        ``(tokenized, train_sequences, test_sequences, max_len, word_index,
        vocab_size)`` where ``tokenized`` is the list of word lists for all
        texts (train followed by test).
    """
    train_texts = train_df[column]
    test_texts = test_df[column]
    corpus = pd.concat([train_texts, test_texts], ignore_index=True)

    # Fit the vocabulary on the full corpus and keep the word-level view.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    tokenized = [text_to_word_sequence(text) for text in corpus]

    # Encode each split as integer ids and pad to the longest text.
    max_len = max([len(tokens) for tokens in tokenized])
    train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
    test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

    # Keras word indices start at 1 (0 is the reserved padding index), hence +1.
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1

    return tokenized, train_sequences, test_sequences, max_len, word_index, vocab_size


def tokenize_title():
    """Tokenize the 'Title' column of train_df/test_df and print an overview.

    Returns
    -------
    tuple
        ``(new_titles, train_titles_sequences, test_titles_sequences,
        max_title_len, title_word_index, title_vocab_size)``.
    """
    (new_titles, train_titles_sequences, test_titles_sequences,
     max_title_len, title_word_index, title_vocab_size) = _tokenize_column('Title')

    # Print overview
    print('Title tokenized: \n{}\n'.format(new_titles[0]))
    print('Train titles sequences feed to Embedding layer: \n{}\n'.format(train_titles_sequences[0]))
    print('Max length of titles: {}\n'.format(max_title_len))
    print('Sample of word index: \n{}\n'.format(list(title_word_index.items())[:5]))
    print('Vocabulary size: ', title_vocab_size)

    return new_titles, train_titles_sequences, test_titles_sequences, max_title_len, title_word_index, title_vocab_size


def tokenize_abstract():
    """Tokenize the 'Abstract' column of train_df/test_df and print an overview.

    Returns
    -------
    tuple
        ``(new_abstracts, train_abstracts_sequences,
        test_abstracts_sequences, max_abstract_len, abstract_word_index,
        abstract_vocab_size)``.
    """
    (new_abstracts, train_abstracts_sequences, test_abstracts_sequences,
     max_abstract_len, abstract_word_index, abstract_vocab_size) = _tokenize_column('Abstract')

    # Print overview
    print('#############################################################\n\n')
    print('Abstract tokenized: \n{}\n'.format(new_abstracts[0]))
    print('Train Abstract sequences feed to Embedding layer: \n{}\n'.format(train_abstracts_sequences[0]))
    print('Max length of abtracts: {}\n'.format(max_abstract_len))
    print('Sample of Abstract word index: \n{}\n'.format(list(abstract_word_index.items())[:5]))
    print('Abstract vocabulary size: ', abstract_vocab_size)

    return new_abstracts, train_abstracts_sequences, test_abstracts_sequences, max_abstract_len, abstract_word_index, abstract_vocab_size

# Driver: loads the train/test CSVs into the notebook-wide globals `train_df`
# and `test_df`, which the tokenize_* functions and the later cells read.
if __name__ == "__main__":
    # Read both datasets (paths are resolved by read_file relative to '../').
    train_df = read_file('task2_trainset.csv')
    test_df = read_file('task2_public_testset.csv')
    categorical_label(train_df) # Adds the 0/1 label columns to train_df in place
    
    # The triple-quoted string below is disabled code kept as a no-op string
    # literal: optional stop-word removal for the 'Abstract' column.
    """
    #Use this if want to remove stopwords in Abstract
    train_df['Abstract'] = train_df['Abstract'].apply(preprocess)
    test_df['Abstract'] = test_df['Abstract'].apply(preprocess)
    """
    
    # Also disabled (no-op string literal): title/abstract tokenization via
    # tokenize_title() / tokenize_abstract(). The names it would assign are
    # therefore NOT defined after this cell runs.
    """
    
    
    new_titles, train_titles_sequences,\
    test_titles_sequences, max_title_len,\
    title_word_index, title_vocab_size = tokenize_title()
    
    new_abstracts, train_abstracts_sequences,\
    test_abstracts_sequences, max_abstract_len,\
    abstract_word_index, abstract_vocab_size = tokenize_abstract()
    """

In [95]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Build the train/test files consumed by the BERT classifier.
#
# Training frame layout (written without a header): id, label, alpha, text.
# Only the 'EMPIRICAL' label is exported here; presumably this cell is re-run
# with another label column for the other categories -- TODO confirm.
# 'alpha' is a constant filler column (looks like the CoLA-style layout that
# BERT's example scripts expect -- confirm).
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df_bert = pd.DataFrame({'id': train_df['Id'],
                        'label': train_df['EMPIRICAL'],
                        'alpha': ['a'] * len(train_df),
                        'text': train_df['Abstract'].replace(r'\$', '', regex=True)
                        })

# Optional held-out dev split (disabled):
#df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)

# Creating test dataframe according to BERT
df_test = test_df
df_bert_test = pd.DataFrame({'order_id': df_test['Id'],
                             'text': df_test['Abstract'].replace(r'\$', '', regex=True)})

# Saving dataframes to .tsv format as required by BERT.
# Note: the train file has no header row, the test file has one.
df_bert.to_csv('data/train.tsv', sep='\t', index=False, header=False)
#df_bert_dev.to_csv('data/dev.tsv', sep='\t', index=False, header=False)
df_bert_test.to_csv('data/test.tsv', sep='\t', index=False, header=True)

In [110]:
# Combine the per-label BERT outputs into the submission file.
# Each output file is read without a header, so its columns are labelled
# 0, 1, ...; idxmax(axis=1) therefore yields the column number holding the
# largest value in each row (assumes two columns, i.e. a 0/1 prediction --
# TODO confirm against the BERT output format).
bert_theoretical = pd.read_csv('./output/bert_theoretical.tsv', sep='\t', header=None)
bert_engineering = pd.read_csv('./output/bert_engineering.tsv', sep='\t', header=None)
bert_empirical = pd.read_csv('./output/bert_empirical.tsv', sep='\t', header=None)

# The frames are combined by index, so a row-count mismatch would silently
# introduce NaN rows; fail loudly instead.
for name, predictions in (('theoretical', bert_theoretical),
                          ('engineering', bert_engineering),
                          ('empirical', bert_empirical)):
    assert len(predictions) == len(test_df), (
        'bert_{} has {} rows but test_df has {}'.format(name, len(predictions), len(test_df)))

df_test = pd.DataFrame({
    'order_id': test_df['Id'],
    'THEORETICAL': bert_theoretical.idxmax(axis=1),
    'ENGINEERING': bert_engineering.idxmax(axis=1),
    'EMPIRICAL': bert_empirical.idxmax(axis=1),
})

# A row is OTHERS (1) exactly when none of the three labels was predicted.
# This must be an assignment; a bare `==` comparison leaves the column at 0.
label_columns = ['THEORETICAL', 'ENGINEERING', 'EMPIRICAL']
df_test['OTHERS'] = (df_test[label_columns] == 0).all(axis=1).astype(int)

df_test.to_csv('submission.csv', index=False, header=True)

In [109]:
# Number of test rows predicted as ENGINEERING (sanity check of the submission).
df_test['ENGINEERING'].sum()

10769