# Neural Network Approach for Document Categorization

use bi-directional LSTM with self attention

In [13]:
import re 
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Gensim
import gensim
# # pytorch 
import torch
import torch.nn as nn
import torch.optim as optim

[nltk_data] Downloading package stopwords to /home/sxia1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## given a corpus clean each text in the collection

In [20]:
#helper function for lemmatization
def get_wordnet_pos(tag):

    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('R'):
        return 'r'
    else:
        return 'n' #if not belong to any, default is noun, inclunding N

def lemmatize_with_pos(abstract_toekenized):
    abstract_tagged = nltk.pos_tag(abstract_toekenized)
    tags = list(map(lambda token: get_wordnet_pos(token[1]),abstract_tagged))
    abstract_lemmatized = list(map(lemmatizer.lemmatize,abstract_toekenized,tags))
    return abstract_lemmatized


#import nltk lemmatizer
lemmatizer = WordNetLemmatizer()

def prepare_token(text_tokenized_list,bigram):
    # remove stopwords and puntuation, 
    text_stopwords_removed = list(map(lambda abstract: list(filter(lambda word: word not in stop_words, abstract)),text_tokenized_list))

    #apply biagram model
    word_and_bigram_list = list(map(lambda abstract_cleaned: bigram[abstract_cleaned],text_stopwords_removed))

    # lemmatize
    abstract_lemmatized = list(map(lemmatize_with_pos,word_and_bigram_list))
    return abstract_lemmatized

def tokenize_prepare(df,update_bigram = False,**kwargs):
    # remove puncutations and number, then tokenize each text
    text_tokenized = list(map(lambda abstract_i:nltk.word_tokenize(re.sub(r'[^A-Za-z\s]','',abstract_i.lower())), df.Abstract))
    len(text_tokenized)
    df['n_words']=list(map(lambda x:len(x),text_tokenized))

    # Creating Bigram: find words frequently occur together
    if update_bigram:
        bigram = gensim.models.Phrases(text_tokenized, min_count=20, threshold=50) 
    else:
        bigram = kwargs.get('bigram',None) 
    #come back to adjust the threshold value: (cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold
    # bigram_freeze = bigram.freeze()
    abstract_lemmatized = prepare_token(text_tokenized,bigram)
    df['n_words_removed']=list(map(lambda x:len(x),abstract_lemmatized))
    # plot data
    fig,axs = plt.subplots(1,4,figsize=(25,5))
    categories = df.Domain.value_counts().index
    counts = df.Domain.value_counts().values
    axs[0].bar(categories, counts, width=0.5)
    axs[0].set_title("Domain Frquency")
    categories = df.area.value_counts().index
    counts = df.area.value_counts().values
    axs[1].bar(categories, counts, width=0.5)
    axs[1].set_title("Area Frquency")
    axs[2].hist(df.n_words)
    axs[2].set_title("abstract word count")
    axs[3].hist(df.n_words_removed,bins=30)
    axs[3].set_title("abstract without stop words word count")

    plt.show()
    return abstract_lemmatized,df,bigram



In [4]:
stop_words = stopwords.words('english')
# read-in pre-labled research apper abstracts
df_paper_raw = pd.read_excel("data/WebOfScienceData.xlsx",sheet_name="abstracts")
print(len(df_paper_raw))
df_paper = df_paper_raw.head(2000) #use first 2000
print(df_paper.Domain.unique())
topic2num = {topic:i for i,topic in enumerate(df.Domain.unique())}
print(topic2num)
df_paper['Domain_No'] = df_paper.Domain.map(topic2num)

abstract_lemmatized, df,bigram = tokenize_prepare(df_paper[:10],update_bigram=True)

topic2num = {topic:i for i,topic in enumerate(df.Domain.unique())}
print(topic2num)
df['Domain_No'] = df.Domain.map(topic2num)
df.head()

46985
['CS ' 'Medical ' 'Civil ' 'ECE ' 'biochemistry ' 'MAE ' 'Psychology  ']


In [None]:
# remove 

## Building our Nerual Network
1. input to the cell:<br>
    embedding: word2vec, since the vocabsize is about 20,000, not a good idea to use one-hot vector

In [38]:
class myBi_LSTM():
    def __init__(self, vocab_size, embedding_dim,hidden_dim,**kwargs):

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.net = nn.Sequential(
            nn.Embedding(vocab_size,self.embedding_dim),
            nn.LSTM(embedding_dim, self.hidden_dim,bidirectional=True),
            nn.Sigmoid()
        )

        self.loss = None # loss function

        super().__init__(**kwargs)

    def forward(self):
        

{'CS ': 0, 'Medical ': 1, 'Civil ': 2, 'ECE ': 3, 'biochemistry ': 4, 'MAE ': 5, 'Psychology  ': 6}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Domain_No'] = df.Domain.map(topic2num)


Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,n_words,n_words_removed,Domain_No
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...,243,137,0
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...,261,182,1
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...,125,77,2
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...,143,92,3
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema...",206,130,1
