======= For International Students ======= 
Write a program to construct a dictionary of the corpus EnglishDataset_Assignment1.txt (it contains 25,000 IMDB movie reviews). You have to do the following:
 - Preprocessing: tokenization, stopword removal, punctuation removal, and stemming (simple normalization if needed).
 - Sort terms by term frequency and draw a figure to prove that they follow Zipf's law (long-tail distribution).
 - Rank terms by global TF-IDF.
 - Save the result as a txt file.


# Preprocessing Data

## Import all needed packages
Please install all packages first

!!! Use `nltk.download()` to download all NLTK datasets to 'C:\nltk_data' if you don't have them yet

Please refer to this link: https://www.nltk.org/data.html

In [1]:
import csv
import operator
import nltk
import string
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.probability import FreqDist

## Read data

In [2]:
# Parse the CSV straight from the file handle (opened with newline='' as the
# csv module requires) so that quoted fields spanning several lines are
# handled by the csv parser itself.
with open('task2_trainset.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    rows = [row for row in reader if row]  # skip blank lines such as a trailing empty row

# The first row holds the column names; the remaining rows are the records.
data = pd.DataFrame(rows[1:], columns=rows[0]) #Transform to Pandas DataFrame




## Tokenization by using NLTK 

In [3]:
def tokenize_word(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

def lowercase(words):
    """Return the string form of ``words`` converted to lower case.

    The argument is passed through ``str()`` first, so non-string values
    are converted rather than rejected.
    """
    as_text = str(words)
    return as_text.lower()

# Tokenize the lower-cased title and abstract of every row.
# The column values are iterated directly, so the result does not depend on
# the DataFrame carrying a default 0..n-1 index.
data['Title_tokenized'] = [tokenize_word(lowercase(title)) for title in data['Title']]
data['Abstract_tokenized'] = [tokenize_word(lowercase(abstract)) for abstract in data['Abstract']]



## Normalization
1. Remove punctuation and whitespace words
2. Replace number
3. Remove stop words
4. Stemming
5. Lemmatization

In [4]:
def remove_punctuation_whitespace(words):
    """Strip punctuation and whitespace characters from every token.

    Each token has all characters found in ``string.punctuation`` or
    ``string.whitespace`` removed; tokens that end up empty are dropped.
    Returns a new list and leaves ``words`` untouched.
    """
    unwanted = string.punctuation + string.whitespace
    cleaned = []
    for token in words:
        kept = ''.join(ch for ch in token if ch not in unwanted)
        if kept != '':
            cleaned.append(kept)
    return cleaned

def replace_number(words): #Convert number to text. Eg: '1' to 'one'
    """Replace all-digit tokens with their spelled-out English form.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list in which every token for which ``str.isdigit()`` is true
        has been replaced by ``inflect``'s ``number_to_words`` output; all
        other tokens are kept unchanged.
    """
    words = list(words)
    # Nothing to convert: skip building the inflect engine altogether.
    if not any(w.isdigit() for w in words):
        return words
    # Imported here because the module is not imported at the top of the file.
    import inflect
    p = inflect.engine()
    return [p.number_to_words(w) if w.isdigit() else w for w in words]

def remove_stopwords(words): #Remove stopwords by using nltk.corpus.stopwords
    """Return the tokens of ``words`` that are not English stop words.

    The stop-word list comes from ``nltk.corpus.stopwords``. Comparison is
    exact, so tokens are expected to be lower-cased already if
    case-insensitive filtering is wanted.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list for every token.
    stop_words = frozenset(stopwords.words('english'))
    return [w for w in words if w not in stop_words]

# I decided to ignore stemming step because it caused a lot of bugs like: 'comedy' -> 'comedi'

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``.

    NOTE(review): the remark that accompanied this function said Porter
    seems to be better than Lancaster, yet ``LancasterStemmer`` is the
    stemmer actually used - confirm which one was intended.
    """
    lancaster = LancasterStemmer()
    return list(map(lancaster.stem, words))

def lemmatize_verbs(words, pos='v'):
    """Lemmatize every token with the WordNet lemmatizer.

    Args:
        words: iterable of string tokens.
        pos: WordNet part-of-speech tag handed to ``lemmatize``. Defaults to
            ``'v'`` (verb) to match the function name; without an explicit
            tag the lemmatizer treats every token as a noun.

    Returns:
        A new list with the lemma of each token.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w, pos=pos) for w in words]

def normalize(data):
    """Normalize every tokenized document in ``data``.

    Args:
        data: iterable of token lists (one list per document).

    Returns:
        A list of token lists with punctuation and whitespace characters
        stripped and empty tokens dropped.
    """
    # Only punctuation/whitespace removal is active. Stop-word removal,
    # number replacement, stemming and lemmatization are currently disabled;
    # stemming in particular was dropped because it mangled words
    # (e.g. 'comedy' -> 'comedi').
    return [remove_punctuation_whitespace(words) for words in data]

if __name__ == "__main__":
    # Normalize both tokenized columns in place, title first and then abstract.
    for column in ('Title_tokenized', 'Abstract_tokenized'):
        data[column] = normalize(data[column])


## Word Embedding

In [5]:
import logging
from gensim.models import word2vec

if __name__ == "__main__":
    # Route gensim's progress messages to the console with timestamps.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)

    # Train 300-dimensional word vectors on the tokenized abstracts and
    # persist the model to disk.
    sentences = data['Abstract_tokenized']
    model = word2vec.Word2Vec(sentences, size=300, workers=12)
    model.save('WE_CBOW.model')

2019-10-27 22:45:38,712 : INFO : collecting all words and their counts
2019-10-27 22:45:38,715 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-10-27 22:45:38,904 : INFO : collected 35641 word types from a corpus of 1076212 raw words and 7000 sentences
2019-10-27 22:45:38,905 : INFO : Loading a fresh vocabulary
2019-10-27 22:45:38,929 : INFO : min_count=5 retains 10404 unique words (29% of original 35641, drops 25237)
2019-10-27 22:45:38,929 : INFO : min_count=5 leaves 1035311 word corpus (96% of original 1076212, drops 40901)
2019-10-27 22:45:38,962 : INFO : deleting the raw counts dictionary of 35641 items
2019-10-27 22:45:38,963 : INFO : sample=0.001 downsamples 31 most-common words
2019-10-27 22:45:38,964 : INFO : downsampling leaves estimated 789566 word corpus (76.3% of prior 1035311)
2019-10-27 22:45:38,989 : INFO : estimated required memory for 10404 words and 300 dimensions: 30171600 bytes
2019-10-27 22:45:38,989 : INFO : resetting layer weights


2019-10-27 22:45:42,028 : INFO : saved WE_CBOW.model


In [6]:
# Show the distinct label strings that occur in the 'Task 2' column.
set(data['Task 2'])

{'EMPIRICAL',
 'ENGINEERING',
 'ENGINEERING EMPIRICAL',
 'OTHERS',
 'THEORETICAL',
 'THEORETICAL EMPIRICAL',
 'THEORETICAL ENGINEERING',
 'THEORETICAL ENGINEERING EMPIRICAL'}

In [7]:
# Sanity check: substring test of 'THEORETICAL' against the first row's label string.
'THEORETICAL' in data['Task 2'][0]

True

In [8]:
# Build one 0/1 indicator column per label. A row's 'Task 2' value may list
# several labels in one string (e.g. 'THEORETICAL EMPIRICAL'), so a substring
# test is used to detect each label.
for label in ('EMPIRICAL', 'ENGINEERING', 'THEORETICAL', 'OTHERS'):
    data[label] = [1 if label in labels else 0 for labels in data['Task 2']]

In [9]:
# Keyed vectors of the trained Word2Vec model.
word_vectors = model.wv
# Keys view over the model's vocabulary.
# NOTE(review): a later cell calls vocab.most_common(), which a keys view does
# not provide (see the AttributeError recorded in the notebook output); that
# cell expects the Counter that is also bound to the name `vocab`.
vocab = word_vectors.vocab.keys()

In [None]:
# Announce the start of the text preprocessing cell.
print('Processing text dataset')
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
from string import punctuation, ascii_lowercase
import regex as re
from tqdm import tqdm

# replace urls
# The pattern is assembled from adjacent raw-string literals: a backslash
# followed by a line break inside a single raw string becomes part of the
# pattern itself (a literal newline plus the indentation) and would also
# swallow the escaped dot in front of the top-level domain.
re_url = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+"
                    r"\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                    re.MULTILINE|re.UNICODE)
# replace ips
# Raw string so that \d and \. reach the regex engine unchanged.
re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# setup tokenizer (shared by text_to_wordlist below)
tokenizer = WordPunctTokenizer()

# Running token-frequency table; text_to_wordlist() updates it on every call.
vocab = Counter()

def text_to_wordlist(text, lower=False):
    """Turn a raw text into a list of tokens and count them in ``vocab``.

    URL matches are replaced by the placeholder "URL" and IP-address matches
    by "IPADDRESS" before the module-level ``tokenizer`` splits the text.

    Args:
        text: the raw string to tokenize.
        lower: when true, every token is lower-cased before counting.

    Returns:
        The list of tokens. As a side effect the module-level ``vocab``
        counter is updated with these tokens.
    """
    # Mask URLs first, then IP addresses.
    masked = re_url.sub("URL", text)
    masked = re_ip.sub("IPADDRESS", masked)

    tokens = tokenizer.tokenize(masked)
    if lower:
        tokens = [token.lower() for token in tokens]

    vocab.update(tokens)
    return tokens

def process_comments(list_sentences, lower=False):
    """Tokenize every text in ``list_sentences`` and show a progress bar.

    Each text is handed to ``text_to_wordlist`` with the given ``lower``
    flag; the resulting token lists are returned in input order.
    """
    return [text_to_wordlist(sentence, lower=lower) for sentence in tqdm(list_sentences)]


# Collect the raw texts of the train and test frames; missing texts are
# replaced by the placeholder string "NAN_WORD".
# NOTE(review): `train_df` and `test_df` are not defined in the visible cells
# of this notebook - confirm where they come from before running this cell.
list_sentences_train = list(train_df["comment_text"].fillna("NAN_WORD").values)
list_sentences_test = list(test_df["comment_text"].fillna("NAN_WORD").values)

# Tokenize train and test texts together (lower-cased); the train part comes first.
comments = process_comments(list_sentences_train + list_sentences_test, lower=True)

In [10]:
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

# Index 0 is shared by padding and out-of-vocabulary tokens, so only
# MAX_NB_WORDS - 1 real words fit into an index space of size MAX_NB_WORDS.
# Keeping every index below MAX_NB_WORDS keeps it inside the embedding matrix.
word_index = {word: idx
              for idx, (word, _count) in enumerate(vocab.most_common(MAX_NB_WORDS - 1), start=1)}

# `comments` holds the train texts first, followed by the test texts.
n_train = len(list_sentences_train)
sequences = [[word_index.get(t, 0) for t in comment]
             for comment in comments[:n_train]]
test_sequences = [[word_index.get(t, 0) for t in comment]
                  for comment in comments[n_train:]]

# pad
# Stored as `data_new` (the name the training cells pass to model.fit) so the
# `data` DataFrame built earlier is not overwritten.
data_new = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH,
                         padding="pre", truncating="post")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
print('Shape of data tensor:', data_new.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)

Using TensorFlow backend.


AttributeError: 'dict_keys' object has no attribute 'most_common'

In [None]:
import numpy as np

# Take the width from the trained vectors themselves: with a mismatching
# width every row assignment below fails and the matrix stays random.
WV_DIM = word_vectors.vector_size
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    # The matrix has nb_words rows; skip indices that do not fit.
    if i >= nb_words:
        continue
    try:
        wv_matrix[i] = word_vectors[word]
    except KeyError:
        # Words missing from the embedding keep their random initialization.
        pass

# SETUP THE COMMENT CLASSIFIER

In [None]:
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

# Frozen embedding layer initialised from wv_matrix.
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

# Inputs
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# Bidirectional LSTM (CuDNN implementation) on top of spatial dropout
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(embedded_sequences)

# Output
# NOTE(review): a single sigmoid unit is produced here, while `y` is built
# from six label columns in an earlier cell - confirm the target shape.
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(1, activation='sigmoid')(x)

# build the model
# Note: this rebinds `model`, which previously held the Word2Vec model.
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=[])

In [None]:
# Train for 10 epochs, holding out 10% of the samples for validation.
# `data_new` is expected to be the padded training matrix produced by
# pad_sequences - verify that it is defined before running this cell.
hist = model.fit([data_new], y, validation_split=0.1,epochs=10, batch_size=256, shuffle=True)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")

# Plot training and validation loss per epoch; the curves are labelled so
# they can be told apart in the figure.
history = pd.DataFrame(hist.history)
plt.figure(figsize=(12,12))
plt.plot(history["loss"], label="loss")
plt.plot(history["val_loss"], label="val_loss")
plt.title("Loss with pretrained word vectors")
plt.xlabel("epoch")
plt.legend()
plt.show()

In [None]:
# Show the 20 most frequent tokens; requires `vocab` to be the Counter filled by text_to_wordlist().
vocab.most_common(20)

In [None]:
# Same architecture as the pretrained-vector model, but the embedding layer is
# created without `weights=[wv_matrix]`, i.e. with Keras' own initialization.
# NOTE(review): with trainable=False this embedding is never updated during
# training - confirm that a frozen random embedding is the intended baseline.
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

# Inputs
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# Bidirectional LSTM (CuDNN implementation) on top of spatial dropout
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(embedded_sequences)

# Output
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(1, activation='sigmoid')(x)

# build the model
# Metrics are configured in compile(); the Model constructor does not accept
# a `metrics` argument.
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=['accuracy'])

In [None]:
# Train the second model with the same settings as the first one
# (10 epochs, 10% validation split, batches of 256, shuffled).
# `data_new` is expected to be the padded training matrix produced by
# pad_sequences - verify that it is defined before running this cell.
hist = model.fit([data_new], y, validation_split=0.1,
                 epochs=10, batch_size=256, shuffle=True)

In [None]:
# Plot training and validation loss per epoch; the curves are labelled so
# they can be told apart in the figure.
history = pd.DataFrame(hist.history)
plt.figure(figsize=(12,12))
plt.plot(history["loss"], label="loss")
plt.plot(history["val_loss"], label="val_loss")
plt.title("Loss with random word vectors")
plt.xlabel("epoch")
plt.legend()
plt.show()

In [None]:
# Scratch cell: compare "first two elements" with "all but the last two".
a = list(range(1, 7))
for window in (a[:2], a[:-2]):
    print(window)