In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import copy

from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils


# English stop words; tokenize(with_stopwords=False) filters against this list.
stop_words = stopwords.words('english')

# Maps each token of the GloVe file to its float32 vector (the last 300
# whitespace-separated fields of the line).
em_dict = {}
# The context manager closes the file even if parsing a line raises.
with open('glove.840B.300d.txt', errors='ignore', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        # Everything before the last 300 fields is the token; if it was split
        # into several fields they are concatenated without a separator.
        w = ''.join(values[:-300])
        em_dict[w] = np.asarray(values[-300:], dtype=np.float32)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
1859885it [02:18, 11250.41it/s]

In [None]:
import pickle

def break_hashtag(text):
    """Expand every ``#CamelCase`` hashtag in ``text`` into separate words.

    The text is split around hashtags (``#`` followed by word characters).
    For each hashtag the leading ``#`` is dropped and a space is inserted in
    front of every uppercase character; other pieces are left untouched.
    All pieces are then joined with a single space, so the text surrounding
    a hashtag gains an extra space on each side.

    Args:
        text: Input string.

    Returns:
        The string with hashtags expanded.
    """
    def expand(tag):
        # Skip the '#', start a new word at each uppercase character, and
        # trim the leading space produced when the tag starts uppercase.
        spaced = ''.join(' ' + ch if ch.isupper() else ch for ch in tag[1:])
        return spaced.strip()

    pieces = re.split(r'(#\w+)', text)
    return ' '.join(
        expand(piece) if re.match(r'#\w+', piece) else piece
        for piece in pieces
    )

def tokenize(text, with_stopwords=True):
    """Split ``text`` into lowercase word tokens.

    Hashtags are expanded with ``break_hashtag``, every non-word character is
    replaced by a space, and the result is tokenized with
    ``nltk.word_tokenize`` and lowercased.

    Args:
        text: Input string.
        with_stopwords: When True (default) all tokens are returned as is.
            When False, tokens found in ``stop_words`` are dropped and the
            remaining ones are lemmatized with ``WordNetLemmatizer``.

    Returns:
        A list of tokens. With ``with_stopwords=False`` the unfiltered,
        unlemmatized tokens are returned if filtering leaves nothing.
    """
    expanded = break_hashtag(text)
    cleaned = re.sub(r'[^\w]', ' ', expanded)
    lowered = [tok.lower() for tok in nltk.word_tokenize(cleaned)]

    if with_stopwords:
        return lowered

    lemmatizer = WordNetLemmatizer()
    content = [
        lemmatizer.lemmatize(tok)
        for tok in lowered
        if re.match(r"\w+", tok) and tok not in stop_words
    ]
    # Fall back to the plain tokens when every token was a stop word.
    return content if len(content) > 0 else lowered

def tokenize_synonyms(text):
    """Replace each token of ``text`` by a WordNet synonym.

    For every token produced by ``tokenize``, the lowercased name of the
    first lemma of each of its synsets is collected. The alphabetically
    smallest of those names is split on ``_`` and appended to the result.
    Tokens without synsets contribute nothing.

    Args:
        text: Input string.

    Returns:
        A flat list of synonym words.
    """
    result = []
    for token in tokenize(text):
        first_lemmas = [
            synset.lemmas()[0].name().lower() for synset in wn.synsets(token)
        ]
        if first_lemmas:
            # The smallest name is the one a sort would place first.
            result.extend(min(first_lemmas).split("_"))
    return result

def tokenize_hypernyms(text):
    """Replace each token of ``text`` by a WordNet hypernym.

    For every token produced by ``tokenize``, the lowercased name of the
    first lemma of every hypernym of every synset is collected. The
    alphabetically smallest of those names is split on ``_`` and appended to
    the result. Tokens without hypernyms contribute nothing.

    Args:
        text: Input string.

    Returns:
        A flat list of hypernym words.
    """
    result = []
    for token in tokenize(text):
        hypernym_names = [
            hyper.lemmas()[0].name().lower()
            for synset in wn.synsets(token)
            for hyper in synset.hypernyms()
        ]
        if hypernym_names:
            # The smallest name is the one a sort would place first.
            result.extend(min(hypernym_names).split("_"))
    return result




class Bigram_Trigram_Tokenizer:
    """Tokenizer that merges adjacent words into bigram tokens by PMI.

    The vocabulary table is read from 'bigram_trigram_vocab_PMI.csv'. The
    methods use three of its columns: 'ngram' (space-separated words), 'PMI'
    and 'token'.
    """

    def __init__(self):
        self.bigram_trigram_vocab = pd.read_csv('bigram_trigram_vocab_PMI.csv')

    def get_PMI_for_word(self, word):
        """Return the PMI of the first row whose 'ngram' equals ``word``, else 0."""
        pmi = self.bigram_trigram_vocab[self.bigram_trigram_vocab['ngram']
                                        == word]['PMI'].values
        if len(pmi) == 0:
            return 0

        return pmi[0]

    def tokenize_with_bigrams(self, text):
        """Tokenize ``text`` and merge adjacent word pairs with positive PMI.

        Within a span the pair with the highest PMI is merged first (as
        ``w1_w2``), then the words to its left and right are processed the
        same way. A span with no positive-PMI pair is emitted as plain
        unigrams. Every word appears in exactly one output token.

        Args:
            text: Input string, tokenized with ``tokenize``.

        Returns:
            The token list produced by ``nltk.word_tokenize`` on the merged
            string.
        """
        unigrams = tokenize(text)
        # bigrams[i] joins unigrams[i] and unigrams[i + 1].
        bigrams = [' '.join(pair) for pair in zip(unigrams, unigrams[1:])]
        bigrams_pmi = [self.get_PMI_for_word(bigram) for bigram in bigrams]

        def helper(left_start, right_end):
            """Render unigrams[left_start:right_end] with high-PMI pairs merged."""
            if left_start >= right_end:
                return ''
            # A bigram starting at i consumes unigrams i and i + 1, so only
            # starts below right_end - 1 stay inside this span. Looking one
            # further would re-use a word that belongs to the neighbouring
            # span and emit it twice.
            last_start = right_end - 1
            if left_start >= last_start:
                return unigrams[left_start]
            max_bigram_arg = int(
                np.argmax(bigrams_pmi[left_start:last_start])) + left_start
            if bigrams_pmi[max_bigram_arg] > 0:
                left = helper(left_start, max_bigram_arg)
                right = helper(max_bigram_arg + 2, right_end)
                bi_unigram = '_'.join(bigrams[max_bigram_arg].split(' '))
                return ' '.join([left, bi_unigram, right])
            return ' '.join(unigrams[left_start:right_end])

        ret = helper(0, len(unigrams))
        return nltk.word_tokenize(ret)

    def get_bigram_trigram_token_list(self):
        """Return the 'token' column of the vocabulary as an array."""
        return self.bigram_trigram_vocab['token'].values

    def get_bigram_token_list(self):
        """Return the 'token' values of rows whose 'ngram' has exactly two words."""
        vocab = self.bigram_trigram_vocab
        word_counts = vocab['ngram'].apply(lambda x: len(x.split(' ')))
        return vocab[word_counts == 2]['token'].values

    def get_bigram_glove_embeddings(self):
        """Load and return the object pickled in 'mitten_bigram_dict_300d_515_10000.pkl'."""
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open('mitten_bigram_dict_300d_515_10000.pkl', 'rb') as handle:
            return pickle.load(handle)

# Shared tokenizer instance; constructing it reads 'bigram_trigram_vocab_PMI.csv'.
BigramTokenizer = Bigram_Trigram_Tokenizer()
# Object unpickled from 'mitten_bigram_dict_300d_515_10000.pkl'.
bigram_dict = BigramTokenizer.get_bigram_glove_embeddings()

# Transform a headline into a single GloVe-based vector. The tokenizer used inside can be swapped for one of the alternatives.
def preprocess_X(s):
    """Embed a headline as the L2-normalised sum of its word vectors.

    The input is converted to a lowercase string and tokenized with
    ``tokenize``; tokens missing from ``em_dict`` are ignored. To try another
    tokenization, replace the ``tokenize`` call with ``tokenize_hypernyms`` or
    ``BigramTokenizer.tokenize_with_bigrams``.

    Args:
        s: Headline; any object, it is passed through ``str``.

    Returns:
        A 1-D array: the sum of the found vectors divided by its Euclidean
        norm, or ``np.zeros(300)`` when no token has a vector or the sum has
        zero norm (instead of dividing by zero).
    """
    words = tokenize(str(s).lower(), with_stopwords=True)

    vectors = [em_dict[w] for w in words if w in em_dict]
    if not vectors:
        return np.zeros(300)

    agg = np.array(vectors).sum(axis=0)
    norm = np.sqrt((agg ** 2).sum())
    if norm == 0:
        # Vectors cancelled out or were all zero; avoid NaN features.
        return np.zeros(300)
    return agg / norm

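# Experimental variant of preprocess_X: investigates how the GloVe vectors affect results by keeping only the word vectors whose cosine similarity to the headline's summed vector is at least 0.5.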
def preprocess_X_test(s):
    """Embed a headline using only the word vectors aligned with its overall direction.

    The headline is lowercased and tokenized with ``tokenize``. The vectors
    of the tokens found in ``em_dict`` are summed; each vector whose cosine
    similarity with that sum is at least 0.5 is kept, and the kept vectors
    are summed and L2-normalised.

    Args:
        s: Headline; any object, it is passed through ``str``.

    Returns:
        A 1-D array, or ``np.zeros(300)`` when no token has a vector, no
        vector passes the threshold, or a sum has zero norm.
    """
    words = tokenize(str(s).lower())
    vectors = [em_dict[w] for w in words if w in em_dict]
    if not vectors:
        return np.zeros(300)

    matrix = np.array(vectors)
    agg = matrix.sum(axis=0)
    denom = np.sqrt((agg ** 2).sum())
    if denom == 0:
        # No direction to compare against; nothing can pass the threshold.
        return np.zeros(300)

    length = matrix.shape[1]
    # Loop-invariant: unit-length aggregate and its norm.
    vB = agg / denom
    norm_b = np.sqrt(np.dot(vB, vB))

    kept = []
    for vector in vectors:
        # The rescaling does not change the cosine mathematically; it is kept
        # so the threshold decision matches earlier runs exactly.
        vA = vector * length / denom
        norm_a = np.sqrt(np.dot(vA, vA))
        if norm_a == 0:
            # A zero vector has no direction and never passes the threshold.
            continue
        cos = np.dot(vA, vB) / (norm_a * norm_b)
        if cos >= 0.5:
            kept.append(vector)

    if not kept:
        return np.zeros(300)

    agg = np.array(kept).sum(axis=0)
    norm = np.sqrt((agg ** 2).sum())
    if norm == 0:
        return np.zeros(300)
    return agg / norm

In [None]:
#Original imbalanced dataset

df = pd.read_csv('train_test_split_dataset.csv')

# The 'phase' column assigns every row to train, dev or test.
train = df.loc[df['phase'] == 'train']
dev = df.loc[df['phase'] == 'dev']
test = df.loc[df['phase'] == 'test']

lbl_enc = preprocessing.LabelEncoder()

# The encoder is fitted on the training categories only.
y = lbl_enc.fit_transform(train.category.values)
# Number of distinct training categories; sizes the one-hot targets and the
# output layer so both always agree with the encoder.
n_classes = len(lbl_enc.classes_)

X_train = train.headline.values
X_valid = dev.headline.values
X_test = test.headline.values

Y_train = lbl_enc.transform(train.category.values)
Y_valid = lbl_enc.transform(dev.category.values)
y_test_true = lbl_enc.transform(test.category.values)

scaler = preprocessing.StandardScaler()

# Transform integer class ids to one-hot encoding. num_classes is passed
# explicitly: otherwise the width is inferred from the largest id present,
# which breaks if a split happens to miss the last class.
Y_train = np_utils.to_categorical(Y_train, num_classes=n_classes)

Y_valid = np_utils.to_categorical(Y_valid, num_classes=n_classes)

# Perform preprocessing for x and transform to np array.
# The scaler is fitted on the training features only.
X_train = [preprocess_X(x) for x in tqdm(X_train)]
X_train = np.array(X_train)
X_train = scaler.fit_transform(X_train)

X_valid = [preprocess_X(x) for x in tqdm(X_valid)]
X_valid = np.array(X_valid)
X_valid = scaler.transform(X_valid)

model = Sequential(
    [
        Dense(300, input_dim=300, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),
        Dense(300, activation='relu'),
        Dropout(0.4),
        Dense(300, activation='relu'),
        Dropout(0.4),
        BatchNormalization(),
        Dense(n_classes, activation='softmax')
    ]
)

# Stop once the validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X_train, y=Y_train, batch_size=64,
          epochs=100, verbose=1, validation_data=(X_valid, Y_valid), callbacks=[es])



In [None]:
X_test = test.headline.values

# Same feature pipeline as for training, reusing the already-fitted scaler.
X_test = [preprocess_X(x) for x in tqdm(X_test)]
X_test = np.array(X_test)
X_test = scaler.transform(X_test)

# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases; for a softmax output it is the argmax of the probabilities.
y_test_pred = np.argmax(model.predict(X_test), axis=1)
print(y_test_pred)
print(accuracy_score(y_test_true, y_test_pred))

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the test split. Both arguments must hold encoded
# class ids, so run this before predictions are decoded back to labels.
print(f1_score(y_test_true, y_test_pred,average = 'macro'))

In [None]:
# Decode into a separate name: y_test_pred keeps the integer class ids, so
# this cell and the metric cells can be re-run in any order.
y_test_pred_labels = lbl_enc.inverse_transform(y_test_pred)

test = test.assign(preds = y_test_pred_labels)

test.to_csv("simpleNN.csv")


In [None]:
# Number of training rows per category.
print(train.category.value_counts())

In [None]:
# CSV that stores the dataset together with its 'phase' (train/dev/test) column.
TRAIN_TEST_SPLIT_FILE = 'train_test_split_dataset.csv'
# Categories that are sampled when a split is balanced.
CATEGORY_SUBSET = [
    "CRIME",
    "RELIGION",
    "TECH",
    "MONEY",
    "FOOD & DRINK",
    "SPORTS",
    "TRAVEL",
    "WOMEN",
    "STYLE",
    "ENTERTAINMENT",
]
# os.path.exists is used to detect an already-written split file.
import os
def _oversample_per_category(frame, count):
    """Draw ``count`` rows with replacement from each CATEGORY_SUBSET category of ``frame``."""
    samples = [
        frame[frame['category'] == cat].sample(count, replace=True)
        for cat in CATEGORY_SUBSET
    ]
    return pd.concat(samples, axis=0)


def _load_or_create_split(percent_train, percent_dev):
    """Return the dataset with its 'phase' column, writing the split file if it is missing.

    When TRAIN_TEST_SPLIT_FILE exists it is read and returned unchanged, so
    the percentages only matter the first time. Otherwise ``get_dataset()``
    (expected to be defined elsewhere in the notebook) is shuffled, the first
    ``percent_train`` share becomes 'train', the next ``percent_dev`` share
    'dev' and the remainder 'test'; the result is written to the file.
    """
    if os.path.exists(TRAIN_TEST_SPLIT_FILE):
        return pd.read_csv(TRAIN_TEST_SPLIT_FILE)

    data = get_dataset()
    total = len(data)
    train_num = int(total * percent_train)
    dev_num = int(total * percent_dev)
    rnd_ind = np.arange(total)
    np.random.shuffle(rnd_ind)

    # After reset_index the row labels are 0..total-1, so the shuffled
    # positions can be used directly as labels.
    data = data.reset_index()
    data['phase'] = 'test'
    data.loc[rnd_ind[:train_num], 'phase'] = 'train'
    data.loc[rnd_ind[train_num:train_num + dev_num], 'phase'] = 'dev'

    data.to_csv(TRAIN_TEST_SPLIT_FILE, index=False)
    return data


def balanced_train_test_split(percent_train=0.7, percent_dev=0.1, percent_test=0.2, count=4000):
    """Return the dataset with a class-balanced training split.

    Args:
        percent_train: Share of rows assigned to 'train' when the split file
            has to be created.
        percent_dev: Share of rows assigned to 'dev' when the split file has
            to be created.
        percent_test: Accepted for interface compatibility; the test split is
            whatever remains after train and dev.
        count: Rows sampled (with replacement) per category for training.

    Returns:
        A DataFrame with the balanced training rows followed by the
        untouched dev and test rows.
    """
    data = _load_or_create_split(percent_train, percent_dev)
    train_data = _oversample_per_category(data[data['phase'] == 'train'], count)
    other_data = data[data['phase'] != 'train']
    return pd.concat([train_data, other_data], axis=0)


def new_balanced_train_test_split(percent_train=0.7, percent_dev=0.1, percent_test=0.2, count=4000):
    """Return the dataset with class-balanced training and test splits.

    Args:
        percent_train: Share of rows assigned to 'train' when the split file
            has to be created.
        percent_dev: Share of rows assigned to 'dev' when the split file has
            to be created.
        percent_test: Accepted for interface compatibility; the test split is
            whatever remains after train and dev.
        count: Rows sampled (with replacement) per category for training;
            the test split gets ``int(count / 10)`` rows per category.

    Returns:
        A DataFrame with the balanced training rows, the untouched dev rows
        and the balanced test rows, in that order.
    """
    data = _load_or_create_split(percent_train, percent_dev)
    train_data = data[data['phase'] == 'train']
    test_data = data[data['phase'] == 'test']
    dev_data = data[data['phase'] == 'dev']
    # Train is sampled before test to keep the random draws in a fixed order.
    train_data = _oversample_per_category(train_data, count)
    test_data = _oversample_per_category(test_data, int(count / 10))
    return pd.concat([train_data, dev_data, test_data], axis=0)

In [None]:
# Preview of the balanced split with the default count; the result is only
# displayed, not stored.
new_balanced_train_test_split()

In [None]:
df = new_balanced_train_test_split(count=10000)

# The 'phase' column assigns every row to train, dev or test.
train = df.loc[df['phase'] == 'train']
dev = df.loc[df['phase'] == 'dev']
test = df.loc[df['phase'] == 'test']
lbl_enc = preprocessing.LabelEncoder()

# The encoder is fitted on the training categories only.
y = lbl_enc.fit_transform(train.category.values)
# Number of distinct training categories; sizes the one-hot targets and the
# output layer so both always agree with the encoder.
n_classes = len(lbl_enc.classes_)

X_train = train.headline.values
X_valid = dev.headline.values
X_test = test.headline.values

Y_train = lbl_enc.transform(train.category.values)
Y_valid = lbl_enc.transform(dev.category.values)
y_test_true = lbl_enc.transform(test.category.values)

scaler = preprocessing.StandardScaler()
# Transform integer class ids to one-hot encoding. num_classes is passed
# explicitly: otherwise the width is inferred from the largest id present,
# which breaks if a split happens to miss the last class.
Y_train = np_utils.to_categorical(Y_train, num_classes=n_classes)

Y_valid = np_utils.to_categorical(Y_valid, num_classes=n_classes)

# Perform preprocessing for x and transform to np array.
# The scaler is fitted on the training features only.
X_train = [preprocess_X(x) for x in tqdm(X_train)]
X_train = np.array(X_train)
X_train = scaler.fit_transform(X_train)

X_valid = [preprocess_X(x) for x in tqdm(X_valid)]
X_valid = np.array(X_valid)
X_valid = scaler.transform(X_valid)

model = Sequential(
    [
        Dense(300, input_dim=300, activation='relu'),
        Dropout(0.2),
        BatchNormalization(),
        Dense(300, activation='relu'),
        Dropout(0.4),
        Dense(300, activation='relu'),
        Dropout(0.4),
        BatchNormalization(),
        Dense(n_classes, activation='softmax')
    ]
)

# Stop once the validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit(X_train, y=Y_train, batch_size=64,
          epochs=100, verbose=1, validation_data=(X_valid, Y_valid), callbacks=[es])

X_test = [preprocess_X(x) for x in tqdm(X_test)]
X_test = np.array(X_test)
X_test = scaler.transform(X_test)

# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases; for a softmax output it is the argmax of the probabilities.
y_test_pred = np.argmax(model.predict(X_test), axis=1)
print(y_test_pred)
print(accuracy_score(y_test_true, y_test_pred))

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the test split. Both arguments must hold encoded
# class ids, so run this before predictions are decoded back to labels.
print(f1_score(y_test_true, y_test_pred,average = 'macro'))

In [None]:
# Decode into a separate name: y_test_pred keeps the integer class ids, so
# this cell and the metric cells can be re-run in any order.
y_test_pred_labels = lbl_enc.inverse_transform(y_test_pred)

test = test.assign(preds = y_test_pred_labels)

test.to_csv("simpleNN_balanced.csv")