In [0]:
import csv
import re
import codecs
import string
import sys
import nltk
import torch
import logging

import pandas as pd
import numpy as np
import networkx as nx
import scipy.sparse as sp

from unidecode import unidecode
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter, OrderedDict
from transformers import CamembertTokenizer, CamembertModel, CamembertForSequenceClassification
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from keras.models import Model
from keras.layers import Dense, Embedding, Bidirectional, LSTM, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Input, Dropout
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Fetch the NLTK stop-word corpus (needs network access the first time it runs).
nltk.download('stopwords')
# Only let errors through from the transformers tokenizer-utilities logger.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)
# Union of NLTK's French and English stop words; read by remove_stop_words.
stop_words = set(stopwords.words('french') + stopwords.words('english'))
# Word-character tokenizer; not referenced by the other cells visible in this notebook.
tokenizer = RegexpTokenizer(r'\w+')

# PATHS

In [0]:
# Directory prefix that text_from_id concatenates with a host id to locate its raw text file.
DATA_PATH = "./data/text/text/"
# Same value as DATA_PATH; not referenced by the other cells visible in this notebook.
PATH_TEXT_TEXT = "./data/text/text/"
# Weighted edge list read by build_graph.
EDGE_LIST_PATH = "./data/edgelist.txt"
# Inputs of build_train_test: "host,label" rows for training, one host id per line for test.
train_path = "./data/train_noduplicates.csv" 
test_path = "./data/test.csv"

# HELPERS

In [0]:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 09:33:54 2020

@author: Houcine's laptop
"""

def build_graph():
    """Load the weighted edge list at EDGE_LIST_PATH as a directed graph.

    Prints the node and edge counts, then returns the ``nx.DiGraph``.
    """
    graph = nx.read_weighted_edgelist(EDGE_LIST_PATH, create_using=nx.DiGraph())
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print("Number of nodes : ", node_count)
    print("Number of edges : ", edge_count)
    return graph

def build_train_test(train_path, test_path):
    """Read the training and test host lists into two DataFrames.

    Parameters
    ----------
    train_path : str
        Text file with one ``host,label`` row per line.
    test_path : str
        Text file with one host id per line.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Both frames have an ``index`` column (host id) and a ``class`` column.
        ``class`` holds the lower-cased label for training rows and is empty
        (NaN) for test rows.

    Raises
    ------
    ValueError
        If a non-blank training row contains no comma.
    """
    train_hosts = []
    y_train = []
    with open(train_path, 'r') as f:
        for row in f.read().splitlines():
            # Blank lines (e.g. a trailing newline pair) carry no host: skip them
            # instead of failing on the tuple unpacking below.
            if not row.strip():
                continue
            # Split on the LAST comma only, so a host id containing a comma
            # still yields exactly (host, label).
            host, label = row.rsplit(",", 1)
            train_hosts.append(host)
            y_train.append(label.lower())

    df_train = pd.DataFrame(data=y_train, index=train_hosts, columns=["class"]).reset_index()

    with open(test_path, 'r') as f:
        test_hosts = [row for row in f.read().splitlines() if row.strip()]
    df_test = pd.DataFrame(data=[], index=test_hosts, columns=["class"]).reset_index()
    return df_train, df_test

def write_submission(write_path, test_hosts, model_classes_list, predicted_probas):
    """Write the submission CSV.

    Parameters
    ----------
    write_path : str
        Path of the file to create (overwritten if it exists).
    test_hosts : iterable
        Test ids, in the same order as the rows of ``predicted_probas``.
    model_classes_list : list
        Class labels, in the same order as the columns of ``predicted_probas``.
        The list is NOT modified.
    predicted_probas : numpy.ndarray
        2-D array indexed as ``predicted_probas[i, :]`` for the i-th test host.

    The file starts with a ``Host,<class 1>,...`` header followed by one row
    per test host.
    """
    # Build the header on a copy: inserting "Host" into the caller's list would
    # corrupt it for any later use (e.g. a second call would write "Host,Host,...").
    header = ["Host"] + list(model_classes_list)
    # newline='' is what the csv module requires to avoid blank rows on Windows.
    with open(write_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(header)
        for i, test_host in enumerate(test_hosts):
            writer.writerow([test_host] + predicted_probas[i, :].tolist())

def text_from_id(id):
    """Return the lines of the raw text file located at ``DATA_PATH + str(id)``.

    The file is decoded as UTF-8 first. Only when that decoding fails is it
    read again as latin-1, which accepts any byte sequence. Other errors
    (missing file, permissions, interrupts) propagate instead of silently
    triggering a second read.

    Parameters
    ----------
    id : str or int
        Host id; converted with ``str`` and appended to ``DATA_PATH``.

    Returns
    -------
    list of str
        The file content as returned by ``readlines()`` (line endings kept).
    """
    path = DATA_PATH + str(id)
    try:
        with codecs.open(path, 'r', encoding="utf-8") as f:
            return f.readlines()
    except UnicodeDecodeError:
        with codecs.open(path, 'r', encoding="latin-1") as f:
            return f.readlines()



def build_local_test(train_hosts, y_train, size_local_test=.25, random_state=None):
    """Split the labelled data into a local train set and a local test set.

    The split is stratified on ``y_train``.

    Parameters
    ----------
    train_hosts : array-like
        Samples to split.
    y_train : array-like
        Labels aligned with ``train_hosts``; also used for stratification.
    size_local_test : float, default .25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different split on every call); pass an int for a
        reproducible split.

    Returns
    -------
    (local_train, local_y_train, local_test, local_y_test)
        Note the order: train samples, train labels, test samples, test labels.
    """
    local_train, local_test, local_y_train, local_y_test = train_test_split(
        train_hosts,
        y_train,
        stratify=y_train,
        test_size=size_local_test,
        random_state=random_state,
    )
    return local_train, local_y_train, local_test, local_y_test

def compute_score(predictions, y_true, classes_order):
    """Mean negative log-probability assigned to the true class of each sample.

    ``classes_order`` gives the label of each column of ``predictions``; the
    label -> column mapping is printed before the score is computed.
    ``predictions`` is indexed as ``predictions[row, column]``.
    """
    column_of = {label: position for position, label in enumerate(classes_order)}
    print(column_of)
    total = 0
    for row, label in enumerate(y_true):
        total = total - np.log(predictions[row, column_of[label]])
    return total / len(y_true)

def compute_score_3(predictions, y_true):
    """Mean negative log-probability of the true class.

    Here each entry of ``y_true`` is used directly as the column index into
    ``predictions`` (indexed as ``predictions[row, column]``).
    """
    total = 0
    for row, column in enumerate(y_true):
        total = total - np.log(predictions[row, column])
    return total / len(y_true)

def normalize_adjacency(A):
    """Add self-loops to ``A`` and normalise it so that every row sums to 1.

    Parameters
    ----------
    A : square 2-D array-like supporting ``A + np.eye(n)`` and ``.sum(axis=1)``
        Adjacency matrix. It is not modified.

    Returns
    -------
    The matrix ``(A + I)`` with each row divided by its own sum.
    """
    n = A.shape[0]
    # Adds 1 to every diagonal entry (self-loops).
    A = A + np.eye(n)

    # Row sums as an explicit (n, 1) column. A flat (n,) vector would broadcast
    # along the columns and divide entry [i, j] by the sum of row j instead of
    # row i, which is only correct for symmetric matrices.
    D = np.asarray(A.sum(axis=1)).reshape(-1, 1)
    A_normalized = A / D

    return A_normalized


def loglikelihood_score(y_true, predictions, classes_order):
    """Mean negative log-probability assigned to the true class of each sample.

    ``classes_order`` gives the label of each column of ``predictions``, which
    is indexed as ``predictions[row, column]``. Note that ``y_true`` comes
    first in this signature.
    """
    column_of = {label: position for position, label in enumerate(classes_order)}
    total = 0
    for row, label in enumerate(y_true):
        total = total - np.log(predictions[row, column_of[label]])
    return total / len(y_true)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# PREPROCESSING

In [0]:
def join_with_SEP(text):
    """Concatenate a list of sentences into one string, separated by " SEP "."""
    separator = " SEP "
    return separator.join(text)

def clean_page_from(page, rows_that_contain):
    """Return the rows of 'page' that do NOT contain the substring 'rows_that_contain'."""
    kept_rows = []
    for row in page:
        if rows_that_contain in row:
            continue
        kept_rows.append(row)
    return kept_rows

def split_by_SEP(text) :
    """Inverse of join_with_SEP: cut the string back into sentences at each " SEP "."""
    separator = " SEP "
    sentences = text.split(separator)
    return sentences

def remove_empty_rows(list_of_sentences):
    """Strip every sentence and drop the ones that end up empty."""
    stripped_rows = (row.strip() for row in list_of_sentences)
    return [row for row in stripped_rows if row]

def process_text(text, start_fraction=0, end_fraction=1):
    """First cleaning pass over a page, keeping it as a list of sentences (no tokenisation).

    Only the rows between ``int(len(text)*start_fraction)`` and
    ``int(len(text)*end_fraction)`` are read, which allows skipping site
    headers at the top and contact details at the bottom.

    For each row that is read:
        - "\n" is removed, the row is lower-cased and accents are removed (unidecode)
        - the row is dropped if it contains any entry of elements_discarding_the_row
        - the row is dropped if it contains both "&" and "=" (fragmented URL)
        - the row is dropped if it is empty
        - apostrophes are replaced by a space (so stop words can be removed later)
        - leading and trailing whitespace is stripped
    """
    elements_discarding_the_row = ["tel", "mail", "http", "www", "iframe", "button", 
                               "img", "submit", "lire la suite", "cedex", "html", "css",
                               "cookies","logo", "adresse electronique","jpg","jpeg","png","div","alternate",".fr",".com"]

    first_row = int(len(text) * start_fraction)
    last_row = int(len(text) * end_fraction)

    cleaned_rows = []
    for sentence in text[first_row:last_row]:
        row = unidecode(sentence.replace("\n", "").lower())
        if any(marker in row for marker in elements_discarding_the_row):
            continue
        if "&" in row and "=" in row:  # fragmented website
            continue
        if row == "":
            continue
        cleaned_rows.append(row.replace("'", " ").replace("’", " ").strip())

    return cleaned_rows

def replace_by_special_token(column_df) :
    """Replace dates, prices and times in a string column by placeholder tokens.

    Parameters
    ----------
    column_df : pandas.Series of str
        Column to process (expected lower-cased and without accents, as the
        patterns use unaccented lower-case month names).

    Returns
    -------
    pandas.Series
        New column where, in this order:
        - "12 janvier 2020" / "12/01/2020"-like dates, "<word> <4 digits>",
          month names followed by a space, and week-day names become " date "
        - "<digits> euro(s)/eur/EUR" becomes " prix "
        - "10:30" / "10h30"-like times become " heure "
    """
    substitutions = [
        (r"(\d{1,2} [a-z]{3,9} \d{4})|(\d{1,2}\s?/\s?\d{1,2}\s?/\s?\d{4})", " date "),
        (r"([a-z]{3,9} \d{4})", " date "),
        (r"(janvier|fevrier|mars|avril|mai|juin|juillet|aout|septembre|octobre|novembre|decembre) ", " date "),
        (r"(lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche)", " date "),
        (r"\d+\s?(euro|euros|eur|EUR)", " prix "),
        (r"\d{1,2}(:|h)\d{2}", " heure "),
    ]

    col = column_df
    for pattern, token in substitutions:
        # Each step works on the result of the previous one, so no replacement
        # is lost. regex=True is explicit because the default of
        # Series.str.replace is not the same across pandas versions.
        col = col.str.replace(pattern, token, regex=True)
    return col

def punctuation_by_space(column_df) :
    """Replace every punctuation character of a string column by a space.

    column_df is a column of a dataframe (pandas.Series of str). The characters
    replaced are those of ``string.punctuation`` plus "»". A new Series is
    returned.
    """
    # re.escape keeps "\", "]", "^" and "-" literal inside the character class;
    # left unescaped, the "\]" pair is read as an escaped bracket and the
    # backslash itself is never matched.
    pattern = "[{}]".format(re.escape(string.punctuation + "»"))
    # regex=True is explicit because the default of Series.str.replace is not
    # the same across pandas versions.
    return column_df.str.replace(pattern, " ", regex=True)

def remove_stop_words(text) :
    """Drop the space-separated tokens of 'text' that belong to the global stop_words set."""
    kept_tokens = []
    for tok in text.split(" "):
        if tok in stop_words:
            continue
        kept_tokens.append(tok)
    return " ".join(kept_tokens)

def remove_single_word_rows(text):
    """Keep only the rows made of at least two whitespace-separated words."""
    return list(filter(lambda row: len(row.split()) > 1, text))

def remove_single_characters(text):
    """In every row, drop the words of one or two characters and re-join the rest with single spaces."""
    shortened_rows = []
    for row in text:
        long_words = [word for word in row.split() if len(word) >= 3]
        shortened_rows.append(' '.join(long_words))
    return shortened_rows

def filtering_most_repetitive_rows(text, L) :
    """Remove from 'text' every row that contains one of the entries of L.

    Based on 4.2: sentence frequencies in EDA.

    Parameters
    ----------
    text : list of str
        Rows of a page. The list is not modified.
    L : iterable of str
        Strings to discard (see 4.2 on how it is built); a row is dropped as
        soon as one of them occurs in it as a substring.

    Returns
    -------
    list of str
        The remaining rows, in their original order.
    """
    # The filtered list is what gets returned: previously each pass restarted
    # from the untouched input and the input itself was returned, so nothing
    # was ever removed.
    return [row for row in text if not any(e in row for e in L)]


# READING THE DATA

In [0]:
# Despite their names, train_hosts and test_hosts are DataFrames with an "index" column
# (host id) and a "class" column (label; empty for the test set).
train_hosts, test_hosts = build_train_test(train_path, test_path)
# Raw page content: the list of lines read from each host's text file.
train_hosts['text'] = train_hosts["index"].apply(text_from_id)
# Integer code of each class label, as assigned by pandas Categorical.
train_hosts["class_codes"] = pd.Categorical(train_hosts["class"]).codes

# Processing the data

## Training data

In [0]:
# Fractions of each page handed to process_text; 0 and 1 keep the whole page.
start_fraction = 0
end_fraction = 1

# Row-level cleaning; each cell of text_processed becomes a list of sentences.
train_hosts["text_processed"] = train_hosts.text.apply(process_text, args=(start_fraction, end_fraction,)) 
# Temporarily flatten each page to a single string so the vectorised .str operations can run on it.
train_hosts["text_processed"] = train_hosts.text_processed.apply(join_with_SEP)
train_hosts["text_processed"] = replace_by_special_token(train_hosts["text_processed"])
train_hosts["text_processed"] = punctuation_by_space(train_hosts["text_processed"])
train_hosts["text_processed"] = train_hosts.text_processed.apply(remove_stop_words)
# Back to a list of sentences.
train_hosts["text_processed"] = train_hosts.text_processed.apply(split_by_SEP)
train_hosts["text_processed"] = train_hosts.text_processed.apply(remove_empty_rows)
# Drops words of one or two characters inside each sentence.
train_hosts["text_processed"] = train_hosts.text_processed.apply(remove_single_characters)
# Variant without one-word sentences.
train_hosts["text_processed_no_single_words"] = train_hosts.text_processed.apply(remove_single_word_rows)
# Variant without repeated sentences; OrderedDict.fromkeys keeps the first occurrence and the order.
train_hosts["text_processed_no_dupl"] = train_hosts.text_processed.apply(lambda x : list(OrderedDict.fromkeys(x)))

# Per class: sentence -> weighted count. The counts are summed over the
# de-duplicated pages of the class, then multiplied by the share of the
# training set that the class represents.
dico = dict()
for cla in train_hosts["class"].unique() :
    dico[cla] = Counter(dict())
    df = train_hosts[train_hosts["class"] == cla]
    for i in range(df.shape[0]) :
        dico[cla] += Counter(df.text_processed_no_dupl.iloc[i])
    # dico[cla] = dict(dico[cla])
    dico[cla] = {k: v * df.shape[0]/train_hosts.shape[0] for k, v in dico[cla].items()}
    # Sorted by decreasing weighted count.
    dico[cla] = {k: v for k, v in sorted(dico[cla].items(), reverse=True, key=lambda item: item[1])}

# Sum the weighted counts of every sentence across classes, again sorted in decreasing order.
counter = Counter({})
for cla in train_hosts["class"].unique():
    counter+= Counter(dico[cla])
counter = {k: v for k, v in sorted(counter.items(), reverse=True, key=lambda item: item[1])}

# LL: sentences whose total weighted count exceeds 10, minus the placeholder
# tokens introduced by replace_by_special_token when they appear as a whole sentence.
introduced_tokens = ["date", "prix", "heure"]
LL = [k for k, v in counter.items() if v > 10]
for tok in introduced_tokens :
    if tok in LL :
        LL.remove(tok)
        
# LL is handed to filtering_most_repetitive_rows as its list of strings to discard.
train_hosts["text_processed_2"] = train_hosts.text_processed_no_dupl.apply(filtering_most_repetitive_rows, args=(LL,))

## Test data

In [0]:
# Same sequence of steps as for the training data, applied to the test hosts.
# Raw page content: the list of lines read from each host's text file.
test_hosts['text'] = test_hosts["index"].apply(text_from_id)
test_hosts["text_processed"] = test_hosts.text.apply(process_text, args=(start_fraction, end_fraction,)) 
# Temporarily flatten each page to a single string so the vectorised .str operations can run on it.
test_hosts["text_processed"] = test_hosts.text_processed.apply(join_with_SEP)
test_hosts["text_processed"] = replace_by_special_token(test_hosts["text_processed"])
test_hosts["text_processed"] = punctuation_by_space(test_hosts["text_processed"])
test_hosts["text_processed"] = test_hosts.text_processed.apply(remove_stop_words)
# Back to a list of sentences.
test_hosts["text_processed"] = test_hosts.text_processed.apply(split_by_SEP)
test_hosts["text_processed"] = test_hosts.text_processed.apply(remove_empty_rows)
test_hosts["text_processed"] = test_hosts.text_processed.apply(remove_single_characters)
test_hosts["text_processed_no_single_words"] = test_hosts.text_processed.apply(remove_single_word_rows)
test_hosts["text_processed_no_dupl"] = test_hosts.text_processed.apply(lambda x : list(OrderedDict.fromkeys(x)))
# LL is the list built from the TRAINING data in the previous section.
test_hosts["text_processed_2"] = test_hosts.text_processed_no_dupl.apply(filtering_most_repetitive_rows, args=(LL,))

# Splitting training data into : local train/ local test (validation)

In [0]:
# Hold out part of the labelled hosts for validation (train_test_split's default test size, fixed seed).
# NOTE(review): this split is not stratified by class, unlike build_local_test - confirm this is intended.
local_train, local_test = train_test_split(train_hosts, random_state=55)

# Best scoring approach

In [0]:
# Pretrained CamemBERT tokenizer and encoder; the cells below only run the encoder under torch.no_grad().
tokenizer_ = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertModel.from_pretrained('camembert-base')
# Evaluation mode; the trailing ';' hides the cell's echo of the returned module.
model.eval();
# Moves the encoder to the GPU: a CUDA device is required from here on.
model.to('cuda');

## Building BERT features (Using GPU and emptying cache and variables to clear memory space)

This was not executed locally because we do not have enough memory; we ran it on Google Colab instead.

### Local training set

In [0]:
LEN = local_train.shape[0]
for j in range(local_train.shape[0]):
    sys.stdout.write('\r'+str(j)+"/"+str(LEN))
    cla = local_train["class_codes"].iloc[j]

    txt = ". ".join(local_train.text.iloc[j])
    try :
      tokens = tokenizer_.encode(txt, add_special_tokens=True)
      SHAPE = len(tokens[1:-1])
      new_tokens = []
      for i in range(int(SHAPE/510)+1):
          min_ = min((i+1)*510,SHAPE)
          if min_ == SHAPE :
              L = [tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id]
              new_tokens.append(L + [tokenizer_.pad_token_id]*(512 - len(L)))
          else :
              new_tokens.append([tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id] )
      # new_tokens = new_tokens[:350]
      with torch.no_grad() :
          new_train_ = model(torch.tensor(new_tokens).cuda())[0][:,0,:]
      del new_tokens
      torch.cuda.empty_cache()
      if j == 0 :
        new_train = new_train_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)
        new_train_target = [cla]
      else :
        new_train = np.concatenate((new_train, 
                                    new_train_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)), 
                                   axis=0)
        new_train_target.append(cla)
    except :
      new_train = np.concatenate((new_train, np.zeros((1,768))), axis=0)
      new_train_target.extend([cla])
new_train = np.array(new_train)

### Local testing set (validation)

In [0]:
LEN = local_test.shape[0]
for j in range(local_test.shape[0]):
    sys.stdout.write('\r'+str(j)+"/"+str(LEN))
    cla = local_test["class_codes"].iloc[j]

    txt = ". ".join(local_test.text.iloc[j])
    try :
      tokens = tokenizer_.encode(txt, add_special_tokens=True)
      SHAPE = len(tokens[1:-1])
      new_tokens = []
      for i in range(int(SHAPE/510)+1):
          min_ = min((i+1)*510,SHAPE)
          if min_ == SHAPE :
              L = [tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id]
              new_tokens.append(L + [tokenizer_.pad_token_id]*(512 - len(L)))
          else :
              new_tokens.append([tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id] )
      # new_tokens = new_tokens[:350]
      with torch.no_grad() :
          new_test_ = model(torch.tensor(new_tokens).cuda())[0][:,0,:]
      del new_tokens
      torch.cuda.empty_cache()
      if j == 0 :
        new_test = new_test_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)
        new_test_target = [cla]
      else :
        new_test = np.concatenate((new_test, 
                                    new_test_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)), 
                                   axis=0)
        new_test_target.append(cla)
    except :
      new_test = np.concatenate((new_test, np.zeros((1,768))), axis=0)
      new_test_target.extend([cla])
new_test = np.array(new_test)

### Testing set

In [0]:
LEN = test_hosts.shape[0]
exceptions = []
for j in range(test_hosts.shape[0]):
    sys.stdout.write('\r'+str(j)+"/"+str(LEN))

    txt = ". ".join(test_hosts.text_processed_no_dupl.iloc[j])
    try :
      tokens = tokenizer_.encode(txt, add_special_tokens=True)
      SHAPE = len(tokens[1:-1])
      new_tokens = []
      for i in range(int(SHAPE/510)+1):
          min_ = min((i+1)*510,SHAPE)
          if min_ == SHAPE :
              L = [tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id]
              new_tokens.append(L + [tokenizer_.pad_token_id]*(512 - len(L)))
          else :
              new_tokens.append([tokenizer_.cls_token_id] + tokens[i*510:min_] + [tokenizer_.eos_token_id] )
      # new_tokens = new_tokens[:300]
      with torch.no_grad() :
          test_ = model(torch.tensor(new_tokens).cuda())[0][:,0,:]
      del new_tokens
      torch.cuda.empty_cache()
      if j == 0 :
        test = test_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)
      else :
        test = np.concatenate((test, test_.detach().cpu().numpy().mean(axis=0).reshape(1,-1)), axis=0)
    except :
        test = np.concatenate((test, np.zeros((1,768))), axis=0)
        exceptions.append(j)

test = np.array(test)

### You can also load the features directly, as we already saved them as .npy files for later use

In [0]:
# new_train = np.load("./data/local_train_text_non_processed_BERT.npy")
# new_test = np.load("./data/local_test_text_non_processed_BERT.npy")
# new_train_target = np.load("./data/target_train_text_non_processed_BERT.npy")
# new_test_target = np.load("./data/target_local_test_text_non_processed_BERT.npy")
# test = np.load("./data/BERT_test_text_non_processed_76_133_289_304_349_437_525.npy")


## Building the classifier

In [0]:
# Writes the model to ./data/weights.hdf5 only when val_loss reaches a new minimum.
checkpoints = ModelCheckpoint('./data/weights.hdf5', monitor="val_loss", mode="min", verbose=True, save_best_only=True)
# Multiplies the learning rate by 0.1 after 2 epochs without val_loss improvement (never below 1e-6).
# Defined here, but left commented out in the fit() call of the validation cell.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, min_lr=0.000001)

In [0]:
def make_model(embed_size=768, loss='categorical_crossentropy', do_rate=0.3, activ="relu", n_classes=8):
    """Build and compile the classifier: a single softmax layer on top of the features.

    Parameters
    ----------
    embed_size : int
        Size of the input feature vector.
    loss : str
        Loss passed to ``compile``.
    do_rate, activ :
        Currently unused: no dropout or hidden layer is part of the model.
        Kept so existing calls keep working.
    n_classes : int, default 8
        Number of output units (one per class).

    Returns
    -------
    The compiled Keras ``Model`` (Adam optimizer, learning rate 1e-4).
    """
    inp = Input(shape=(embed_size,))

    # No hidden layer: the softmax is applied directly to the input features.
    output = Dense(n_classes, activation="softmax")(inp)

    model_ = Model(inputs=inp, outputs=output)
    # NOTE(review): `lr` is the legacy spelling; newer Keras releases expect
    # `learning_rate` - switch once the installed Keras version is confirmed.
    model_.compile(loss=loss, optimizer=Adam(lr=0.0001))
    return model_

In [0]:
# Training hyper-parameters used by the fit() calls below.
batch_size = 256
epochs = 1000
# Passed to make_model, which does not use them in its current form.
do_rate=0.1
activ = "tanh"
NN_clf = make_model(do_rate=do_rate, activ= activ)
NN_clf.summary()

### Fitting the classifier on : the local train data and validating on the local test

In [0]:
# Map every label seen in the training targets to a consecutive integer code.
mapping_labels = {lab:code for code, lab in enumerate(np.unique(new_train_target))}
y_train_coded = [mapping_labels[lab] for lab in new_train_target]
y_val_coded = [mapping_labels[lab] for lab in new_test_target]

# Fix the one-hot width to the number of output units. By default to_categorical
# infers it from the largest label present, which gives a matrix that is too
# narrow when the highest-coded class is missing from one of the splits.
n_outputs = NN_clf.output_shape[-1]

history = NN_clf.fit(new_train, to_categorical(y_train_coded, num_classes=n_outputs),
                     batch_size=batch_size, epochs=epochs,
                     # validation_data is documented as a tuple (x_val, y_val)
                     validation_data=(new_test, to_categorical(y_val_coded, num_classes=n_outputs)),
                     callbacks=[
                                # reduce_lr,  (disabled)
                                checkpoints ]
                     )

### Refitting the model using the hyperparameters found during the previous step (validation) on the whole training set

In [0]:
# Local train + local validation = every labelled host.
whole_train = np.concatenate((new_train, new_test),axis=0)
whole_train_target = np.concatenate((new_train_target, new_test_target))

# Number of epochs retained from the validation run above.
epochs = 450

NN_clf = make_model(do_rate=do_rate, activ= activ)

mapping_labels = {lab:code for code, lab in enumerate(np.unique(whole_train_target))}
y_train_coded = [mapping_labels[lab] for lab in whole_train_target]

# No callbacks here: there is no validation data in this fit, so a checkpoint
# monitoring val_loss has nothing to monitor and would never save anything.
# The one-hot width is fixed to the number of output units.
history = NN_clf.fit(x=whole_train,
                     y=to_categorical(y_train_coded, num_classes=NN_clf.output_shape[-1]),
                     batch_size=batch_size,
                     epochs=epochs
                     )

### Saving the trained model (architecture and weights)

In [0]:
# Persist the refitted classifier to an HDF5 file.
NN_clf.save("./data/model_wrap_up_20_HK_Approach3_rawtext_NN_CLF.hdf5")

## Prediction 

In [0]:
# Class probabilities for the test features, one row per test host.
predictions = NN_clf.predict(test)
# The header is the list of class names ordered by class_codes.
# NOTE(review): this assumes column k of `predictions` corresponds to class code k,
# i.e. that mapping_labels kept the codes unchanged - confirm that every class is
# present in the training targets.
write_submission("./data/wrap_up_20_HK_Approach3_rawtext_NN_CLF.csv", 
                 list(test_hosts["index"]), 
                 model_classes_list=list(np.array(train_hosts[["class","class_codes"]].drop_duplicates().sort_values(by='class_codes'))[:,0]), 
                 predicted_probas=predictions)