In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [1]:

import gc
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, layers
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

In [3]:
data_dir = Path('/content/drive/MyDrive/dacon/input/')
feature_dir = Path('../build/feature')
val_dir = Path('/content/drive/MyDrive/dacon/build/val')
tst_dir = Path('/content/drive/MyDrive/dacon/build/tst')
sub_dir = Path('/content/drive/MyDrive/dacon/build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [4]:
algo_name = 'mta'
feature_name = 'emb'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [5]:
train = pd.read_csv(trn_file, index_col=0)
train.head()

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [6]:
test = pd.read_csv(tst_file, index_col=0)
test.head()

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [7]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
    
    
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
    
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [8]:
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879,)


In [9]:
train[train['text'].str.contains("semicolons")]

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1


In [10]:
#기회갇 되면 &포함시켜서 훈련시켜보기
# -은 한단어로 표현된 경우도 있으니 놔두기.
import string

def remove_punctuations(text):
    punc=[]
    #punc.append('!')
    punc.append('.')
    punc.append(':')
    punc.append(",")
    punc.append(';')
    punc.append('\"')
    punc.append('“')
    punc.append('”')
    punc.append("’")
    #punc.append("?")
    punc.append("{")
    punc.append('[')
    punc.append(']')
    punc.append("}")
    punc.append('(')
    punc.append(')')
    #punc.append('&')
    #punc.append('*')
    punc.append('+')
    for punctuation in punc:
        text = text.replace(punctuation, '')
    return text

train["text"] = train['text'].str.lower().apply(remove_punctuations)
test['text'] = test['text'].str.lower().apply(remove_punctuations)

In [11]:
train.iloc[54750]

text       * * * * *
author             4
Name: 54750, dtype: object

In [12]:
train['text'] = train['text'].str.replace('\?',' quesmark ')
train['text'] = train['text'].str.replace('\!',' exclmark ')
train['text'] = train['text'].str.replace('\&',' empent ')
train['text'] = train['text'].str.replace("\*",' chstar ')

test['text'] = test['text'].str.replace('\?',' quesmark ')
test['text'] = test['text'].str.replace('\!',' exclmark ')
test['text'] = test['text'].str.replace('\&',' empent ')
test['text'] = test['text'].str.replace("\*",' chstar ')

In [13]:
train[train['text'].str.contains("colons")]

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1


In [14]:
cont_dict={"ain't": 'are not',
 "aren't": 'are not',
 "can't": 'can not',
 "can't've": 'can not have',
 "'cause": 'because',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "\'he'd": 'he would',
 "\'he'd've": 'he would have',
 "\'he'll": 'he will',
 "\'he'll've": 'he will have',
 "\'he's": 'he is',
 "\'how'd": 'how did',
 "\'how're": 'how are',
 "\'how'd'y": 'how do you',
 "\'how'll": 'how will',
 "\'how's": 'how is',
 "\'I'd": 'I would',
 "\'I'd've": 'I would have',
 "\'I'll": 'I will',
 "\'I'll've": 'I will have',
 "\'I'm": 'I am',
 "\'I've": 'I have',
 "\'he'd": 'he would',
 "he'd've": 'he would have',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he's": 'he is',
 "how'd": 'how did',
 "how're": 'how are',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "I'd": 'I would',
 "I'd've": 'I would have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'm": 'I am',
 "I've": 'I have',         
 "isn't": 'is not',
 "\'it'd": 'it would',
 "\'it'd've": 'it would have',
 "\'it'll": 'it will',
 "\'it'll've": 'it will have',
 "\'it's": 'it is',
 "\'let's": 'let us',
 "it'd": 'it would',
 "it'd've": 'it would have',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it's": 'it is',
 "let's": 'let us',
 "ma'am": 'madam',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "o'clock": 'of the clock',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "\'shan't": 'shall not',
 "\'sha'n't": 'shall not',
 "\'shan't've": 'shall not have',
 "\'she'd": 'she would',
 "\'she'd've": 'she would have',
 "\'she'll": 'she will',
 "\'she'll've": 'she will have',
 "\'she's": 'she is',
 "\'should've": 'should have',
 "\'shouldn't": 'should not',
 "\'shouldn't've": 'should not have',
 "shan't": 'shall not',
 "sha'n't": 'shall not',
 "shan't've": 'shall not have',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "she'll": 'she will',
 "she'll've": 'she will have',
 "she's": 'she is',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',         
 "so've": 'so have',
 "so's": 'so is',
 "\'that'd": 'that would',
 "\'that'd've": 'that would have',
 "\'that's": 'that is',
 "\'there'd": 'there would',
 "\'there'd've": 'there would have',
 "\'there's": 'there is',
 "\'they'd": 'they would',
 "\'they'd've": 'they would have',
 "\'they'll": 'they will',
 "\'they'll've": 'they will have',
 "\'they're": 'they are',
 "\'they've": 'they have',
 "that'd": 'that would',
 "that'd've": 'that would have',
 "that's": 'that is',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "there's": 'there is',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they're": 'they are',
 "they've": 'they have',         
 "to've": 'to have',
 "wasn't": 'was not',
 "\'we'd": 'we would',
 "\'we'd've": 'we would have',
 "\'we'll": 'we will',
 "\'we'll've": 'we will have',
 "\'we're": 'we are',
 "\'we've": 'we have',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we're": 'we are',
 "we've": 'we have',
 "weren't": 'were not',
 "\'what'll": 'what will',
 "\'what'll've": 'what will have',
 "\'what're": 'what are',
 "\'what's": 'what is',
 "\'what've": 'what have',
 "\'when's": 'when is',
 "\'when've": 'when have',
 "\'where'd": 'where did',
 "\'where's": 'where is',
 "\'where've": 'where have',
 "\'who'll": 'who will',
 "\'who'll've": 'who will have',
 "\'who's": 'who is',
 "\'who've": 'who have',
 "\'why's": 'why is',
 "\'why've": 'why have',
 "\'will've": 'will have',
 "\'won't": 'will not',
 "\'won't've": 'will not have',
 "\'would've": 'would have',
 "\'wouldn't": 'would not',
 "\'wouldn't've": 'would not have',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "what're": 'what are',
 "what's": 'what is',
 "what've": 'what have',
 "when's": 'when is',
 "when've": 'when have',
 "where'd": 'where did',
 "where's": 'where is',
 "where've": 'where have',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who's": 'who is',
 "who've": 'who have',
 "why's": 'why is',
 "why've": 'why have',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "\'you'd": 'you would',
 "\'you'd've": 'you would have',
 "\'you'll": 'you will',
 "\'you'll've": 'you shall have',
 "\'you're": 'you are',
 "\'you've": 'you have',
 "you'd": 'you would',
 "you'd've": 'you would have',
 "you'll": 'you will',
 "you'll've": 'you shall have',
 "you're": 'you are',
 "you've": 'you have',
 'jan.': 'january',
 'feb.': 'february',
 'mar.': 'march',
 'apr.': 'april',
 'jun.': 'june',
 'jul.': 'july',
 'aug.': 'august',
 'sep.': 'september',
 'oct.': 'october',
 'nov.': 'november',
 'dec.': 'december',
 'ain’t': 'are not',
 'aren’t': 'are not',
 'can’t': 'can not',
 'can’t’ve': 'can not have',
 '’cause': 'because',
 'could’ve': 'could have',
 'couldn’t': 'could not',
 'couldn’t’ve': 'could not have',
 'didn’t': 'did not',
 'doesn’t': 'does not',
 'don’t': 'do not',
 'hadn’t': 'had not',
 'hadn’t’ve': 'had not have',
 'hasn’t': 'has not',
 'haven’t': 'have not',
 '\'he’d': 'he would',
 '\'he’d’ve': 'he would have',
 '\'he’ll': 'he will',
 '\'he’ll’ve': 'he will have',
 '\'he’s': 'he is',
 '\'how’d': 'how did',
 '\'how’re': 'how are',
 '\'how’d’y': 'how do you',
 '\'how’ll': 'how will',
 '\'how’s': 'how is',
 '\'I’d': 'I would',
 '\'I’d’ve': 'I would have',
 '\'I’ll': 'I will',
 '\'I’ll’ve': 'I will have',
 '\'I’m': 'I am',
 '\'I’ve': 'I have',
 '\'isn’t': 'is not',
 '\'it’d': 'it would',
 '\'it’d’ve': 'it would have',
 '\'it’ll': 'it will',
 '\'it’ll’ve': 'it will have',
 '\'it’s': 'it is',
 '\'let’s': 'let us',  
 'he’d': 'he would',
 'he’d’ve': 'he would have',
 'she’ll': 'he will',
 'he’ll’ve': 'he will have',
 'odin’s' : 'odin is',
 'joe’s' : 'joe is',
 'dora’s' : 'dora is',
 'wickfield’s' : 'wickfield is',
 'tellson’s' : 'tellson is',
 'omer’s' :  'omer is',
 'cruncher’s' : 'crucher is', 
 'pip’s' : 'pip is',
 'creakle’s ': 'creakle is',
 'jorkins’s ' : 'jorkins is',
 'jane’s' : 'jane is',
 'elliot’s' : 'elliot is',
 'anne’s' : 'anne is',
 'tilney’s' : 'tilney is',
 'lizzy’s' : 'lizzy is',
 'smith’s' : 'smith is',
 'walter’s' : 'walter is',
 'musgrove’s' : 'musgrove is',
 'lucy’s' : 'lucy is',
 'nigel’s' : 'nigel is',
 'nay’s' : 'nay is',
 'chodinger’s' : 'chodinger is',
 'humphrey’s' : 'humphrey is',
 'jack’s' : 'jack is',
 'arthur’s': 'arthur is',
 'lana’s': 'lana is',
 'sarah’s': 'sarah is',
 'garcia’s' : 'garcia is',
 'ivan’s' : 'ivan is',
 'zossimov’s' : 'zossimov is',
 'totski’s' : 'totski is',
 'miusov’s' : 'miusov is',
 'rodya’s' : 'rodya is',
 'odin’s' : 'odin is',
 'maman’s' : 'maman is',
 'thee’s' : 'thee is',
 'ye’s' : 'ye is',
 'richard’s' : 'richard is',
 'silas’s' : 'silas is',
 'von’s': 'von is',
 'lanyon’s' : 'lanyon is',
 'jack’s' : 'jack is',
 'gunn’s' : 'gumn is',
 'nay’s' : 'nay is',  
 'rankeillor’s': 'rankeillor is',      
 'odin\'s' : 'odin is',
 'joe\'s' : 'joe is',
 'dora\'s' : 'dora is',
 'wickfield\'s' : 'wickfield is',
 'tellson\'s' : 'tellson is',
 'omer\'s' :  'omer is',
 'cruncher\'s' : 'crucher is', 
 'pip\'s' : 'pip is',
 'creakle\'s ': 'creakle is',
 'jorkins\'s ' : 'jorkins is',
 'jane\'s' : 'jane is',
 'elliot\'s' : 'elliot is',
 'anne\'s' : 'anne is',
 'tilney\'s' : 'tilney is',
 'lizzy\'s' : 'lizzy is',
 'smith\'s' : 'smith is',
 'walter\'s' : 'walter is',
 'musgrove\'s' : 'musgrove is',
 'lucy\'s' : 'lucy is',
 'nigel\'s' : 'nigel is',
 'nay\'s' : 'nay is',
 'chodinger\'s' : 'chodinger is',
 'humphrey\'s' : 'humphrey is',
 'jack\'s' : 'jack is',
 'arthur\'s': 'arthur is',
 'lana\'s': 'lana is',
 'sarah\'s': 'sarah is',
 'garcia\'s' : 'garcia is',
 'ivan\'s' : 'ivan is',
 'zossimov\'s' : 'zossimov is',
 'totski\'s' : 'totski is',
 'miusov\'s' : 'miusov is',
 'rodya\'s' : 'rodya is',
 'odin\'s' : 'odin is',
 'maman\'s' : 'maman is',
 'thee\'s' : 'thee is',
 'ye\'s' : 'ye is',
 'richard\'s' : 'richard is',
 'silas\'s' : 'silas is',
 'von\'s': 'von is',
 'lanyon\'s' : 'lanyon is',
 'jack\'s' : 'jack is',
 'gunn\'s' : 'gumn is',
 'nay\'s' : 'nay is',  
 'rankeillor\'s': 'rankeillor is',          
 '\'odin\'s' : 'odin is',
 '\'joe\'s' : 'joe is',
 '\'dora\'s' : 'dora is',
 '\'wickfield\'s' : 'wickfield is',
 '\'tellson\'s' : 'tellson is',
 '\'omer\'s' :  'omer is',
 '\'cruncher\'s' : 'crucher is', 
 '\'pip\'s' : 'pip is',
 '\'creakle\'s ': 'creakle is',
 '\'jorkins\'s ' : 'jorkins is',
 '\'jane\'s' : 'jane is',
 '\'elliot\'s' : 'elliot is',
 '\'anne\'s' : 'anne is',
 '\'tilney\'s' : 'tilney is',
 '\'lizzy\'s' : 'lizzy is',
 '\'smith\'s' : 'smith is',
 '\'walter\'s' : 'walter is',
 '\'musgrove\'s' : 'musgrove is',
 '\'lucy\'s' : 'lucy is',
 '\'nigel\'s' : 'nigel is',
 '\'nay\'s' : 'nay is',
 '\'chodinger\'s' : 'chodinger is',
 '\'humphrey\'s' : 'humphrey is',
 '\'jack\'s' : 'jack is',
 '\'arthur\'s': 'arthur is',
 '\'lana\'s': 'lana is',
 '\'sarah\'s': 'sarah is',
 '\'garcia\'s' : 'garcia is',
 '\'ivan\'s' : 'ivan is',
 '\'zossimov\'s' : 'zossimov is',
 '\'totski\'s' : 'totski is',
 '\'miusov\'s' : 'miusov is',
 '\'rodya\'s' : 'rodya is',
 '\'odin\'s' : 'odin is',
 '\'maman\'s' : 'maman is',
 '\'thee\'s' : 'thee is',
 '\'ye\'s' : 'ye is',
 '\'richard\'s' : 'richard is',
 '\'silas\'s' : 'silas is',
 '\'von\'s': 'von is',
 '\'lanyon\'s' : 'lanyon is',
 '\'jack\'s' : 'jack is',
 '\'gunn\'s' : 'gumn is',
 '\'nay\'s' : 'nay is',  
 '\'rankeillor\'s': 'rankeillor is',        
 'he’s': 'he is',
 'how’d': 'how did',
 'how’re': 'how are',
 'how’d’y': 'how do you',
 'how’ll': 'how will',
 'how’s': 'how is',
 'I’d': 'I would',
 'I’d’ve': 'I would have',
 'I’ll': 'I will',
 'I’ll’ve': 'I will have',
 'I’m': 'I am',
 'I’ve': 'I have',
 'isn’t': 'is not',
 'it’d': 'it would',
 'it’d’ve': 'it would have',
 'it’ll': 'it will',
 'it’ll’ve': 'it will have',
 'it’s': 'it is',
 'let’s': 'let us',          
 'ma’am': 'madam',
 'mayn’t': 'may not',
 'might’ve': 'might have',
 'mightn’t': 'might not',
 'mightn’t’ve': 'might not have',
 'must’ve': 'must have',
 'mustn’t': 'must not',
 'mustn’t’ve': 'must not have',
 'needn’t': 'need not',
 'needn’t’ve': 'need not have',
 'o’clock': 'of the clock',
 'oughtn’t': 'ought not',
 'oughtn’t’ve': 'ought not have',
 'shan’t': 'shall not',
 'sha’n’t': 'shall not',
 'shan’t’ve': 'shall not have',
 '\'she’d': 'she would',
 '\'she’d’ve': 'she would have',
 '\'she’ll': 'she will',
 '\'she’ll’ve': 'she will have',
 '\'she’s': 'she is',
 '\'should’ve': 'should have',
 '\'shouldn’t': 'should not',
 '\'shouldn’t’ve': 'should not have',
 '\'so’ve': 'so have',
 '\'so’s': 'so is',
 '\'that’d': 'that would',
 '\'that’d’ve': 'that would have',
 '\'that’s': 'that is',
 '\'there’d': 'there would',
 '\'there’d’ve': 'there would have',
 '\'there’s': 'there is',
 '\'they’d': 'they would',
 '\'they’d’ve': 'they would have',
 '\'they’ll': 'they will',
 '\'they’ll’ve': 'they will have',
 '\'they’re': 'they are',
 '\'they’ve': 'they have',
 'she’d': 'she would',
 'she’d’ve': 'she would have',
 'she’ll': 'she will',
 'she’ll’ve': 'she will have',
 'she’s': 'she is',
 'should’ve': 'should have',
 'shouldn’t': 'should not',
 'shouldn’t’ve': 'should not have',
 'so’ve': 'so have',
 'so’s': 'so is',
 'that’d': 'that would',
 'that’d’ve': 'that would have',
 'that’s': 'that is',
 'there’d': 'there would',
 'there’d’ve': 'there would have',
 'there’s': 'there is',
 'they’d': 'they would',
 'they’d’ve': 'they would have',
 'they’ll': 'they will',
 'they’ll’ve': 'they will have',
 'they’re': 'they are',
 'they’ve': 'they have',      
 'to’ve': 'to have',
 'wasn’t': 'was not',
 '\'we’d': 'we would',
 '\'we’d’ve': 'we would have',
 '\'we’ll': 'we will',
 '\'we’ll’ve': 'we will have',
 '\'we’re': 'we are',
 '\'we’ve': 'we have',
 'we’d': 'we would',
 'we’d’ve': 'we would have',
 'we’ll': 'we will',
 'we’ll’ve': 'we will have',
 'we’re': 'we are',
 'we’ve': 'we have',          
 'weren’t': 'were not',
 '\'what’ll': 'what will',
 '\'what’ll’ve': 'what will have',
 '\'what’re': 'what are',
 '\'what’s': 'what is',
 '\'what’ve': 'what have',
 '\'when’s': 'when is',
 '\'when’ve': 'when have',
 '\'where’d': 'where did',
 '\'where’s': 'where is',
 '\'where’ve': 'where have',
 '\'who’ll': 'who will',
 '\'who’ll’ve': 'who will have',
 '\'who’s': 'who is',
 '\'who’ve': 'who have',
 '\'why’s': 'why is',
 '\'why’ve': 'why have',
 '\'will’ve': 'will have',
 '\'won’t': 'will not',
 '\'won’t’ve': 'will not have',
 '\'would’ve': 'would have',
 '\'wouldn’t': 'would not',
 '\'wouldn’t’ve': 'would not have',
 'what’ll': 'what will',
 'what’ll’ve': 'what will have',
 'what’re': 'what are',
 'what’s': 'what is',
 'what’ve': 'what have',
 'when’s': 'when is',
 'when’ve': 'when have',
 'where’d': 'where did',
 'where’s': 'where is',
 'where’ve': 'where have',
 'who’ll': 'who will',
 'who’ll’ve': 'who will have',
 'who’s': 'who is',
 'who’ve': 'who have',
 'why’s': 'why is',
 'why’ve': 'why have',
 'will’ve': 'will have',
 'won’t': 'will not',
 'won’t’ve': 'will not have',
 'would’ve': 'would have',
 'wouldn’t': 'would not',
 'wouldn’t’ve': 'would not have',   
 'y’all': 'you all',
 'y’all’d': 'you all would',
 'y’all’d’ve': 'you all would have',
 'y’all’re': 'you all are',
 'y’all’ve': 'you all have',
 '\'you’d': 'you would',
 '\'you’d’ve': 'you would have',
 '\'you’ll': 'you will',
 '\'you’ll’ve': 'you shall have',
 '\'you’re': 'you are',
 '\'you’ve': 'you have', 
 'you’d': 'you would',
 'you’d’ve': 'you would have',
 'you’ll': 'you will',
 'you’ll’ve': 'you shall have',
 'you’re': 'you are',
 'you’ve': 'you have'
}

In [15]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def clean_contraction(text):
    words = text_to_word_sequence(text)
    words=[cont_dict[word] if word in cont_dict else word for word in words]
    clean_sent=" ".join(words)
    
    return clean_sent

train['text'] = train['text'].str.lower().apply(clean_contraction)
test['text'] = test['text'].str.lower().apply(clean_contraction)

In [16]:
train[train['text'].str.contains("'")]

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
3,the captain was in the porch keeping himself c...,4
8,'you were not here last sunday night' he said,0
31,'isn't there any help for it quesmark ' asked ...,0
35,'do you know that house quesmark ' i inquired ...,4
53,my dearest odin continued the other without at...,1
...,...,...
54850,'but not for two ma'am' rejoined mr odin in so...,0
54851,'he can not live a week the doctor says' pursu...,0
54867,my dear odin you are talking quite idly pray w...,1
54869,'you are afraid brittles' said mr odin,0


In [17]:
train['text']=train['text'].str.replace('\'s', '')
train['text']=train['text'].str.replace('’s', '')
train['text']=train['text'].str.replace("\'", '')
train['text']=train['text'].str.replace("’", '')

test['text']=test['text'].str.replace("’s",'')
test['text']=test['text'].str.replace("\'s",'')
test['text']=test['text'].str.replace("\'", '')
test['text']=test['text'].str.replace("’", '')



In [18]:
train['text']=train['text'].str.replace('á', '')
train['text']=train['text'].str.replace('ä', '')
train['text']=train['text'].str.replace('é', '')
train['text']=train['text'].str.replace('í', '')
train['text']=train['text'].str.replace('ó', '')
train['text']=train['text'].str.replace('ú', '')
train['text']=train['text'].str.replace('ý', '')
train['text']=train['text'].str.replace('ü', ' Umlaut ')

test['text']=test['text'].str.replace('ä', '')
test['text']=test['text'].str.replace('á', '')
test['text']=test['text'].str.replace('é', '')
test['text']=test['text'].str.replace('í', '')
test['text']=test['text'].str.replace('ó', '')
test['text']=test['text'].str.replace('ú', '')
test['text']=test['text'].str.replace('ý', '')
test['text']=test['text'].str.replace('ü', ' Umlaut ')

In [19]:
def alpha_num(text):
    return re.sub(r'[0-9]', ' num ', text)

def remove_word(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in del_word:
            final_text.append(i.strip())
    return " ".join(final_text)


del_word = ['the', 'and' , 'to' , 'of' , 'a']

train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_word)
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_word)

In [20]:
vocab_size = 20000
maxlen = 230
embed_dim = 64
num_heads = 4  # Number of attention heads
padding_type='post'

In [21]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [22]:

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [23]:
trn = keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=maxlen)
tst = keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=maxlen)
print(trn.shape, tst.shape)

(54879, 230) (19617, 230)


In [24]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [25]:
def get_model():
    ff_dim = 32  # Hidden layer size in feed forward network inside transformer

    inputs = layers.Input(shape=(maxlen,))
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(n_class, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=.001))
    return model

In [26]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = get_model()
    
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    clf.fit(trn[i_trn], 
            to_categorical(y[i_trn]),
            validation_data=(trn[i_val], to_categorical(y[i_val])),
            epochs=10,
            batch_size=128,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])
    p_tst += clf.predict(tst) / n_fold
    
    clear_session()
    gc.collect()


training model for CV #1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
training model for CV #2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
training model for CV #3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
training model for CV #4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
training model for CV #5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 00006: early stopping


In [27]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

Accuracy (CV):  75.2492%
Log Loss (CV):   0.6948


In [None]:
#     max_length        vocab_size       num_heads         accuracy             
       



In [28]:

np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [29]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [30]:
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0146,0.2144,0.7262,0.0389,0.0058
1,0.225,0.1587,0.0601,0.0338,0.5224
2,0.983,0.0062,0.0042,0.0002,0.0064
3,0.0671,0.0019,0.8657,0.0013,0.064
4,0.8831,0.0239,0.0079,0.0768,0.0083


In [31]:
sub.to_csv(sub_file)