In [2]:
import pandas as pd 
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [8]:
train=pd.read_csv("train.csv", encoding='utf-8')
test=pd.read_csv("test_x.csv", encoding='utf-8')

In [4]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [5]:
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [9]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]','',text)

train['text']=train['text'].apply(alpha_num)

In [10]:
train['text']

0        He was almost choking There was so much so muc...
1                       Your sister asked for it I suppose
2         She was engaged one day as she walked in peru...
3        The captain was in the porch keeping himself c...
4        Have mercy gentlemen odin flung up his hands D...
                               ...                        
54874    Is that you Mr Smith odin whispered I hardly d...
54875    I told my plan to the captain and between us w...
54876     Your sincere wellwisher friend and sister LUC...
54877                 Then you wanted me to lend you money
54878    It certainly had not occurred to me before but...
Name: text, Length: 54879, dtype: object

In [11]:
def remove_stopwords(text):
    final_text=[]
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

stopwords=[ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [12]:
train['text']=train['text'].str.lower()
test['text']=test['text'].str.lower()
train['text']=train['text'].apply(alpha_num).apply(remove_stopwords)
test['text']=test['text'].apply(alpha_num).apply(remove_stopwords)

In [13]:
X_train=np.array([x for x in train['text']])
X_test=np.array([x for x in test['text']])
Y_train=np.array([x for x in train['author']])
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy odin',
       'wanted lend money', 'certainly not occurred said yes like'],
      dtype='<U1433')

In [14]:
vocab_size=20000
embedding_dim=16
max_length=500
padding_type='post'

In [15]:
tokenizer=Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index=tokenizer.word_index

In [16]:
train_sequences=tokenizer.texts_to_sequences(X_train)
train_padded=pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
test_sequences=tokenizer.texts_to_sequences(X_test)
test_padded=pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [17]:
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [22]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 320,533
Trainable params: 320,533
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
num_epochs = 20
history = model.fit(train_padded, Y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2)

Epoch 1/20
1372/1372 - 9s - loss: 1.5641 - accuracy: 0.2774 - val_loss: 1.5375 - val_accuracy: 0.2759
Epoch 2/20
1372/1372 - 8s - loss: 1.3914 - accuracy: 0.4216 - val_loss: 1.2569 - val_accuracy: 0.4774
Epoch 3/20
1372/1372 - 8s - loss: 1.1648 - accuracy: 0.5280 - val_loss: 1.1319 - val_accuracy: 0.5459
Epoch 4/20
1372/1372 - 8s - loss: 1.0456 - accuracy: 0.5926 - val_loss: 1.0509 - val_accuracy: 0.5765
Epoch 5/20
1372/1372 - 8s - loss: 0.9519 - accuracy: 0.6295 - val_loss: 0.9699 - val_accuracy: 0.6251
Epoch 6/20
1372/1372 - 8s - loss: 0.8768 - accuracy: 0.6629 - val_loss: 0.9149 - val_accuracy: 0.6505
Epoch 7/20
1372/1372 - 8s - loss: 0.8134 - accuracy: 0.6916 - val_loss: 0.9030 - val_accuracy: 0.6554
Epoch 8/20
1372/1372 - 8s - loss: 0.7610 - accuracy: 0.7143 - val_loss: 0.8430 - val_accuracy: 0.6797
Epoch 9/20
1372/1372 - 8s - loss: 0.7174 - accuracy: 0.7333 - val_loss: 0.8268 - val_accuracy: 0.6901
Epoch 10/20
1372/1372 - 8s - loss: 0.6781 - accuracy: 0.7501 - val_loss: 0.8245 - 

In [24]:
pred=model.predict_proba(test_padded)

Instructions for updating:
Please use `model.predict()` instead.


In [25]:
pred

array([[1.4055325e-05, 9.7361517e-01, 1.2707669e-02, 1.3642066e-02,
        2.1081298e-05],
       [3.8599399e-01, 3.6940590e-01, 8.1040360e-02, 2.7621191e-02,
        1.3593851e-01],
       [9.9727005e-01, 1.8901030e-06, 2.9401818e-08, 2.0591457e-08,
        2.7280634e-03],
       ...,
       [4.9096131e-04, 9.9949861e-01, 5.2584892e-07, 8.0052159e-06,
        1.9319948e-06],
       [1.3552414e-04, 9.9985874e-01, 1.1870093e-06, 1.0288729e-06,
        3.4828572e-06],
       [9.8929977e-01, 1.8735145e-07, 2.3299593e-07, 1.4447701e-07,
        1.0699711e-02]], dtype=float32)

In [26]:
sample_submission=pd.read_csv("sample_submission.csv", encoding='utf-8')

In [27]:
sample_submission[['0','1','2','3','4']]=pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.000014,9.736152e-01,1.270767e-02,1.364207e-02,2.108130e-05
1,1,0.385994,3.694059e-01,8.104036e-02,2.762119e-02,1.359385e-01
2,2,0.997270,1.890103e-06,2.940182e-08,2.059146e-08,2.728063e-03
3,3,0.000007,5.683448e-09,6.727761e-01,3.722217e-07,3.272162e-01
4,4,0.968405,5.021131e-04,1.466467e-04,2.696855e-02,3.977724e-03
...,...,...,...,...,...,...
19612,19612,0.000003,9.999967e-01,1.014192e-14,4.605000e-10,1.367482e-12
19613,19613,0.142574,8.062541e-08,5.921751e-06,3.114999e-11,8.574200e-01
19614,19614,0.000491,9.994986e-01,5.258489e-07,8.005216e-06,1.931995e-06
19615,19615,0.000136,9.998587e-01,1.187009e-06,1.028873e-06,3.482857e-06


In [28]:
sample_submission.to_csv('submission.csv', index=False, encoding='utf-8')