In [17]:
import numpy as np
import pandas as pd
import re, string
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots

In [18]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [19]:
from sklearn import preprocessing, decomposition, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [20]:
enc = preprocessing.LabelEncoder()
y = enc.fit_transform(train.author.values)

In [21]:
def punct_pct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

In [22]:
X = train.drop(['id','author'], axis=1)
X["len"] = train.text.apply(lambda x: len(x) - x.count(" "))
X["punct%"] = train["text"].apply(lambda x: punct_pct(x))

In [23]:
X.head()

Unnamed: 0,text,len,punct%
0,"This process, however, afforded me no means of...",191,3.7
1,It never once occurred to me that the fumbling...,58,1.7
2,"In his left hand was a gold snuff box, from wh...",165,3.0
3,How lovely is spring As we looked from Windsor...,173,2.3
4,"Finding nothing else, not even gold, the Super...",148,2.7


In [24]:
def punct_pct2(text, punc):
    count = sum([1 for char in text if char==punc])
    return round(count/(len(text) - text.count(" ")), 3)*100

In [25]:
X["comma%"] = X.text.apply(lambda x: punct_pct2(x, ','))
X["semicolon%"] = X.text.apply(lambda x: punct_pct2(x, ';'))
X["colon%"] = X.text.apply(lambda x: punct_pct2(x, ':'))
X["question%"] = X.text.apply(lambda x: punct_pct2(x, '?'))
X["exclamation%"] = X.text.apply(lambda x: punct_pct2(x, '!'))
X.head()

Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%,exclamation%
0,"This process, however, afforded me no means of...",191,3.7,2.1,1.0,0.0,0.0,0.0
1,It never once occurred to me that the fumbling...,58,1.7,0.0,0.0,0.0,0.0,0.0
2,"In his left hand was a gold snuff box, from wh...",165,3.0,2.4,0.0,0.0,0.0,0.0
3,How lovely is spring As we looked from Windsor...,173,2.3,1.7,0.0,0.0,0.0,0.0
4,"Finding nothing else, not even gold, the Super...",148,2.7,1.4,0.7,0.0,0.0,0.0


In [26]:
X.describe()

Unnamed: 0,len,punct%,comma%,semicolon%,colon%,question%,exclamation%
count,19579.0,19579.0,19579.0,19579.0,19579.0,19579.0,19579.0
mean,123.326932,3.458471,1.486674,0.177016,0.020251,0.094515,0.0
std,87.952007,2.444214,1.313902,0.418833,0.159751,0.506201,0.0
min,17.0,0.1,0.0,0.0,0.0,0.0,0.0
25%,67.0,2.1,0.4,0.0,0.0,0.0,0.0
50%,106.0,2.9,1.3,0.0,0.0,0.0,0.0
75%,158.0,4.1,2.2,0.0,0.0,0.0,0.0
max,3803.0,53.3,16.7,7.1,5.9,9.4,0.0


In [27]:
X.drop(['exclamation%'], axis = 1, inplace=True)
X.head()

Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%
0,"This process, however, afforded me no means of...",191,3.7,2.1,1.0,0.0,0.0
1,It never once occurred to me that the fumbling...,58,1.7,0.0,0.0,0.0,0.0
2,"In his left hand was a gold snuff box, from wh...",165,3.0,2.4,0.0,0.0,0.0
3,How lovely is spring As we looked from Windsor...,173,2.3,1.7,0.0,0.0,0.0
4,"Finding nothing else, not even gold, the Super...",148,2.7,1.4,0.7,0.0,0.0


In [28]:
X.shape

(19579, 7)

In [29]:
pd.Series(y).value_counts()

0    7900
2    6044
1    5635
dtype: int64

In [30]:
from sklearn.model_selection import train_test_split
# ensure the % distribution of different author in the training set and validation set after the split
xTrain, xVal, yTrain, yVal = train_test_split(X, y, stratify=y, random_state=1, test_size=0.2, shuffle=True)

In [31]:
xTrain.shape, xVal.shape

((15663, 7), (3916, 7))

# Building Models

In [32]:
import nltk

In [33]:
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

In [34]:
wn.lemmatize("abandoned"), ps.stem("abandoned")

('abandoned', 'abandon')

In [35]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    stopwords = nltk.corpus.stopwords.words('english')
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return text

In [36]:
xTrain.head()

Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%
16782,The youth's eyes glistened and his nostrils cu...,107,1.9,0.0,0.0,0.0,0.0
1340,At fust the things didn't never go onto the ma...,76,3.9,1.3,0.0,0.0,0.0
435,Its persistence among a simple people was quit...,242,1.2,0.8,0.0,0.0,0.0
10571,From this time a new spirit of life animated t...,66,1.5,0.0,0.0,0.0,0.0
11242,"To speak the truth, I had no especial relish f...",275,3.6,2.2,0.7,0.0,0.0


In [37]:
xTrain["clean_text"] = xTrain['text'].apply(lambda x: clean_text(x))
xTrain.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%,clean_text
16782,The youth's eyes glistened and his nostrils cu...,107,1.9,0.0,0.0,0.0,0.0,youth eye glisten nostril curl fume brownish f...
1340,At fust the things didn't never go onto the ma...,76,3.9,1.3,0.0,0.0,0.0,fust thing didnt never go onto main island art...


In [38]:
xVal["clean_text"] = xVal['text'].apply(lambda x: clean_text(x))
xVal.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%,clean_text
17400,There will be frequent hours in which I shall ...,84,3.6,2.4,0.0,0.0,0.0,frequent hour shall need sympathi poetic done
11261,"She has now him in hers since, being unaware t...",109,2.8,1.8,0.0,0.0,0.0,sinc unawar letter possess proceed exact


In [16]:
# # ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 3), stop_words = 'english')
# ctv = CountVectorizer(ngram_range=(1, 3))

# # Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
# ctv.fit(list(xTrain.clean_text) + list(xVal.clean_text))
# xtrain_ctv = ctv.transform(xTrain.clean_text) 
# xvalid_ctv = ctv.transform(xVal.clean_text)
# print(xtrain_ctv.shape)
# xTrain2= xTrain.reset_index(drop=True)
# print(xTrain2.head(2))

NameError: name 'xtrain_ctv' is not defined

In [28]:
# xtrain_ctv2 = pd.DataFrame(xtrain_ctv.toarray())
# xtrain_ctv2.columns = ctv.get_feature_names()
# print(xtrain_ctv2.shape)
# xtrain_ctv2.head()

(5547, 9781)


Unnamed: 0,abaft,abandon,abaout,abat,abdic,abdul,abernethi,aberr,abey,abhor,...,zest,zigzag,zit,zobna,zobnarian,zokkar,zone,zuro,æronaut,ærostat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# XTrain = pd.concat([xTrain2, xtrain_ctv2], axis=1)
# XTrain.shape

(5547, 9789)

In [30]:
# xval_ctv2 = pd.DataFrame(xvalid_ctv.toarray())
# xval_ctv2.columns = ctv.get_feature_names()
# xval_ctv2.head()

Unnamed: 0,abaft,abandon,abaout,abat,abdic,abdul,abernethi,aberr,abey,abhor,...,zest,zigzag,zit,zobna,zobnarian,zokkar,zone,zuro,æronaut,ærostat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# xVal2= xVal.reset_index(drop=True)
# print(xVal2.shape)
# xVal2.head()

(979, 8)


Unnamed: 0,text,len,punct%,comma%,semicolon%,colon%,question%,clean_text
0,But even in this acute moment my chief horror ...,91,1.1,0.0,0.0,0.0,0.0,even acut moment chief horror someth apart imm...
1,"She went on, feeling that, if she had paused f...",246,3.3,2.8,0.0,0.0,0.0,went feel paus moment check water miseri would...
2,A little reading of the 'Dial' will carry you ...,47,6.4,0.0,0.0,0.0,0.0,littl read dial carri great way
3,Sometimes I sat with my eyes fixed on the grou...,118,1.7,0.8,0.0,0.0,0.0,sometim sat eye fix ground fear rais lest enco...
4,"I now lamented that my great elevation would, ...",94,3.2,2.1,0.0,0.0,0.0,lament great elev would case prevent take accu...


In [32]:
# Xval = pd.concat([xVal2, xval_ctv2], axis=1)
# Xval.shape

(979, 9789)

In [36]:
# XTrain.drop(['text','clean_text'],axis=1,inplace=True)
# Xval.drop(['text','clean_text'],axis=1,inplace=True)

In [39]:
from tqdm import tqdm

# load the GloVe vectors in a dictionary:
embeddings_index = {}
f = open('glove.840B.300d.txt','r', errors = 'ignore', encoding='utf8')
for line in tqdm(f):
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [05:19, 6883.17it/s]


Found 2195892 word vectors.


In [40]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    
    words = str(s).lower()
#     words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [41]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xTrain.clean_text)]
xvalid_glove = [sent2vec(x) for x in tqdm(xVal.clean_text)]
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

100%|███████████████████████████████████████████████████████████████████████████| 15663/15663 [00:16<00:00, 947.91it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3916/3916 [00:03<00:00, 989.12it/s]


In [42]:
from sklearn.preprocessing import StandardScaler

# scale the data before any neural net:
scl = StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [43]:
from keras.utils import np_utils
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(yTrain)
yvalid_enc = np_utils.to_categorical(yVal)

Using TensorFlow backend.


In [44]:
# using keras tokenizer here
from keras.preprocessing import sequence, text
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xTrain.clean_text) + list(xVal.clean_text))
xtrain_seq = token.texts_to_sequences(xTrain.clean_text)
xvalid_seq = token.texts_to_sequences(xVal.clean_text)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [47]:
print(xtrain_pad.shape)
xtrain_pad[:2]

(15663, 70)


array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,   373,     9,  2861,
         4099,  2988,  3824,  5656,  4100,   341,  1015],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     

In [None]:
# Consider turning this array into dataframe and combining with the punctuation % characteristics mentioned above

In [48]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|████████████████████████████████████████████████████████████████████████| 15586/15586 [00:00<00:00, 106160.60it/s]


In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping


model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.5, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.5, recurrent_dropout=0.3))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, restore_best_weights=True)
model.fit(x=xtrain_pad, y=ytrain_enc, validation_data=(xvalid_pad, yvalid_enc),
          batch_size=64, epochs=200, verbose=1, callbacks=[earlystop])

Train on 15663 samples, validate on 3916 samples
Epoch 1/200
Epoch 2/200


Epoch 3/200


Epoch 4/200




In [60]:
test["clean_text"] = test['text'].apply(lambda x: clean_text(x))
test.head()

Unnamed: 0,id,text,clean_text
0,id02310,"Still, as I urged our leaving Ireland with suc...",still urg leav ireland inquietud impati father...
1,id24541,"If a fire wanted fanning, it could readily be ...",fire want fan could readili fan newspap govern...
2,id00134,And when they had broken down the frail door t...,broken frail door found two cleanli pick human...
3,id27757,While I was thinking how I should possibly man...,think possibl manag without one actual tumbl h...
4,id04081,I am not sure to what limit his knowledge may ...,sure limit knowledg may extend


In [None]:
# # create sentence vectors for test set 
# test_glove = np.array([sent2vec(x) for x in tqdm(test.clean_text)])
# test_glove_scl = scl.transform(test_glove)

In [61]:
test_seq = token.texts_to_sequences(test.clean_text)
test_pad = sequence.pad_sequences(test_seq, maxlen=max_len)

In [64]:
test_predictions = model.predict(test_pad)
test_predictions[:4]

array([[0.3115824 , 0.26607314, 0.4223445 ],
       [0.46715587, 0.28526133, 0.2475828 ],
       [0.35904002, 0.5713203 , 0.0696396 ],
       [0.2935729 , 0.5764069 , 0.1300202 ]], dtype=float32)

In [68]:
data = pd.concat([test.id,pd.DataFrame(test_predictions)], axis=1)
data.head()

Unnamed: 0,id,0,1,2
0,id02310,0.311582,0.266073,0.422345
1,id24541,0.467156,0.285261,0.247583
2,id00134,0.35904,0.57132,0.06964
3,id27757,0.293573,0.576407,0.13002
4,id04081,0.623533,0.171619,0.204848


In [72]:
data.columns=["id","EAP","HPL","MWS"]
data.to_csv("submit.csv", index=False)