In [1]:
import pandas as pd 
import numpy as np
import re
from matplotlib import pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation


In [2]:
train = pd.read_csv('train_E6oV3lV.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [4]:
y_train = train.label.values

In [93]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional, LSTM, GRU, Dense, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Embedding, SpatialDropout1D, Dropout, BatchNormalization, Conv1D, concatenate, MaxPooling2D, AveragePooling2D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras import backend as K

from sklearn.model_selection import train_test_split

In [6]:
X_train = train.tweet.values
X_test = test.tweet.values

In [7]:
max_feature = 40000
maxlen = 150
embedding_size = 300

In [8]:
print('Indexing word vectors')

#Glove Vectors
embeddings_index = {}
counter = 0
f = open('/home/paperspace/Desktop/Kaggle/Embeddings/glove.840B.300d.txt')
for line in f:
    #line = line.encode('ascii', 'replace')
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    #print(word)
#    counter = counter +1
    #print(values[1:])
    #if counter==2:
        #break
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Indexing word vectors
Total 2195892 word vectors.


In [12]:
#Regex to remove all Non-Alpha Numeric and space
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)

#regex to replace all numerics
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

In [13]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    text = re.sub('user', '', text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [14]:
comments = []
for text in X_train:
    comments.append(text_to_wordlist(text, remove_stopwords=True))
    
test_comments=[]
for text in X_test:
    test_comments.append(text_to_wordlist(text, remove_stopwords=True))

In [15]:
tokenize = Tokenizer(num_words=max_feature)
tokenize.fit_on_texts(comments + test_comments)
X_train = tokenize.texts_to_sequences(comments)
X_test = tokenize.texts_to_sequences(test_comments)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

In [16]:
word_index = tokenize.word_index
nb_words = min(max_feature, len(word_index))
embedding_matrix = np.zeros((nb_words, embedding_size))

In [17]:
# create a weight matrix for words in training docs
for word, i in word_index.items():
    if i >= max_feature: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [76]:
from sklearn.metrics import f1_score

class f1Evaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1, batch_size = 512)
            y_pred[y_pred>=0.5]=1
            y_pred[y_pred<=0.5]=0
            score = f1_score(self.y_val, y_pred)
            print("\n F1 - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [109]:
def get_model():
    inp = Input((maxlen,))
    embed = Embedding(input_dim=max_feature, output_dim=embedding_size, weights = [embedding_matrix], trainable = True)(inp)
    x = SpatialDropout1D(0.25)(embed)
    x = Bidirectional(LSTM(300, return_sequences = True))(x)
    avg = GlobalAveragePooling1D()(x)
    maxpool = GlobalMaxPooling1D()(x)
    con = concatenate([avg, maxpool])
    x = Dense(124, activation='relu')(con)
    x = Dropout(0.2)(x)    
    out = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=inp, outputs=out)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    
    return model

In [110]:
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 150, 300)     12000000    input_12[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_12 (SpatialDr (None, 150, 300)     0           embedding_12[0][0]               
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional (None, 150, 600)     1442400     spatial_dropout1d_12[0][0]       
__________________________________________________________________________________________________
global_ave

In [106]:
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, train_size=0.80, random_state=233)



In [111]:
batch_size = 1024
epochs = 50

In [102]:
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [113]:
earlyStp = EarlyStopping(patience=6)
modelChePnt = ModelCheckpoint('bi_LSTM.hdf5', save_best_only=True, verbose=1)
f1 = f1Evaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(x = X_tra, y = y_tra, epochs = epochs, validation_data=[X_val, y_val], batch_size=batch_size, callbacks=[earlyStp, modelChePnt, f1])
# model.load_weights('bi_LSTM.hdf5')

Train on 25569 samples, validate on 6393 samples
Epoch 1/50

 F1 - epoch: 1 - score: 0.715924 

Epoch 2/50

 F1 - epoch: 2 - score: 0.729295 

Epoch 3/50

 F1 - epoch: 3 - score: 0.725248 

Epoch 4/50

 F1 - epoch: 4 - score: 0.727273 

Epoch 5/50

 F1 - epoch: 5 - score: 0.721271 

Epoch 6/50

 F1 - epoch: 6 - score: 0.704094 

Epoch 7/50

 F1 - epoch: 7 - score: 0.716526 



In [114]:
y_pred = model.predict(X_test, batch_size=512, verbose=1)



In [115]:
submission = pd.read_csv('sample_submission_3Mm4cJo.csv')
submission['id'] = test.id
submission['label'] = y_pred

In [116]:
thres = 0.5
submission['label'].loc[submission['label']>=thres]=1
submission['label'].loc[submission['label']<thres]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [117]:
submission['label'] = submission.label.astype('int')
submission.head()

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0


In [118]:
submission.label.value_counts()

0    16041
1     1156
Name: label, dtype: int64

In [63]:
submission.to_csv('sub.csv', index=False)

In [119]:
test_comments[1]

' white supremacists want everyone see new  birds movie  heres'

In [93]:
y_pred[y_pred>=0.1]=0
y_pred[y_pred<=0.1]=1
Counter(y_pred.astype('int')[:, 0])

Counter({1: 17197})

In [115]:
gen = 'you might be a libtard if... #libtard  #sjw #liberal #politics '

special_character_removal.sub('', replace_numbers.sub('', gen))

'you might be a libtard if libtard  sjw liberal politics '

In [105]:
blob = TextBlob('awsm')
blob.correct()

TextBlob("was")

In [106]:
import itertools

In [112]:
''.join(''.join(s)[:2] for _,s in itertools.groupby('sooo'))

'soo'

ImportError: No module named 'preprocessor'