In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import string

import warnings
warnings.filterwarnings('ignore')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
tweet = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

### Data Cleaning

In [3]:
df=pd.concat([tweet,test], axis=0)
df.shape

(10876, 5)

### Removing urls

In [4]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

remove_URL(example)

'New competition launched :'

In [5]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

### Removing HTML tags

In [6]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [7]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [8]:
df['text']=df['text'].apply(lambda x : remove_html(x))

### Romoving Emojis

In [9]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [10]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))

### Removing punctuations

In [11]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

example="I am a #king"
print(remove_punct(example))

I am a king


In [12]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

### Spelling Correction
Even if I'm not good at spelling I can correct it with python :) I will use **pyspellcheker** to do that.

In [13]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "corect me plese"
correct_spellings(text)

'correct me plese'

In [14]:
#df['text']=df['text'].apply(lambda x : correct_spellings(x))

## GloVe for Vectorization
Here we will use GloVe pretrained corpus model to represent our words.It is available in 3 varieties :50D ,100D and 200 Dimentional.We will try 100 D here.

In [15]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

In [16]:
corpus=create_corpus(df)

100%|██████████| 10876/10876 [00:03<00:00, 3543.71it/s]


In [17]:
embedding_dict={}
with open('../data/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r', encoding="utf8") as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [18]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [19]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 20342


In [20]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 20342/20342 [00:00<00:00, 120708.70it/s]


In [21]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])



In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           2034300   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 100)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,076,605
Trainable params: 42,305
Non-trainable params: 2,034,300
_________________________________________________________________


In [24]:
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

In [25]:
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [26]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Epoch 1/15
1618/1618 - 264s - loss: 0.6898 - accuracy: 0.5707 - val_loss: 0.6724 - val_accuracy: 0.5692 - 264s/epoch - 163ms/step
Epoch 2/15
1618/1618 - 94s - loss: 0.5926 - accuracy: 0.7034 - val_loss: 0.5278 - val_accuracy: 0.7644 - 94s/epoch - 58ms/step
Epoch 3/15
1618/1618 - 91s - loss: 0.5437 - accuracy: 0.7484 - val_loss: 0.5064 - val_accuracy: 0.7785 - 91s/epoch - 56ms/step
Epoch 4/15
1618/1618 - 91s - loss: 0.5276 - accuracy: 0.7557 - val_loss: 0.4929 - val_accuracy: 0.7785 - 91s/epoch - 57ms/step
Epoch 5/15
1618/1618 - 92s - loss: 0.5188 - accuracy: 0.7600 - val_loss: 0.4804 - val_accuracy: 0.7907 - 92s/epoch - 57ms/step
Epoch 6/15
1618/1618 - 91s - loss: 0.5079 - accuracy: 0.7677 - val_loss: 0.4733 - val_accuracy: 0.7942 - 91s/epoch - 56ms/step
Epoch 7/15
1618/1618 - 90s - loss: 0.5037 - accuracy: 0.7693 - val_loss: 0.4703 - val_accuracy: 0.7960 - 90s/epoch - 56ms/step
Epoch 8/15
1618/1618 - 91s - loss: 0.5000 - accuracy: 0.7738 - val_loss: 0.4617 - val_accuracy: 0.8012 - 91s

## Making our submission

In [27]:
sample_sub=pd.read_csv('../data/sample_submission.csv')

In [30]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('../out/submission2.csv',index=False)

In [31]:
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
