In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from keras.layers import Dropout, Dense, GRU,Input,Embedding,Flatten, MaxPooling1D, Conv1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=500,
                       GLOVE_FILE="glove.6B.50d.txt"):
    """
    Tokenize and pad the train/test texts, and load GloVe word vectors.

    The tokenizer is fitted on the concatenation of X_train and X_test, so
    both splits share one vocabulary.

    X_train, X_test are sequences of raw text strings,
    MAX_NB_WORDS is passed to the Keras Tokenizer as num_words,
    MAX_SEQUENCE_LENGTH is the length every sequence is padded/truncated to,
    GLOVE_FILE is the path of the GloVe text file (one "word v1 v2 ..." per line).

    Returns (X_train, X_test, word_index, embeddings_index): the two padded
    integer arrays, the tokenizer's word -> index dict, and a
    word -> float32 vector dict read from GLOVE_FILE.
    """
    # Seeds NumPy's global RNG; kept because later code draws from np.random.
    np.random.seed(7)

    # Remember the split point before X_train is rebound to the padded array.
    n_train = len(X_train)
    text = np.array(np.concatenate((X_train, X_test), axis=0))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    X_train = text[:n_train]
    X_test = text[n_train:]

    embeddings_index = {}
    # `with` guarantees the file is closed even if parsing raises.
    with open(GLOVE_FILE, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed vector: skip the word instead of storing the
                # previous line's coefficients under it.
                continue
            embeddings_index[values[0]] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)


In [None]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    Build and compile a stacked-GRU text classifier over a pre-initialised embedding layer.

    word_index is the word -> integer index dict from the tokenizer,
    embeddings_index is the word -> vector dict of pre-trained embeddings,
    nclasses is the number of output classes (size of the softmax layer),
    MAX_SEQUENCE_LENGTH is the length of the padded input sequences,
    EMBEDDING_DIM is the length of each embedding vector,
    dropout is the rate used by every Dropout layer.

    Returns the compiled Keras Sequential model.
    Raises ValueError if a pre-trained vector's length differs from EMBEDDING_DIM.
    """
    hidden_layer = 3
    gru_node = 32

    # Every row starts as uniform random values; rows for words present in
    # embeddings_index are overwritten below, the rest stay random.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            # Raise rather than call exit(): an exception stops the cell at
            # the real cause and leaves the notebook kernel alive.
            raise ValueError(
                "EMBEDDING_DIM is %d but the vector for %r has %d values; "
                "make sure EMBEDDING_DIM matches the embedding file (GloVe)."
                % (EMBEDDING_DIM, word, len(embedding_vector)))
        embedding_matrix[i] = embedding_vector

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))

    print(gru_node)
    # Intermediate GRU layers return the full sequence so the next GRU can consume it.
    for _ in range(hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    # The last GRU returns only its final state, which feeds the dense head.
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [None]:
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, MaxPooling1D, Conv1D
from keras.models import Sequential,Model
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.merge import Concatenate

In [None]:
def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """
    Build and compile a multi-filter-size 1D CNN text classifier.

    word_index is the word -> integer index dict from the tokenizer,
    embeddings_index is the word -> vector dict of pre-trained embeddings,
    nclasses is the number of output classes (size of the softmax layer),
    MAX_SEQUENCE_LENGTH is the length of the padded input sequences,
    EMBEDDING_DIM is the length of each embedding vector,
    dropout is the rate used by every Dropout layer.

    Returns the compiled Keras functional Model.
    Raises ValueError if a pre-trained vector's length differs from EMBEDDING_DIM.
    """
    # Every row starts as uniform random values; rows for words present in
    # embeddings_index are overwritten below, the rest stay random.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            # Raise rather than call exit(): an exception stops the cell at
            # the real cause and leaves the notebook kernel alive.
            raise ValueError(
                "EMBEDDING_DIM is %d but the vector for %r has %d values; "
                "make sure EMBEDDING_DIM matches the embedding file (GloVe)."
                % (EMBEDDING_DIM, word, len(embedding_vector)))
        embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # One parallel convolution branch per kernel size 2, 3, ..., layer + 1.
    layer = 5
    print("Filter  ",layer)
    filter_sizes = list(range(2, layer + 2))

    node = 128
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    for fsz in filter_sizes:
        l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        convs.append(l_pool)

    # Branch outputs are joined along axis 1 before the shared conv stack.
    l_merge = Concatenate(axis=1)(convs)
    l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
    l_cov1 = Dropout(dropout)(l_cov1)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
    l_cov2 = Dropout(dropout)(l_cov2)
    l_pool2 = MaxPooling1D(30)(l_cov2)
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(1024, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)
    l_dense = Dense(512, activation='relu')(l_dense)
    l_dense = Dropout(dropout)(l_dense)
    preds = Dense(nclasses, activation='softmax')(l_dense)
    model = Model(sequence_input, preds)

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:

# Load the labelled data set; the relative path means train.csv must sit in
# the notebook's working directory.
data_clean = pd.read_csv("train.csv")

In [None]:
# Keep only the identifier, text and numeric-label columns, then preview the first rows.
data_clean = data_clean.loc[:, ["ID", "text", "label_num"]]
data_clean.head()

Unnamed: 0,ID,text,label_num
0,1,- It is not our fight - Are we not part of thi...,2
1,2,THAT'S THE DIFFERENCE BETWEEN YOU AND ME YOU...,0
2,3,- WHAT DO THE TITANIC AND THE SIXTH SENSE HAVE...,0
3,4,"""COME ON MAN, YOU KNOW THE THING.\r\nJUST ASK ...",2
4,5,"""Those who believe without reason cannot be co...",0


In [None]:

# Features are the text column; targets are the integer label column.
X_train, y_train = data_clean['text'], data_clean['label_num']

X_train


0       - It is not our fight - Are we not part of thi...
1        THAT'S THE DIFFERENCE BETWEEN YOU AND ME  YOU...
2       - WHAT DO THE TITANIC AND THE SIXTH SENSE HAVE...
3       "COME ON MAN, YOU KNOW THE THING.\r\nJUST ASK ...
4       "Those who believe without reason cannot be co...
                              ...                        
1986                   Yup  still fabulous THEFRESHPICKLE
1987               ZILLOW SAID YOUR HOME WAS WORTH WHAT?!
1988    ZUCKERBERG GETS MONEY AND THE ANNOYING PEOPLE ...
1989                                               #NAME?
1990                                               #NAME?
Name: text, Length: 1991, dtype: object

In [None]:
!pip install autocorrect
nltk.download('punkt')
nltk.download('wordnet')
from autocorrect import Speller
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def text_cleaner(text):
    """
    Strip HTML-like markup from `text`, normalise whitespace and lower-case it.

    The substitutions run in the order listed, each on the output of the
    previous one; afterwards trailing whitespace is removed and the result
    is lower-cased.
    """
    substitutions = (
        (r'>\s+', u'>'),                                # drop whitespace right after a '>'
        (r'\s+', u' '),                                 # collapse whitespace runs to one space
        (r'\s*<br\s*/?>\s*', u'\n'),                    # <br> becomes a newline
        (r'</(div)\s*>\s*', u'\n'),                     # </div> becomes a newline
        (r'</(p|h\d)\s*>\s*', u'\n\n'),                 # </p> and </hN> become a blank line
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),       # remove the <head> section
        (r'<a\s+href="([^"]+)"[^>]*>.*</a>', r'\1'),    # replace a link with its URL
        (r'[ \t]*<[^<]*?/?>', u''),                     # remove any remaining tags
        (r'^\s+', u''),                                 # trim leading whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.rstrip().lower()

In [None]:
nltk.download('stopwords')
# One stemmer is enough; both original names stay bound to it.
stemmer = ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
words = stopwords.words("english")
# Set membership is O(1). The comparison below lower-cases each token because
# much of the text is upper-case and would otherwise never match the
# (presumably lower-case) stop-word list.
stop_set = set(words)

X_train_cleaned = []
for line in X_train:
    kept = []
    for w in word_tokenize(line):
        if w.lower() in stop_set:
            continue
        # Stem, then lemmatize, and keep the lemmatizer's result (it was
        # previously computed and thrown away).
        kept.append(lemmatizer.lemmatize(stemmer.stem(w)))
    X_train_cleaned.append(text_cleaner(" ".join(kept)))

X_train

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0       - It is not our fight - Are we not part of thi...
1        THAT'S THE DIFFERENCE BETWEEN YOU AND ME  YOU...
2       - WHAT DO THE TITANIC AND THE SIXTH SENSE HAVE...
3       "COME ON MAN, YOU KNOW THE THING.\r\nJUST ASK ...
4       "Those who believe without reason cannot be co...
                              ...                        
1986                   Yup  still fabulous THEFRESHPICKLE
1987               ZILLOW SAID YOUR HOME WAS WORTH WHAT?!
1988    ZUCKERBERG GETS MONEY AND THE ANNOYING PEOPLE ...
1989                                               #NAME?
1990                                               #NAME?
Name: text, Length: 1991, dtype: object

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): this splits the raw X_train, not X_train_cleaned - confirm that is intended.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=0,
)

In [None]:
def tokenize(text):
    """Split `text` into tokens using a freshly created NLTK TweetTokenizer."""
    return TweetTokenizer().tokenize(text)

en_stopwords = set(stopwords.words("english"))

# Bag-of-words features built with the custom tokenizer (unigrams, case preserved).
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = False,
    ngram_range=(1, 1),
    # scikit-learn expects 'english', a list, or None here; a set is rejected
    # by recent versions.
    stop_words = sorted(en_stopwords))
vectorizer.fit(X_Train)

# Store the count matrices under new names: X_Train / X_Test must stay raw
# text because loadData_Tokenizer tokenizes them itself.
X_Train_counts = vectorizer.transform(X_Train)
X_Test_counts  = vectorizer.transform(X_Test)



In [None]:

X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(X_Train,X_Test)

# Size the softmax layer from the labels instead of a hard-coded 20.
# With the sparse categorical loss, labels must be integer ids in [0, n_classes).
n_classes = int(max(Y_Train.max(), Y_Test.max())) + 1

model_CNN = Build_Model_CNN_Text(word_index,embeddings_index, n_classes)

model_CNN.fit(X_train_Glove, Y_Train,
                              validation_data=(X_test_Glove, Y_Test),
                              epochs=10,
                              batch_size=128,
                              verbose=2)

# predict() returns one probability per class for each row; take the argmax
# to get class ids that can be compared with Y_Test.
predicted = model_CNN.predict(X_test_Glove)
predicted_labels = np.argmax(predicted, axis=1)

print(metrics.classification_report(Y_Test, predicted_labels))

Found 5087 unique tokens.
(1991, 500)
Total 81060 word vectors.
Filter   5
Epoch 1/10
12/12 - 30s - loss: 1.6209 - accuracy: 0.2974 - val_loss: 1.6904 - val_accuracy: 0.3554
Epoch 2/10
12/12 - 28s - loss: 1.1915 - accuracy: 0.3403 - val_loss: 1.6949 - val_accuracy: 0.3554
Epoch 3/10
12/12 - 28s - loss: 1.1676 - accuracy: 0.3416 - val_loss: 1.7295 - val_accuracy: 0.2972
Epoch 4/10
12/12 - 28s - loss: 1.1631 - accuracy: 0.3255 - val_loss: 1.6829 - val_accuracy: 0.3574
Epoch 5/10
12/12 - 28s - loss: 1.1277 - accuracy: 0.3382 - val_loss: 1.6395 - val_accuracy: 0.2972
Epoch 6/10
12/12 - 28s - loss: 1.1288 - accuracy: 0.3329 - val_loss: 1.4732 - val_accuracy: 0.3414
Epoch 7/10
12/12 - 28s - loss: 1.1225 - accuracy: 0.3510 - val_loss: 1.5003 - val_accuracy: 0.2972
Epoch 8/10
12/12 - 28s - loss: 1.1264 - accuracy: 0.3416 - val_loss: 1.4843 - val_accuracy: 0.3474
Epoch 9/10
12/12 - 28s - loss: 1.1209 - accuracy: 0.3376 - val_loss: 1.4956 - val_accuracy: 0.3353
Epoch 10/10
12/12 - 28s - loss: 1.

In [None]:
# Display the held-out labels for a quick visual check.
Y_Test

1292    0
1310    1
960     2
107     0
1556    2
       ..
1517    0
1408    0
1218    1
1431    0
597     2
Name: label_num, Length: 498, dtype: int64