[Thanks for the useful notebook.](https://www.kaggle.com/michawilkosz/twitter-sentiment-analysis-using-tensorflow#Model-test-harness)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# The CSV is read with header=None, so the six columns are named by position.
column_names = ['target', 'id', 'date', 'query', 'username', 'content']

data = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin',
    header=None,
)
data = data.rename(columns=dict(enumerate(column_names)))

# Raw polarity codes: 0 becomes 'Negative' and 4 becomes 'Positive'.
data['target'] = data['target'].replace({0: 'Negative', 4: 'Positive'})

In [None]:
# Number of whitespace-separated tokens in each tweet.
tokens_per_tweet = data['content'].str.split().map(len)
data['length'] = tokens_per_tweet

# Summary statistics of the token counts (displayed as the cell output).
data['length'].describe()

In [None]:
# Upper-tail quantiles of the tweet length (displayed as the cell output).
upper_tail_levels = [0.90, 0.95, 0.975, 0.995]
data['length'].quantile(upper_tail_levels)

In [None]:
# Keep only 'target' and 'content'; the other columns are dropped in place.
unused_columns = ['id', 'date', 'query', 'username', 'length']
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Encode the string labels as integers: Positive -> 1, Negative -> 0.
label_codes = {'Positive': 1, 'Negative': 0}
data['target'] = data['target'].replace(label_codes)

In [None]:
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
import re

english_stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Raw string so that ``\S`` reaches the regex engine instead of being parsed
# as an (invalid) string escape.  Alternatives: @mentions, http(s) URLs, and
# any run of characters that are not ASCII letters or digits.
# NOTE(review): the third alternative (``http?:\S``) consumes only a single
# character after the colon; it looks like ``\S+`` was intended.  The pattern
# is kept unchanged so the cleaned text stays the same.
regex = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Set copy of the stop-word list: membership is tested once per token, and a
# set lookup avoids scanning the whole list every time.
_stopword_set = frozenset(english_stopwords)


def preprocess(content, stem=False):
    """Clean one tweet and return its remaining tokens joined by spaces.

    The text is lower-cased, every match of ``regex`` is replaced by a space,
    and English stop words are removed.

    Args:
        content: The raw text; it is converted with ``str()`` first.
        stem: When true, each kept token is reduced with the Snowball
            stemmer.  This flag used to be ignored (tokens were always
            stemmed); it is now honoured.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    cleaned = re.sub(regex, ' ', str(content).lower()).strip()
    kept = (token for token in cleaned.split() if token not in _stopword_set)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)


# stem=True keeps the corpus stemmed, exactly as it was while the flag was
# being ignored.
data.content = data.content.apply(lambda text: preprocess(text, stem=True))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the tweets for validation (no random seed is fixed here).
X_train, X_val, y_train, y_val = train_test_split(
    data['content'],
    data['target'],
    test_size=0.1,
)

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def _build_embeddings_matrix(word_index, keyed_vectors, n_rows, dim):
    """Return an ``(n_rows, dim)`` matrix of pretrained vectors.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``.
    Words missing from ``keyed_vectors`` keep an all-zero row, as does row 0,
    which no word in ``word_index`` maps to.

    Args:
        word_index: Mapping of word -> integer index (``Tokenizer.word_index``).
        keyed_vectors: Loaded gensim ``KeyedVectors``.
        n_rows: Number of rows of the matrix (vocabulary size).
        dim: Dimensionality of the pretrained vectors.
    """
    matrix = np.zeros((n_rows, dim))
    for word, index in word_index.items():
        # ``in`` on KeyedVectors is a hash lookup; the previous
        # ``word in keyed_vectors.index2entity`` scanned a list for every
        # vocabulary word and relied on an attribute that newer gensim
        # releases no longer provide.
        if word in keyed_vectors:
            matrix[index] = keyed_vectors[word]
    return matrix


# Convert the GloVe text files to word2vec format, then load them.
glove_file1 = '../input/glove6b100dtxt/glove.6B.100d.txt'
word2vec_output_file1 = 'glove.6B.100d.txt.word2vec'
glove_loaded1 = glove2word2vec(glove_file1, word2vec_output_file1)
embeddings_dictionary1 = KeyedVectors.load_word2vec_format(word2vec_output_file1, binary=False)

glove_file2 = '../input/glove6b50dtxt/glove.6B.50d.txt'
word2vec_output_file2 = 'glove.6B.50d.txt.word2vec'
glove_loaded2 = glove2word2vec(glove_file2, word2vec_output_file2)
embeddings_dictionary2 = KeyedVectors.load_word2vec_format(word2vec_output_file2, binary=False)

# Vocabulary of the training split; +1 because word indices start at 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1

embeddings_matrix1 = _build_embeddings_matrix(
    tokenizer.word_index, embeddings_dictionary1, vocab_size, 100)
embeddings_matrix2 = _build_embeddings_matrix(
    tokenizer.word_index, embeddings_dictionary2, vocab_size, 50)

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import unique_labels

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.optimizers import Adam
import tensorflow as tf


class TemplateClassifier(BaseEstimator, ClassifierMixin):
    """Two-layer bidirectional LSTM over frozen pretrained GloVe embeddings.

    The estimator tokenizes raw text itself, so ``fit`` and ``predict`` take
    an iterable of strings.

    Parameters:
        vocab_size: Passed to ``Tokenizer(num_words=...)``; ``0`` means the
            full vocabulary seen during ``fit`` is used.
        max_length: Length every sequence is padded/truncated to.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        embedding: ``'glove100d'`` or ``'glove50d'``; selects the module-level
            ``embeddings_dictionary1`` / ``embeddings_dictionary2`` vectors.

    Attributes set by ``fit``:
        classes_: Unique labels found in ``y``.
        tokenizer: The fitted Keras ``Tokenizer``.
        vocab_size_: Number of rows of the embedding layer.
        model: The trained Keras model.
    """

    def __init__(self,vocab_size=0, max_length=50, epochs = 10, embedding = 'glove100d'):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.epochs = epochs
        self.embedding = embedding

    def _pretrained_vectors(self):
        """Return ``(keyed_vectors, dimension)`` for ``self.embedding``.

        Raises:
            ValueError: If ``self.embedding`` is not a supported name.
        """
        if self.embedding == 'glove100d':
            return embeddings_dictionary1, 100
        if self.embedding == 'glove50d':
            return embeddings_dictionary2, 50
        raise ValueError(
            "embedding must be 'glove100d' or 'glove50d', got %r" % (self.embedding,))

    def fit(self, X, y):
        """Tokenize ``X``, build the network and train it.

        10% of the data passed in is split off as a validation set for early
        stopping.

        Args:
            X: Iterable of text documents.
            y: Binary labels aligned with ``X``.

        Returns:
            self

        Raises:
            ValueError: If ``self.embedding`` is not a supported name.
        """
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y

        # Resolved first so an unsupported option fails before any training
        # work (it previously surfaced later as a NameError).
        keyed_vectors, embedding_dim = self._pretrained_vectors()

        X_train, X_val, y_train, y_val =  train_test_split(X, y, test_size=0.1)

        # The embedding size is kept in a fitted attribute instead of being
        # written back to ``self.vocab_size``: overwriting the constructor
        # parameter made the vocabulary grow by one on every refit.
        if self.vocab_size != 0:
            self.tokenizer = Tokenizer(num_words = self.vocab_size)
            self.tokenizer.fit_on_texts(X_train)
            input_dim = self.vocab_size + 1
        else:
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_texts(X_train)
            input_dim = len(self.tokenizer.word_index) + 1
        self.vocab_size_ = input_dim

        sequences_train = self.tokenizer.texts_to_sequences(X_train)
        sequences_val = self.tokenizer.texts_to_sequences(X_val)
        X_train = pad_sequences(sequences_train, maxlen=self.max_length, padding='post')
        X_val = pad_sequences(sequences_val, maxlen=self.max_length, padding='post')

        # The rows must follow THIS tokenizer's word indices.  Slicing the
        # module-level matrices (built from a different tokenizer) paired
        # words with the vectors of other words.
        embeddings_matrix = np.zeros((input_dim, embedding_dim))
        for word, index in self.tokenizer.word_index.items():
            if index < input_dim and word in keyed_vectors:
                embeddings_matrix[index] = keyed_vectors[word]

        embedding_layer = tf.keras.layers.Embedding(input_dim, embedding_dim,
                                                    input_length=self.max_length,
                                                    weights=[embeddings_matrix], trainable=False)

        self.model = Sequential([
            embedding_layer,
            tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
            tf.keras.layers.Bidirectional(LSTM(128)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(1, activation='sigmoid'),
            ])
        self.model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001),
                      metrics=['accuracy'])

        self.model.fit(X_train, y_train, epochs = self.epochs, batch_size = 1000,
              validation_data=(X_val, y_val),
              callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])

        return self

    def predict(self, X):
        """Return the network's sigmoid outputs for the documents in ``X``.

        NOTE(review): this is the raw output of ``model.predict`` (one value
        per document from the final ``Dense(1, sigmoid)`` layer), not
        thresholded class labels.

        Args:
            X: Iterable of text documents.
        """
        # Check is fit had been called
        check_is_fitted(self)

        sequences = self.tokenizer.texts_to_sequences(X)
        padded = pad_sequences(sequences, maxlen=self.max_length, padding='post')

        return self.model.predict(padded)

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

class TemplateClassifier2(BaseEstimator, ClassifierMixin):
    """TF-IDF features followed by logistic regression, on raw text.

    Attributes set by ``fit``:
        classes_: Unique labels found in ``y``.
        vectorizer: The fitted ``TfidfVectorizer``.
        classifier: The fitted ``LogisticRegression``.
    """

    def fit(self, X, y):
        """Fit the vectorizer and the classifier on ``X`` and ``y``.

        Args:
            X: Iterable of text documents.
            y: Labels aligned with ``X``.

        Returns:
            self, following the scikit-learn estimator convention, so calls
            such as ``clf.fit(X, y).predict(X)`` work.
        """
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y

        self.vectorizer = TfidfVectorizer()
        features = self.vectorizer.fit_transform(self.X_)

        self.classifier = LogisticRegression()
        self.classifier.fit(features, y)

        return self

    def predict(self, X):
        """Return the predicted label for each document in ``X``."""
        # Check is fit had been called
        check_is_fitted(self)

        features = self.vectorizer.transform(X)
        return self.classifier.predict(features)

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Constructor arguments of each LSTM base learner, keyed by its name in the
# stack; omitted arguments fall back to TemplateClassifier's defaults.
lstm_variants = {
    'g100d': {'vocab_size': 1000, 'max_length': 30, 'embedding': 'glove100d'},
    'g50d': {'max_length': 20, 'embedding': 'glove50d'},
    'g50d2': {'vocab_size': 500, 'max_length': 20, 'embedding': 'glove50d'},
}
estimators = [
    (name, TemplateClassifier(**params)) for name, params in lstm_variants.items()
]
# The TF-IDF + logistic-regression model is the last base learner.
estimators.append(('lr', TemplateClassifier2()))

# Gradient boosting combines the base learners' predictions.
reg = StackingClassifier(
    estimators=estimators,
    final_estimator=GradientBoostingClassifier(),
    cv=2,
)
reg.fit(X_train, y_train)

final_predictions = reg.predict(X_val)

In [None]:
# Validation accuracy: the share of stacked predictions equal to the true
# label.  The mean of the boolean comparison is computed in one vectorized
# call; the builtin sum() walked the result one element at a time in Python.
(final_predictions == y_val).mean()