In [None]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
import pandas as pd
import re
import string
from collections import Counter
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK data used below: the stop-word list and the WordNet corpus
# behind WordNetLemmatizer. 'omw-1.4' is presumably needed by the WordNet
# reader in recent NLTK releases -- TODO confirm for the installed version.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Load the labelled training set, the unlabelled test set and the submission
# template, all from the current working directory.
dfTrain, dfTest, submission = [
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
]

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Built once at import time instead of on every clean_text() call.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_MENTION_RE = re.compile(r'@\S+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw text string for tokenization.

    Steps, in order:
      1. remove URLs (http://, https://, www.) and @mentions;
      2. delete every character in ``string.punctuation`` and lowercase;
      3. split on whitespace and drop English stop words;
      4. lemmatize each token as a noun, then as a verb;
      5. drop tokens whose lemma is a stop word.

    Args:
        text: the raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = text.translate(_PUNCT_TABLE).lower()

    # split() (not split(" ")) so the gaps left by removed URLs/mentions do
    # not produce empty tokens and runs of spaces in the output.
    # Stop words are filtered BEFORE lemmatizing: the WordNet lemmatizer
    # rewrites some of them into non-stop-words (e.g. "was" -> "wa"), which a
    # post-lemmatization filter alone would let through.
    tokens = [token for token in text.split() if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token, "v") for token in tokens]
    # Second pass keeps the original behaviour of dropping tokens whose
    # lemma is itself a stop word (e.g. "having" -> "have").
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)


In [None]:
# Clean the 'text' column of both frames, overwriting the raw text in place.
for frame in (dfTrain, dfTest):
    frame['text'] = frame['text'].apply(clean_text)

In [None]:
def counter_word(train_text, test_text):
    """Count whitespace-separated tokens across two text collections.

    Args:
        train_text: object exposing ``.values`` that iterates over strings
            (called below with a DataFrame column).
        test_text: same kind of object; its tokens are added to the same tally.

    Returns:
        A ``collections.Counter`` mapping each token to its total number of
        occurrences in both collections combined.
    """
    tally = Counter()
    for corpus in (train_text, test_text):
        for document in corpus.values:
            tally.update(document.split())
    return tally

# Token frequencies over the cleaned train AND test text combined. In the
# cells visible here only its size is used (see num_unique_words).
counter = counter_word(dfTrain['text'], dfTest['text'])

In [None]:
# Number of distinct tokens in the cleaned train + test text. Used below as the
# Tokenizer's num_words cap and as the Embedding layer's input dimension.
num_unique_words = len(counter)

In [None]:
# Hold out 5% of the labelled rows for validation.
# The rows are shuffled (with a fixed seed, so the split is reproducible)
# before slicing: taking the last 5% in file order yields an unrepresentative
# validation set whenever train.csv is ordered by any column.
SPLIT_SEED = 42
shuffled_train = dfTrain.sample(frac=1.0, random_state=SPLIT_SEED)
train_size = int(len(shuffled_train) * 0.95)

train_df = shuffled_train.iloc[:train_size]
val_df = shuffled_train.iloc[train_size:]

# Plain NumPy arrays of cleaned text and of the 'target' labels per split.
train_sentences = train_df['text'].to_numpy()
train_labels = train_df['target'].to_numpy()
val_sentences = val_df['text'].to_numpy()
val_labels = val_df['target'].to_numpy()

# The test set has no labels here; only its text is extracted.
test_sentences = dfTest['text'].to_numpy()

In [None]:
# Build the word -> integer index from the training split only; validation and
# test text do not contribute to the vocabulary.
# NOTE(review): no oov_token is configured, so words unseen in training are
# presumably dropped by texts_to_sequences -- confirm against the Keras docs.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)

In [None]:
# Encode each split as lists of integer word indices using the fitted tokenizer.
# Caution: train_sentences and val_sentences are rebound here from raw text to
# integer sequences, so re-running this cell without first re-running the split
# cell would feed already-encoded sequences back into the tokenizer.
train_sentences, val_sentences, test_sequences = [
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_sentences, val_sentences, test_sentences)
]

In [None]:
# Length of the longest encoded sequence found in any of the three splits.
max_length = max(
    max(len(sequence) for sequence in encoded_split)
    for encoded_split in (train_sentences, val_sentences, test_sequences)
)

# Every split is padded (and, if ever needed, truncated) at the end so that all
# rows have exactly max_length entries.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_padded = pad_sequences(train_sentences, **pad_options)
val_padded = pad_sequences(val_sentences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Binary classifier: embedding -> two stacked LSTMs -> sigmoid output, with
# light dropout between layers.
model = keras.Sequential([
    # mask_zero=True marks index 0 as padding so the LSTMs skip it. The inputs
    # are post-padded, so without a mask each LSTM would keep stepping through
    # the trailing zeros and its final state would be dominated by padding
    # rather than by the last real token.
    # Assumes index 0 is produced only by padding (Keras Tokenizer word
    # indices start at 1) -- verify if the tokenizer is ever swapped out.
    layers.Embedding(num_unique_words, 86, input_length=max_length, mask_zero=True),
    layers.Dropout(0.1),
    # return_sequences=True so the second LSTM receives one vector per timestep.
    layers.LSTM(128, dropout=0.1, return_sequences=True, recurrent_dropout=0.2),
    layers.Dropout(0.1),
    layers.LSTM(128, dropout=0.1, recurrent_dropout=0.2),
    layers.Dropout(0.1),
    # Single sigmoid unit: one probability per input sequence.
    layers.Dense(1, activation='sigmoid')
])

loss = keras.losses.BinaryCrossentropy()
optim = keras.optimizers.Adam(learning_rate=0.002)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [None]:
# Training hyper-parameters, named so they are easy to find and tune.
EPOCHS = 4
BATCH_SIZE = 32

# Train on the padded training split and report loss/accuracy on the held-out
# validation split after every epoch.
model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_padded, val_labels),
)

In [None]:
# The model ends in a single sigmoid unit, so predict() yields one probability
# per test row; flatten and threshold at 0.5 to obtain hard 0/1 labels.
probabilities = model.predict(test_padded)
predictions = (probabilities.ravel() > 0.5).astype(int).tolist()

# Write the labels into the submission template and save it without the index.
# Assumes sample_submission.csv rows are in the same order as test.csv.
submission['target'] = predictions
submission.to_csv('submission.csv', index=False)