In [1]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset
import pandas as pd

In [2]:
# Report how many GPU devices TensorFlow can see before setting up training.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  2


In [3]:
# Distribution strategy for the GPUs reported above; the model is later built
# and compiled inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()

In [4]:
# Load the train and test CSVs from the Kaggle "nlp-getting-started" input directory.
train=pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test=pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [5]:
# Keep only the "text" column of each frame as model input; the labels are
# taken from train["target"] when the train/validation split is made.
x_train=train["text"]
x_test=test["text"]


In [6]:
import re
def clean_tweet(text):
    """Strip URLs, hashtags and @mentions from a tweet and normalise spacing.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The tweet with every ``http...``/``www...`` token, ``#hashtag`` and
        ``@mention`` removed, and all runs of whitespace collapsed to a single
        space (leading/trailing whitespace is dropped).
    """
    # Remove URLs first so '#' or '@' characters inside a link go with it.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove Hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove Mentions
    text = re.sub(r'@\w+', '', text)
    # Remove extra spaces left behind by the removals above
    return ' '.join(text.split())


# Clean every training tweet in a single pass over the Series.
x_train = x_train.apply(clean_tweet)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the cleaned training tweets for validation; the fixed seed
# keeps the split reproducible across runs.
target = train["target"]
X_train, X_val, y_train, y_val = train_test_split(
    x_train,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inspect the validation labels produced by the split.
y_val

2644    1
2227    0
5448    1
132     0
6845    0
       ..
1835    0
506     1
3592    1
6740    1
1634    0
Name: target, Length: 1523, dtype: int64

In [9]:
import keras_nlp
from transformers import BertTokenizer, TFBertModel
import keras

# KerasNLP preset name shared by the preprocessor and the classifier.
preset = "distil_bert_base_en_uncased"

# Build and compile everything under the distribution strategy's scope.
with strategy.scope():
    # Text preprocessing layer configured with a sequence length of 160.
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        preset,
        sequence_length=160,
        name="preprocessor_4_tweets",
    )

    # Two-class DistilBERT classifier that uses the preprocessor above.
    classifier = keras_nlp.models.DistilBertClassifier.from_preset(
        preset,
        preprocessor=preprocessor,
        num_classes=2,
    )

    # from_logits=True: the loss is told the model outputs are raw logits.
    classifier.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.Adam(1e-5),
        metrics=['accuracy'],
    )

classifier.summary()

In [10]:

# Fine-tune for 3 epochs on the training split, evaluating on the validation
# split after each epoch.
classifier.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=3,
)


Epoch 1/3
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 422ms/step - accuracy: 0.6799 - loss: 0.6044 - val_accuracy: 0.8255 - val_loss: 0.4164
Epoch 2/3
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 422ms/step - accuracy: 0.8262 - loss: 0.4022 - val_accuracy: 0.8281 - val_loss: 0.4013
Epoch 3/3
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 422ms/step - accuracy: 0.8717 - loss: 0.3290 - val_accuracy: 0.8150 - val_loss: 0.4217


<keras.src.callbacks.history.History at 0x7ea2387dfc70>

In [11]:
import re
# Apply the same clean-up to the test tweets that was applied to the training
# tweets: strip URLs, then hashtags, then mentions, in that order.
removal_patterns = (
    r'http\S+|www\S+',  # URLs
    r'#\w+',            # hashtags
    r'@\w+',            # mentions
)
for removal_pattern in removal_patterns:
    x_test = x_test.apply(lambda tweet: re.sub(removal_pattern, '', tweet))

# Collapse the whitespace left behind by the removals.
x_test = x_test.apply(lambda tweet: ' '.join(tweet.split()))

In [12]:
# Predict on the *cleaned* test tweets (x_test), not the raw test["text"]
# column: the classifier was fine-tuned on text with URLs, hashtags and
# mentions stripped, so inference must see the same preprocessing.
prediction = classifier.predict(x=x_test, batch_size=2)

[1m1632/1632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 14ms/step


In [13]:
# Sanity check: one row per test tweet, one column per class.
prediction.shape

(3263, 2)

In [14]:
import numpy as np
# Start from the competition's sample submission file and overwrite its
# "target" column with the predicted class (index of the largest score per row).
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
predicted_labels = np.argmax(prediction, axis=1)
sample_submission["target"] = predicted_labels

In [15]:
# Preview the submission frame (id / target) before writing it out.
sample_submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [16]:

# Write the predictions to submission.csv, omitting the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)