In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip /content/drive/MyDrive/shai-training-2023-a-level-2.zip

In [None]:
train=pd.read_csv('/content/Train.csv')

In [None]:
valid=pd.read_csv('/content/Valid.csv')

In [None]:
test=pd.read_csv('/content/Test.csv')

In [None]:
test.shape

In [None]:
train.head()

#EDA

In [None]:
from matplotlib import pyplot as plt
# Take both the labels and their counts from the SAME value_counts() result.
# unique() returns labels in order of first appearance while value_counts()
# sorts by frequency, so pairing the two can attach a count to the wrong label.
label_counts=train['label'].value_counts().sort_index()
fig=plt.figure(figsize=(10,7))
plt.barh(label_counts.index,
         label_counts.values,
         color=['red','blue'])
plt.title('count label')

In [None]:
def movie_count(data):
  """Return, for each distinct 'label' value, the non-null count of every other column.

  data: DataFrame that contains a 'label' column.
  The result is indexed by label and has one column per remaining column of `data`.
  """
  per_label=data.groupby('label')
  return per_label.count()

In [None]:
train_count=movie_count(train)
train_count

In [None]:
train_count.plot.barh()
plt.title('train count')

#count of valid 

In [None]:
valid_count=movie_count(valid)
valid_count

In [None]:
# Same per-label bar chart as for the train split, titled so the figure stands alone.
valid_count.plot.barh()
plt.title('valid count')

#word_count

In [None]:
# split() without a separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines no longer inflate the count with empty "words".
train['word_count']=train['text'].apply(lambda x:len(x.split()))
train['word_count']

In [None]:
train['word_count'].describe()

#char_count

In [None]:
train['char_count']=train['text'].apply(len)
train['char_count']

In [None]:
train['char_count'].describe()

#vocab

In [None]:
# Word frequencies over the raw training text.
# split() without a separator avoids counting '' (from repeated spaces) and
# tokens glued together by newlines/tabs as vocabulary entries.
vocab=Counter(word for text in train['text'] for word in text.split())
# Show only the vocabulary size; printing the whole Counter floods the output.
len(vocab)

#least common

In [None]:
# The 10 rarest words, rarest first. The previous slice [:-10] returned
# everything EXCEPT the 10 rarest entries.
vocab.most_common()[:-11:-1]

In [None]:
vocab.most_common(n=10)

In [None]:
plt.hist(train['word_count'])

In [None]:
plt.hist(train['char_count'])

In [None]:

import nltk
from nltk.corpus import (stopwords,)
nltk.download("stopwords")

In [None]:
## create function to clean up text
import string
def clean_text(text,remove_stopwords=False):
  """Normalise one raw text string for tokenisation.

  Steps, in order: lower-case; strip HTML tags, URLs, e-mail addresses,
  @mentions and #hashtags; drop every token that contains a digit (this also
  removes plain numbers); replace punctuation with spaces; collapse whitespace
  runs to a single space and trim both ends.

  Args:
    text: the string to clean.
    remove_stopwords: when True, additionally drop English stop words using the
      NLTK `stopwords` corpus (imported and downloaded earlier in the notebook).

  Returns:
    The cleaned, lower-cased string with words separated by single spaces.
  """
  text=text.lower()
  # HTML tags such as "<br />" or "</p>"; the old pattern "<.?>" only matched
  # tags with at most one character between the brackets.
  text=re.sub(r"</?[a-z][^<>]*>", " ", text)
  # URLs first, while "://" and "." are still present; "https?" covers both
  # schemes (the old "http?" matched "htt://" and "http://" but never "https://").
  text=re.sub(r"https?://\S+", " ", text)
  # E-mail addresses, including one at the very end of the text (the old
  # pattern required trailing whitespace).
  text=re.sub(r"\S+@\S+", " ", text)
  text=re.sub(r"@\S*", " ", text)  # mentions
  text=re.sub(r"#\S*", " ", text)  # hashtags
  # Whole tokens containing a digit ("mp3", "100"). Doing this in one pass
  # avoids stripping the digits first and leaving word fragments behind.
  text=re.sub(r"\w*\d\w*", " ", text)
  text=re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # punctuation
  # Collapse whitespace to ONE space (previously two) and trim the ends.
  text=re.sub(r"\s+", " ", text).strip()

  if remove_stopwords:
    # Use a distinct local name: assigning to `stopwords` here would shadow the
    # imported corpus and raise UnboundLocalError on this very line.
    stop_words=set(stopwords.words('english'))
    # Punctuation is already gone, so whitespace splitting is sufficient and
    # avoids the never-imported word_tokenize.
    text=" ".join(word for word in text.split() if word not in stop_words)

  return text



In [None]:
train['clean']=train['text'].apply(clean_text)

In [None]:
valid['clean']=valid['text'].apply(clean_text)

In [None]:
test['clean']=test['text'].apply(clean_text)

# Tokenizer
# Vectorize text

In [None]:
vocab_size=5000 # passed as num_words to the Tokenizer and as input size to the Embedding layers
batch_size=32 # examples per batch in the tf.data pipelines and in dense_model.fit
max_length=300 # maxlen used by pad_sequences, i.e. the model input length
max_sequence=30 # max number of words in each text -- NOTE(review): not referenced anywhere else in the visible notebook; max_length is what padding uses
embeding_dim=50 # dimension of the embedding layers

In [None]:
## build the tokenizer, limited to vocab_size words, and fit it on the cleaned training text
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train['clean'])

## preview the first ten entries of the word -> index mapping
word_index = tokenizer.word_index
print(dict(list(word_index.items())[:10]))

#test

In [None]:
## let see how tokinizer work
text="thinking of you everything crossed turn"
seq=tokenizer.texts_to_sequences([text])
print(seq)

In [None]:
# Token-id sequences for the TRAIN split (despite the "v" suffix in the name).
seqv=tokenizer.texts_to_sequences(train['clean'])

In [None]:
## token-id sequences for the VALIDATION split
sequ=tokenizer.texts_to_sequences(valid['clean'])



#test

In [None]:
seqt=tokenizer.texts_to_sequences(test['clean'])

##Padding NLP

In [None]:
padded_seq=tf.keras.preprocessing.sequence.pad_sequences(seqv,maxlen=max_length,
                                                         padding='post')
print(padded_seq.shape)

In [None]:
padded_seqv=tf.keras.preprocessing.sequence.pad_sequences(sequ,maxlen=max_length,
                                                          padding='post')

In [None]:
print(padded_seqv.shape)

#test

In [None]:
padded_seqt=tf.keras.preprocessing.sequence.pad_sequences(seqt,maxlen=max_length,padding='post')

## Standard preprocessing steps

In [None]:
# Features are the padded token-id matrices: padded_seq was built from the
# train split and padded_seqv from the validation split.
# Labels are taken from the 'label' column here; the next cell reassigns
# y_train / y_valid as NumPy arrays.
x_train=padded_seq
y_train=train.label
x_valid=padded_seqv
y_valid=valid.label

In [None]:
# Materialise the label columns as NumPy arrays (via plain Python lists).
y_train = np.array(train['label'].tolist())
y_valid = np.array(valid['label'].tolist())

#test

In [None]:
def dataset_creator(x, y, shuffle=True):
    """Build a batched, prefetched tf.data pipeline from in-memory arrays.

    Args:
        x: array-like of features, sliced along its first axis.
        y: array-like of labels aligned with ``x``.
        shuffle: reshuffle the examples on every pass over the data. Leave it
            on for training; turn it off for evaluation data, where order does
            not matter and shuffling is wasted work.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs of up to
        ``batch_size`` examples (``batch_size`` is the notebook-level constant).
    """
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    if shuffle:
        # A buffer as large as the dataset gives a full uniform shuffle; the
        # previous fixed 1000-element buffer only mixed neighbouring rows.
        # max(..., 1) keeps shuffle() valid for empty input.
        dataset = dataset.shuffle(max(len(x), 1))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset


train_dataset = dataset_creator(x_train, y_train)
# test_dataset is built from the validation split (x_valid / y_valid) and is
# only used as validation_data, so it is left unshuffled.
test_dataset = dataset_creator(x_valid, y_valid, shuffle=False)

In [None]:
# preview dataset
for x, y in train_dataset.take(1):
    print(x.shape, y.shape)
    print(x[0])
    print(y[0])

# preview dataset size
print("Train dataset size: ", len(train_dataset))
print("Test dataset size: ", len(test_dataset))

#Modeling

In [None]:
# Baseline: a fully connected network applied directly to the padded sequences.
# Input is a vector of max_length values; output is a single sigmoid probability.
# NOTE(review): this model is later fitted on x_train, i.e. raw integer token
# ids, which the Dense layers treat as numeric magnitudes -- confirm this is
# intended as a naive baseline.
dense_model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(max_length,)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

dense_model.summary()

In [None]:
dense_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# train the model
dense_model.fit(
    x_train,y_train, epochs=10, batch_size=batch_size, validation_data=(x_valid,y_valid)
)

## Embedding model

In [None]:
embed_model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1,embeding_dim,input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512,activation='relu'),
    tf.keras.layers.Dense(256,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [None]:
embed_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# train the model
# train_dataset / test_dataset are already batched by dataset_creator, so
# batch_size is not passed to fit(): Keras documents that it must not be
# specified when the input is a tf.data.Dataset.
embed_model.fit(
    train_dataset, epochs=10, validation_data=test_dataset
)

#RNN

In [None]:
simple_rnn=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1,embeding_dim,input_length=max_length),
    tf.keras.layers.SimpleRNN(64,activation='relu'),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [None]:
simple_rnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# train the model
# train_dataset / test_dataset are already batched by dataset_creator, so
# batch_size is not passed to fit(): Keras documents that it must not be
# specified when the input is a tf.data.Dataset.
simple_rnn.fit(
    train_dataset, epochs=10, validation_data=test_dataset
)

# LSTM

In [None]:
# Embedding -> four stacked LSTMs that return full sequences -> a bidirectional
# LSTM that reduces the sequence to one vector -> dense head with sigmoid output.
lstm_model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_length),
        *[tf.keras.layers.LSTM(128, return_sequences=True) for _ in range(4)],
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, activation="tanh")),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Halve the learning rate after 1 epoch without val_loss improvement and stop
# after 3. With both patiences equal (previously 5 and 5) the LR cut happened on
# the same epoch training stopped, so it never had any effect.
# restore_best_weights=True rolls the model back to its best epoch when early
# stopping fires.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, min_delta=0.0001, patience=1, verbose=0)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

In [None]:




lstm_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

lstm_model.fit(train_dataset,epochs=10,validation_data=test_dataset,callbacks=[lr_reduce, es_callback])

# GRU

In [None]:
gru_model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_length),
        tf.keras.layers.GRU(64, activation="tanh"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

gru_model.summary()

In [None]:
# Halve the learning rate after 1 epoch without val_loss improvement and stop
# after 3. The previous patience of 5 could never be reached in the 5-epoch run
# that uses these callbacks, so neither callback ever fired.
# restore_best_weights=True rolls the model back to its best epoch when early
# stopping fires.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, min_delta=0.0001, patience=1, verbose=0)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

In [None]:




gru_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

gru_model.fit(train_dataset,epochs=5,validation_data=test_dataset,callbacks=[lr_reduce, es_callback])

# Bidirectional GRU

In [None]:
bi_gru_model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, activation="tanh")),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

bi_gru_model.summary()

In [None]:
# Halve the learning rate after 1 epoch without val_loss improvement and stop
# after 3. The previous patience of 5 could never be reached in the 5-epoch run
# that uses these callbacks, so neither callback ever fired.
# restore_best_weights=True rolls the model back to its best epoch when early
# stopping fires.
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, min_delta=0.0001, patience=1, verbose=0)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

In [None]:




bi_gru_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

bi_gru_model.fit(train_dataset,epochs=5,validation_data=test_dataset,callbacks=[lr_reduce, es_callback])

In [None]:
x_test=padded_seqt

In [None]:
pre = np.array(gru_model.predict(x_test))

In [None]:
pred=np.round(pre,0)

In [None]:
predi=pred.astype(int)

In [None]:
predi

In [None]:
# Assemble the submission table: the test 'id' column next to the first (only
# used) column of the rounded integer predictions.
# NOTE(review): the column is written as 'Label' while the training frame uses
# 'label' -- confirm the header the submission format expects.
df = pd.DataFrame({
    'id':test['id'],
    'Label':predi[:,0],
})
df


In [None]:
df.to_csv("submissiongru.csv",index=False ,header = 1)