In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
import pandas as pd
data = pd.read_csv('./Dataset/preprocessingData.csv', encoding='utf-8')
data.head(5)

Unnamed: 0,text,category,stance
0,"['بيل', 'غيتس', 'يتلقى', 'لقاح', 'تصوير', 'الا...",celebrity,1
1,"['وزير', 'الصحة', 'لحد', 'اليوم', 'وتحديدا', '...",info_news,1
2,"['قولكن', 'رح', 'يكونو', 'اد', 'المسؤولية', 'ل...",info_news,1
3,"['وزير', 'الصحة', 'فخر', 'الدين', 'قوجة', 'يتل...",celebrity,1
4,"['وئام', 'وهاب', 'يشتم', 'الدول', 'الخليجية', ...",personal,0


In [3]:
data.groupby('category').describe()

Unnamed: 0_level_0,stance,stance,stance,stance,stance,stance,stance,stance
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
advice,67.0,0.910448,0.287694,0.0,1.0,1.0,1.0,1.0
celebrity,975.0,0.948718,0.289154,-1.0,1.0,1.0,1.0,1.0
info_news,3616.0,0.772124,0.565546,-1.0,1.0,1.0,1.0,1.0
others,167.0,0.299401,0.532271,-1.0,0.0,0.0,1.0,1.0
personal,1025.0,0.566829,0.690103,-1.0,0.0,1.0,1.0,1.0
plan,606.0,0.925743,0.314021,-1.0,1.0,1.0,1.0,1.0
requests,112.0,0.767857,0.536625,-1.0,1.0,1.0,1.0,1.0
restrictions,18.0,0.555556,0.615699,-1.0,0.0,1.0,1.0,1.0
rumors,79.0,0.126582,0.757308,-1.0,0.0,0.0,1.0,1.0
unrelated,323.0,0.074303,0.296023,-1.0,0.0,0.0,0.0,1.0


In [4]:
data.groupby('stance').describe()

Unnamed: 0_level_0,text,text,text,text,category,category,category,category
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
stance,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
-1,438,412,"['اللقاح', 'الأمريكي', 'أمن', 'ومفيد', 'أصبحت'...",4,438,9,info_news,260
0,1012,982,"['محتاجين', 'لقاح', 'يقاوم', 'الإفراط', 'التفك...",6,1012,10,info_news,304
1,5538,5168,"['حفظه', 'الله', 'يتلقى', 'الجرعة', 'الأولى', ...",37,5538,10,info_news,3052


In [6]:
pos_data = data[data['stance']==1]
neg_data = data[data['stance']==-1]
nueral_data = data[data['stance']==0]

print(pos_data.shape)
print(neg_data.shape)
print(nueral_data.shape)

(5538, 3)
(438, 3)
(1012, 3)


In [10]:
from sklearn.model_selection import train_test_split
# Unpack the data into text, category and stance
Train_X, Test_X, Train_Y, Test_Y = train_test_split(data['text'],data['stance'],stratify=data['stance'])


In [12]:
bert_encode_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_preprocess_url='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [14]:
bert_proprocess=hub.KerasLayer(bert_preprocess_url)
bert_encoder=hub.KerasLayer(bert_encode_url)

In [15]:
def get_sentence_embeddings(sentences):
    preprocessed_text = bert_proprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
# get the sentence embedding for the sentences
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
# preprocessing the data using bert preprocess layer
preprocessed_text=bert_proprocess(text_input)

# get the bert encoder layer
outputs = bert_encoder(preprocessed_text)

# Nueral network layer
l=tf.keras.layers.Dropout(0.1,name='dropout')(outputs['pooled_output'])
l=tf.keras.layers.Dense(1, activation='sigmoid', name='output')(l)

# construct final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [22]:
METRICS=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),tf.keras.metrics.Precision(name='precision'),tf.keras.metrics.Recall(name='recall')]

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=METRICS)
history = model.fit(Train_X,Train_Y,epochs=15)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
model.evaluate(Test_X,Test_Y)



[0.5831922292709351, 0.7927876114845276, 0.8551803231239319, 1.0]

In [None]:
# y_predict = model.predict(Test_X)
# y_predicted = y_predict.flatten()