In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('FA-KES-Dataset.csv', encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0


In [4]:
df.shape

(804, 7)

In [5]:
df = df.drop(columns=['unit_id', 'source', 'date','location'])

In [6]:
df.shape

(804, 3)

In [7]:
df.sample(3)

Unnamed: 0,article_title,article_content,labels
226,Air raids kill 29 civilians in north Syria tow...,Air raids kill 29 civilians in north Syria tow...,0
140,Death toll in bomb attack on Syria evacuees hi...,Death toll in bomb attack on Syria evacuees hi...,0
511,French FM: all indications Syria behind chemic...,Saturday 24 August 2013 French Foreign Ministe...,0


In [8]:
df['article_title'] = df['article_title'].str.lower()
df['article_content'] = df['article_content'].str.lower()

In [9]:
import re
def remove_tags(text):
    if isinstance(text, str):
        pattern = re.compile('<.*?>')
        return pattern.sub(r'', text)
    else:
        return text

In [10]:
df['article_title'] = df['article_title'].apply(remove_tags)
df['article_content'] = df['article_content'].apply(remove_tags)

In [11]:
def remove_url(text):
    if isinstance(text, str):
        pattern = re.compile(r'https?://\S+|www\.\S+')
        return pattern.sub(r'', text)
    else:
        return text 

In [12]:
df['article_title'] = df['article_title'].apply(remove_url)
df['article_content'] = df['article_content'].apply(remove_url)

In [13]:
import string
string.punctuation
exclude = string.punctuation

In [14]:
def remove_punc1(text):
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', exclude))
    else:
        return text

In [15]:
df['article_title'] = df['article_title'].apply(remove_punc1)
df['article_content'] = df['article_content'].apply(remove_punc1)

In [16]:
df.head()

Unnamed: 0,article_title,article_content,labels
0,syria attack symptoms consistent with nerve ag...,wed 05 apr 2017 syria attack symptoms consiste...,0
1,homs governor says us attack caused deaths but...,fri 07 apr 2017 at 0914 homs governor says us ...,0
2,death toll from aleppo bomb attack at least 112,sun 16 apr 2017 death toll from aleppo bomb at...,0
3,aleppo bomb blast kills six syrian state tv,wed 19 apr 2017 aleppo bomb blast kills six sy...,0
4,29 syria rebels dead in fighting for key alepp...,sun 10 jul 2016 29 syria rebels dead in fighti...,0


In [17]:
from nltk.corpus import stopwords

In [18]:
def remove_stopwords1(text):
    if isinstance(text, str):
        stop_words = set(stopwords.words('english'))
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)
    else:
        return text

In [19]:
df['article_title'] = df['article_title'].apply(remove_stopwords1)
df['article_content'] = df['article_content'].apply(remove_stopwords1)

In [20]:
df.head(5)

Unnamed: 0,article_title,article_content,labels
0,syria attack symptoms consistent nerve agent use,wed 05 apr 2017 syria attack symptoms consiste...,0
1,homs governor says us attack caused deaths doe...,fri 07 apr 2017 0914 homs governor says us att...,0
2,death toll aleppo bomb attack least 112,sun 16 apr 2017 death toll aleppo bomb attack ...,0
3,aleppo bomb blast kills six syrian state tv,wed 19 apr 2017 aleppo bomb blast kills six sy...,0
4,29 syria rebels dead fighting key aleppo road,sun 10 jul 2016 29 syria rebels dead fighting ...,0


In [21]:
#tokenization
from nltk.tokenize import word_tokenize, sent_tokenize

In [22]:
# Tokenize the 'title' column
df['article_title'] = df['article_title'].apply(lambda x: word_tokenize(x) if isinstance(x, str) else x)
df['article_content'] = df['article_content'].apply(lambda x: word_tokenize(x) if isinstance(x, str) else x)

In [23]:
df.head(5)

Unnamed: 0,article_title,article_content,labels
0,"[syria, attack, symptoms, consistent, nerve, a...","[wed, 05, apr, 2017, syria, attack, symptoms, ...",0
1,"[homs, governor, says, us, attack, caused, dea...","[fri, 07, apr, 2017, 0914, homs, governor, say...",0
2,"[death, toll, aleppo, bomb, attack, least, 112]","[sun, 16, apr, 2017, death, toll, aleppo, bomb...",0
3,"[aleppo, bomb, blast, kills, six, syrian, stat...","[wed, 19, apr, 2017, aleppo, bomb, blast, kill...",0
4,"[29, syria, rebels, dead, fighting, key, alepp...","[sun, 10, jul, 2016, 29, syria, rebels, dead, ...",0


In [24]:
import pandas as pd
import spacy

# Ensure you have loaded the spacy model
nlp = spacy.load("en_core_web_sm")

def lemmatize_with_spacy(text):
    # Check if the input is a string
    if isinstance(text, str):
        doc = nlp(text)
        return ' '.join([token.lemma_ for token in doc])
    else:
        # Return the input as it is if it's not a string
        return text

# Assuming df is your DataFrame
# Lemmatize the 'article_title' column
df['article_title'] = df['article_title'].apply(lemmatize_with_spacy)

# Lemmatize the 'article_content' column
df['article_content'] = df['article_content'].apply(lemmatize_with_spacy)


In [25]:
df.head(5)

Unnamed: 0,article_title,article_content,labels
0,"[syria, attack, symptoms, consistent, nerve, a...","[wed, 05, apr, 2017, syria, attack, symptoms, ...",0
1,"[homs, governor, says, us, attack, caused, dea...","[fri, 07, apr, 2017, 0914, homs, governor, say...",0
2,"[death, toll, aleppo, bomb, attack, least, 112]","[sun, 16, apr, 2017, death, toll, aleppo, bomb...",0
3,"[aleppo, bomb, blast, kills, six, syrian, stat...","[wed, 19, apr, 2017, aleppo, bomb, blast, kill...",0
4,"[29, syria, rebels, dead, fighting, key, alepp...","[sun, 10, jul, 2016, 29, syria, rebels, dead, ...",0


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
cv = CountVectorizer(ngram_range=(2,2))

In [28]:
df['article_content'] = df['article_content'].apply(lambda x: ' '.join(x))

# Now use CountVectorizer
bow1 = cv.fit_transform(df['article_content'])

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(bow1)

In [30]:
similarity[0]

array([1.        , 0.03766515, 0.02491914, 0.00871181, 0.00847945,
       0.01892377, 0.0049539 , 0.01199175, 0.        , 0.        ,
       0.00763804, 0.00368996, 0.        , 0.00755778, 0.00441769,
       0.02222531, 0.00919251, 0.01106988, 0.01694108, 0.00396264,
       0.01570089, 0.00393262, 0.03524099, 0.00557737, 0.01002521,
       0.07465974, 0.04248671, 0.04845487, 0.        , 0.01591165,
       0.00361854, 0.01898145, 0.01659601, 0.00370228, 0.01960156,
       0.01504293, 0.00904353, 0.01327681, 0.02948473, 0.01163972,
       0.02088051, 0.0101222 , 0.009439  , 0.01375337, 0.0120447 ,
       0.0191577 , 0.00266282, 0.00708046, 0.01162372, 0.00873204,
       0.01005953, 0.01242303, 0.01186522, 0.0575314 , 0.02485773,
       0.08142796, 0.00924027, 0.04632549, 0.05930458, 0.00450432,
       0.01975652, 0.01250174, 0.01260905, 0.01347964, 0.01017891,
       0.01315941, 0.00957885, 0.00493913, 0.01588102, 0.00717995,
       0.00861269, 0.03525769, 0.08435436, 0.02560738, 0.00303

In [31]:
print(bow1[0].toarray())
print(bow1[1].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [33]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(bow1, df['labels'], test_size=0.2, random_state=42)


In [34]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

In [35]:
# Predict the labels for the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.577639751552795

Confusion Matrix:
 [[44 28]
 [40 49]]

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.61      0.56        72
           1       0.64      0.55      0.59        89

    accuracy                           0.58       161
   macro avg       0.58      0.58      0.58       161
weighted avg       0.59      0.58      0.58       161



In [36]:
# Example new reviews
new_reviews = ["In an unprecedented turn of events, sources have revealed that world leaders are planning a secret summit on Mars in 2024. This extraordinary meeting is said to be coordinated by a coalition of space agencies, including NASA and SpaceX, and is purported to discuss the future of interplanetary governance and resource allocation. According to insider information, the leaders will travel to Mars in a newly developed spacecraft capable of making the journey in under two months. This news, though unconfirmed by any official sources, has sparked widespread speculation and conspiracy theories about the intentions behind the closed-door summit and what it might mean for the future of Earth's international relations. Experts are calling for transparency, but so far, no government officials have agreed to comment on these claims"]

# Convert new reviews to TF-IDF representation (using the same tfidf vectorizer)
new_reviews_tfidf = cv.transform(new_reviews)

# Predict the classes for the new reviews
predicted_classes = nb_classifier.predict(new_reviews_tfidf)

print(predicted_classes)

[0]


In [37]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [38]:
# Predict the labels for the test set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5962732919254659

Confusion Matrix:
 [[19 53]
 [12 77]]

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.26      0.37        72
           1       0.59      0.87      0.70        89

    accuracy                           0.60       161
   macro avg       0.60      0.56      0.54       161
weighted avg       0.60      0.60      0.55       161



In [39]:
# Example new reviews
new_reviews = ["In an unprecedented turn of events, sources have revealed that world leaders are planning a secret summit on Mars in 2024. This extraordinary meeting is said to be coordinated by a coalition of space agencies, including NASA and SpaceX, and is purported to discuss the future of interplanetary governance and resource allocation. According to insider information, the leaders will travel to Mars in a newly developed spacecraft capable of making the journey in under two months. This news, though unconfirmed by any official sources, has sparked widespread speculation and conspiracy theories about the intentions behind the closed-door summit and what it might mean for the future of Earth's international relations. Experts are calling for transparency, but so far, no government officials have agreed to comment on these claims"]

# Convert new reviews to TF-IDF representation (using the same tfidf vectorizer)
new_reviews_tfidf = cv.transform(new_reviews)

# Predict the classes for the new reviews
predicted_classes = nb_classifier.predict(new_reviews_tfidf)

print(predicted_classes)

[0]


In [46]:
from sklearn import svm

In [49]:
lr = svm.SVC(kernel='poly')
lr.fit(X_train, y_train)

In [50]:
# Predict the labels for the test set
y_pred = lr.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5217391304347826

Confusion Matrix:
 [[ 5 67]
 [10 79]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.07      0.11        72
           1       0.54      0.89      0.67        89

    accuracy                           0.52       161
   macro avg       0.44      0.48      0.39       161
weighted avg       0.45      0.52      0.42       161



In [51]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [54]:
max_words = 1000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['article_content'])
sequences = tokenizer.texts_to_sequences(df['article_content'])

In [55]:
max_sequence_length = 10
df['article_content'] = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

In [56]:
X_train, X_test, y_train, y_test = train_test_split(df['article_content'], df['labels'], test_size=0.2, random_state=42)

In [58]:
# Create an LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(1, activation='sigmoid'))

In [59]:
# Compile the LSTM model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
# Train the LSTM model
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1770a0da380>

In [63]:
# Make predictions using the LSTM model
lstm_predictions = (lstm_model.predict(X_test) > 0.5).astype(int)



In [64]:
# Calculate the accuracy of the LSTM model
lstm_accuracy = accuracy_score(y_test, lstm_predictions)
print(f"LSTM Accuracy: {lstm_accuracy * 100:.2f}%")

LSTM Accuracy: 50.93%


In [65]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
gru_model.add(GRU(64))
gru_model.add(Dense(1, activation='sigmoid'))

In [66]:
# Compile the GRU model
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the GRU model
gru_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1770b1d5750>

In [67]:
# Make predictions using the GRU model
gru_predictions = (gru_model.predict(X_test) > 0.5).astype(int)

# Calculate the accuracy of the GRU model
gru_accuracy = accuracy_score(y_test, gru_predictions)
print(f"GRU Accuracy: {gru_accuracy * 100:.2f}%")

GRU Accuracy: 46.58%
