### **Load library**

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
STOPWORDS =(stopwords.words('english'))

import en_core_web_sm
nlp = en_core_web_sm.load()

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore",category=DeprecationWarning)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, precision_score, f1_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC


### **Read the data**

In [5]:
#Read the data
df = pd.read_csv('data/Gossipcop.csv',encoding='utf-16')
df.columns

Index(['Id', 'newsText', 'Label'], dtype='object')

In [6]:
txt = df['newsText']
for i in range(len(txt)):
    if(txt[i] is np.nan):
      df.drop(index=i, axis=0, inplace=True)

print(df.shape)

(19279, 3)


In [7]:
#DataFlair - Get the labels
labels = df.Label
labels.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1.0,15089
0.0,4190


### **Pre-process**

In [14]:
# to remove HTML tag
def html_remover(data):
  beauti = BeautifulSoup(data,'html.parser')
  return beauti.get_text()

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def lemmatizer(text):
    doc = nlp(text)
    lemmatized_sentence = " ".join([token.lemma_.lower() if token.lemma_ != '-PRON-' else token.lower_ for token in doc])
    return(lemmatized_sentence)


new_txt = []

new_txt = txt.apply(lambda x: html_remover(x))
new_txt = new_txt.apply(lambda x: re.sub(r'https\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'http\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'www.\S*com\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\S*.com\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\S*@\S*',' ',x))

#remove white space
for text in new_txt:
  text = text.rstrip()

#remove digits
new_txt = new_txt.apply(lambda x: re.sub(r'\d+',' ',x))

#lowercase
new_txt = new_txt.str.lower()

new_txt = new_txt.apply(lambda x:lemmatizer(x))
new_txt = new_txt.apply(lambda x: remove_stopwords(x))
new_txt = new_txt.apply(lambda x: re.sub("[^a-z A-Z]",' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\W*\b\w{1,2}\b',' ',x))
new_txt = new_txt.replace(r'\s+', ' ', regex=True)

### **Split the dataset and TF-IDF**

In [19]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=train_test_split(new_txt ,labels,
                                               test_size=0.3 , random_state=0)


#DataFlair - Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train)

tfidf_test=tfidf_vectorizer.transform(x_test)

In [20]:
x_train.shape, x_test.shape, y_test.value_counts()

((13495,),
 (5784,),
 Label
 1.0    4526
 0.0    1258
 Name: count, dtype: int64)

### **Learn classifiers**

In [21]:
#1-------------------------------------------------------------
def RandomForest(tfidf_train,y_train,tfidf_test,y_test):
  print("RandomForestClassifier")
  classifier3 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
  classifier3.fit(tfidf_train, y_train)
  y_predR = classifier3.predict(tfidf_test)
  evaluate(y_test, y_predR)

#2-------------------------------------------------------------
def PassiveAggressive(tfidf_train,y_train,tfidf_test,y_test):
  print("Passive Aggressive Classifier")
  pac=PassiveAggressiveClassifier(max_iter=50)
  pac.fit(tfidf_train,y_train)
  y_pred=pac.predict(tfidf_test)
  evaluate(y_test,y_pred)

#3-------------------------------------------------------------
def SVMclassifier(tfidf_train,y_train,tfidf_test,y_test):
  print("svm Classifier")
  SVM = SVC(C=1.9, kernel='linear')
  SVM.fit(tfidf_train, y_train)
  svm_predictions = SVM.predict(tfidf_test)
  evaluate(y_test, svm_predictions)

In [22]:
def evaluate(y_test, y_pred):

  score = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='binary')
  recall = recall_score(y_test, y_pred, average= 'binary')
  score_f1 = f1_score(y_test, y_pred, average='binary')

  print(f'Accuracy: {round(score*100,2)}%')
  print('precision: %.3f' % precision)
  print('Recall: %.3f' % recall)
  print('F-Measure: %.3f' % score_f1)

  cm = confusion_matrix(y_test, y_pred)
  report = classification_report(y_test, y_pred)
  print('confusion_matrix:','\n',cm,'\n')
  print('classification_report:','\n',report,'\n')

In [23]:
print('1.RandomForest')
RandomForest(tfidf_train,y_train,tfidf_test,y_test)
print('2.PassiveAggressive')
PassiveAggressive(tfidf_train,y_train,tfidf_test,y_test)
print('3.SVM')
SVMclassifier(tfidf_train,y_train,tfidf_test,y_test)

1.RandomForest
RandomForestClassifier
Accuracy: 82.56%
precision: 0.841
Recall: 0.958
F-Measure: 0.896
confusion_matrix: 
 [[ 441  817]
 [ 192 4334]] 

classification_report: 
               precision    recall  f1-score   support

         0.0       0.70      0.35      0.47      1258
         1.0       0.84      0.96      0.90      4526

    accuracy                           0.83      5784
   macro avg       0.77      0.65      0.68      5784
weighted avg       0.81      0.83      0.80      5784
 

2.PassiveAggressive
Passive Aggressive Classifier
Accuracy: 81.45%
precision: 0.887
Recall: 0.874
F-Measure: 0.881
confusion_matrix: 
 [[ 756  502]
 [ 571 3955]] 

classification_report: 
               precision    recall  f1-score   support

         0.0       0.57      0.60      0.58      1258
         1.0       0.89      0.87      0.88      4526

    accuracy                           0.81      5784
   macro avg       0.73      0.74      0.73      5784
weighted avg       0.82      0.81