## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np


from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
STOPWORDS =(stopwords.words('english'))


import textstat
textstat.set_lang('en')
from collections import Counter
from spellchecker import SpellChecker
import en_core_web_sm
nlp = en_core_web_sm.load()


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, precision_score, f1_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC

## **Read Dataset**

In [None]:
# Read the UTF-16 encoded CSV and discard every row that has a missing value.
dir_name = r"data/Gossipcop.csv"
df = pd.read_csv(dir_name, encoding='utf-16').dropna()
print(len(df))
df.columns

19279


Index(['Id', 'newsText', 'Label'], dtype='object')

## **Preprocess**

In [None]:
# Raw article text column; the cleaning pipeline below derives new_txt from it.
old_txt = df['newsText']

In [None]:
# Strip HTML markup, keeping only the text content.
def html_remover(data):
  """Parse ``data`` with BeautifulSoup's 'html.parser' and return its text."""
  return BeautifulSoup(data,'html.parser').get_text()

def remove_stopwords(text):
    """Return ``text`` without the words listed in STOPWORDS.

    The input is converted with ``str()`` and split on whitespace; the
    surviving words are re-joined with single spaces.
    """
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)

def lemmatizer(text):
    """Return ``text`` with every token replaced by its lowercased lemma.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the literal '-PRON-' contributes its own lowercase form instead.
    Tokens are joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.lower_)
        else:
            pieces.append(token.lemma_.lower())
    return " ".join(pieces)


# Text cleaning pipeline: old_txt (raw article text) -> new_txt (normalised text).

# 1. Strip HTML markup.
new_txt = old_txt.apply(html_remover)

# 2. Remove URLs, ".com" domains and e-mail addresses.
#    'http\S*' also covers https links, so one pattern is enough.
#    The dots are escaped so that only a literal "." matches: with an
#    unescaped dot the pattern deleted any token containing "<char>com",
#    e.g. "welcome", "become" or " company".
new_txt = new_txt.apply(lambda x: re.sub(r'http\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'www\.\S*com\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\S*\.com\S*',' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\S*@\S*',' ',x))

# 3. Remove digits.
new_txt = new_txt.apply(lambda x: re.sub(r'\d+',' ',x))

# 4. Lowercase.
new_txt = new_txt.str.lower()

# 5. Lemmatize, drop stop words, keep letters only, drop 1-2 character words.
new_txt = new_txt.apply(lemmatizer)
new_txt = new_txt.apply(remove_stopwords)
new_txt = new_txt.apply(lambda x: re.sub("[^a-z A-Z]",' ',x))
new_txt = new_txt.apply(lambda x: re.sub(r'\W*\b\w{1,2}\b',' ',x))

# 6. Collapse whitespace runs and trim both ends.
#    The earlier `for text in new_txt: text = text.rstrip()` loop only rebound
#    a local name and never changed the Series; trimming is done here, after
#    the last step that can introduce padding spaces.
new_txt = new_txt.replace(r'\s+', ' ', regex=True).str.strip()

  beauti = BeautifulSoup(data,'html.parser')


## **Feature Extraction**

### **1.sentiment_embedding feature**

In [None]:
def vader_score_generation(df,e):
    """Score every text in ``e`` with NLTK's VADER sentiment analyzer.

    Parameters
    ----------
    df : unused
        Kept only so that existing calls keep working.
    e : pandas.Series
        Texts to score.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``e`` (same index) and one column per key of the
        dict returned by ``polarity_scores``.
    """
    # Imported here because the notebook's import cell does not bring
    # SentimentIntensityAnalyzer into scope; without it this function raised
    # NameError on a fresh kernel.
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    polarity_Score = e.apply(analyzer.polarity_scores)
    # Build the frame in one go from the list of score dicts; this avoids
    # constructing a pandas.Series per row as `.apply(pd.Series)` did.
    new_df = pd.DataFrame(polarity_Score.tolist(), index=e.index)
    return new_df

# Sentiment scores for every preprocessed article in new_txt.
emo_pos_dist_pscore = vader_score_generation(df,new_txt)

In [None]:
# Append the sentiment score columns to df (DataFrame.join aligns rows on the index).
df = df.join(emo_pos_dist_pscore)

### **2.Readability Analysis feature**

In [None]:
def readability_ease(text):
  """Map the Flesch reading-ease score of ``text`` onto a coarse band.

  Bands (lower bound inclusive):
    90-100 -> 9, 80-90 -> 8, 70-80 -> 7, 60-70 -> 6, 50-60 -> 5, 30-50 -> 3;
  every other score (below 30, above 100, or NaN) -> 0.

  The previous chained comparisons were wrong: ``90 >= score >= 100`` can
  never hold, and ``80 >= score <= 89`` is simply ``score <= 80``. Every score
  up to 80 therefore landed in band 8 and everything else in band 0.
  Half-open ranges are used here so fractional scores such as 89.5 do not
  fall between two bands.
  """
  score = textstat.flesch_reading_ease(text)

  if score > 100 or score < 30:
    return 0

  # Checked from the highest band down; the first lower bound reached wins.
  for lower_bound, ease in ((90, 9), (80, 8), (70, 7), (60, 6), (50, 5), (30, 3)):
    if score >= lower_bound:
      return ease

  # Only reached when no comparison holds (e.g. a NaN score).
  return 0


def read_time(text):
  """Return textstat's reading-time estimate for ``text`` at 70 ms per character."""
  return textstat.reading_time(text, ms_per_char=70)

"\n2.\n#score of 9.3 means that a ninth-grader would be able to read the document.\nprint('flesch_kincaid_grade')\nprint(textstat.flesch_kincaid_grade(text))\n\n3.\n#6.5, then the grade level to comprehend the text is 6th to 7th grade.\nprint('automated_readability_index')\nprint(textstat.automated_readability_index(text))\n"

In [None]:
# Readability features, one value per preprocessed article.
df['readability_ease'] = new_txt.apply(readability_ease)
df['read_time'] = new_txt.apply(read_time)

### **3.Spell checker feature**

In [None]:
def spell(text):
    """Return how many whitespace-separated words of ``text`` the spell checker flags.

    The count is the size of the collection returned by
    ``SpellChecker.unknown`` for the words of ``text``.

    A single SpellChecker instance is created on first use and cached on the
    function object. Previously a new checker was constructed for every call,
    i.e. once per row when used with ``Series.apply``.
    """
    checker = getattr(spell, '_checker', None)
    if checker is None:
        checker = spell._checker = SpellChecker()

    return len(checker.unknown(text.split()))

In [None]:
# Count of words flagged by spell() for each preprocessed article.
df['incorrect_spell'] = new_txt.apply(spell)

# Character length of each preprocessed article.
len_data = new_txt.apply(len)

# Misspellings per character. The earlier expression divided the length by the
# misspelling count, which is the inverse of a frequency and yields inf (or
# NaN for empty text) whenever an article has no flagged word; such values
# cannot be scaled later on. Zero-length texts get a frequency of 0.
df['incorrect_spell_freq'] = (df['incorrect_spell'] / len_data.where(len_data > 0)).fillna(0)

### **4.Named Entity Recognition (NER) feature**

The spaCy library provides the following entity labels:
 ['CARDINAL','DATE','EVENT','FAC','GPE','LANGUAGE','LAW','LOC','MONEY','NORP','ORDINAL','ORG','PERCENT','PERSON','PRODUCT','QUANTITY','TIME', 'WORK_OF_ART']

In [None]:
def ner_text_count(t):
  """Return the number of distinct entity labels that ``nlp`` finds in ``t``."""
  distinct_labels = {entity.label_ for entity in nlp(t).ents}
  return len(distinct_labels)


def ner_text_list(text,label):
  """Count, for each entry of ``label``, how many entities in ``text`` carry it.

  Returns a list with one count per entry of ``label``, in the same order.
  """
  found = [entity.label_ for entity in nlp(text).ents]
  return [found.count(wanted) for wanted in label]


# Every label the pipeline's 'ner' component exposes, as a plain list.
all_ents = list(nlp.get_pipe('ner').labels)

# Number of distinct entity labels found in each preprocessed article.
df['ner_count'] = new_txt.apply(ner_text_count)

### **5.POS feature**

In [None]:
def generate_pos_tag_dist(news_text):
    """Build a per-document table of part-of-speech tag counts.

    Each entry of ``news_text`` has its punctuation removed, is tokenised and
    tagged with NLTK, and its tags are counted. Tag columns that are missing
    in 70% or more of the documents are dropped; remaining gaps become 0.

    Returns a DataFrame with one row per document. ``reset_index()`` adds the
    positional row number as a leading 'index' column.
    """
    def _tag_histogram(document):
        # Same steps as before: strip punctuation, tokenise, tag, count tags.
        stripped = re.sub(r'[^\w\s]','',document)
        tagged = nltk.pos_tag(nltk.word_tokenize(stripped))
        return Counter(tag for _, tag in tagged)

    tag_table = pd.DataFrame.from_records([_tag_histogram(document) for document in news_text])
    missing_share = tag_table.isnull().mean()
    kept_columns = tag_table.columns[missing_share < 0.7]

    return tag_table.loc[:, kept_columns].reset_index().fillna(0)


pos_df = generate_pos_tag_dist(new_txt)

In [None]:
def zerolistmaker(n):
    """Return a new list holding ``n`` zeros."""
    return [0] * n


def group_pos_features(pos_tagged_df,pos_tagged_df2):
    """Copy selected POS counts into ``pos_tagged_df2`` and add grouped totals.

    Tags in the pass-through list are copied as their own column when present
    in ``pos_tagged_df``. The remaining tags are summed row-wise into the
    columns group_c, group_j, group_n, group_p, group_r, group_v and group_w;
    a tag that is absent from ``pos_tagged_df`` contributes zeros.

    Values are assigned positionally (as arrays), not aligned on the index.
    ``pos_tagged_df2`` is modified in place and also returned.
    """
    passthrough_tags = ('DT','EX','FW','IN','MD','TO','UH')
    tag_groups = {
        'group_c': ('CC','CD'),
        'group_j': ('JJ','JJR','JJS'),
        'group_n': ('NN','NNS','NNP','NNPS'),
        'group_p': ('PRP','PRP$'),
        'group_r': ('RBR','RBS','RB'),
        'group_v': ('VB','VBD','VBG','VBN','VBP','VBZ'),
        'group_w': ('WDT','WP','WP$','WRB'),
    }

    available = pos_tagged_df.columns
    row_count = len(pos_tagged_df)

    def _counts_for(tag):
        # Counts for one tag, or a run of zeros when the tag has no column.
        if tag in available:
            return np.array(list(pos_tagged_df[tag]))
        return [0] * row_count

    for tag in passthrough_tags:
        if tag in available:
            pos_tagged_df2[tag] = np.array(list(pos_tagged_df[tag]))

    for group_name, members in tag_groups.items():
        pos_tagged_df2[group_name] = np.sum([_counts_for(tag) for tag in members], axis=0)

    return pos_tagged_df2

In [None]:
# group_pos_features writes the POS columns into df itself and returns that same
# object, so `dataframe` and `df` refer to one DataFrame after this call.
dataframe = group_pos_features(pos_df,df)

### **6.Counting-statistic features**

In [None]:
# Number of words with three or more syllables.
def polysyllab_count(text):
  """Return textstat's polysyllable count for ``text``."""
  n_polysyllabic = textstat.polysyllabcount(text)
  return n_polysyllabic

# Number of words with exactly one syllable.
def monosyllab_count(text):
  """Return textstat's monosyllable count for ``text``."""
  n_monosyllabic = textstat.monosyllabcount(text)
  return n_monosyllabic

# Sentence count.
def count_sent(text):
  """Return textstat's sentence count for ``text``."""
  n_sentences = textstat.sentence_count(text)
  return n_sentences

# Count of fully upper-case words.
def count_capital_words(text):
    """Return how many whitespace-separated words of ``text`` satisfy ``str.isupper``."""
    return sum(1 for word in text.split() if word.isupper())


In [None]:
# Syllable-based counts are taken from the cleaned text.
df['polysyllab_count'] = new_txt.apply(polysyllab_count)
df['monosyllab_count'] = new_txt.apply(monosyllab_count)

# Sentence and capital-word counts must see the raw text. new_txt has been
# lowercased and had every non-letter character replaced by a space, so on it
# no word is ever fully upper-case and no sentence terminator survives; both
# features were effectively constant. old_txt shares df's index, so the
# results line up row by row.
df['sent_count'] = old_txt.apply(count_sent)
df['capital_word_count'] = old_txt.apply(count_capital_words)

## **Learn model**

In [None]:
# Keep only the label and the engineered features.
# errors='ignore': no cell visible in this notebook creates a 'prep_text'
# column, so a strict drop raises KeyError on a fresh Restart & Run All (and on
# any re-run of this cell). Columns that are absent are now simply skipped.
df.drop(['Id', 'newsText', 'prep_text'], axis = 1, inplace = True, errors = 'ignore')

In [None]:
# Column 0 of df is used as the label; every remaining column is a feature.
array = df.values
columns = df.columns
X = array[:,1:]
Y = array[:,0]
y = Y.astype('int')

# Split first, so that the test rows stay unseen by every fitted step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape)

# Fit the scaler on the training rows only. Fitting it on the full matrix, as
# before, leaks the test set's mean and variance into training.
data_scaler = StandardScaler().fit(X_train)
data_rescaled = data_scaler.transform(X)

# Scaled views of the very same split. The labels are the integer `y` values;
# the earlier second split passed the un-cast `Y` instead.
X_train_scl = data_scaler.transform(X_train)
X_test_scl = data_scaler.transform(X_test)
y_train_scl, y_test_scl = y_train, y_test

print(X_train_scl.shape, X_test_scl.shape)

(13495, 20) (5784, 20)
(13495, 20) (5784, 20)


In [None]:
def evaluate(y_test, y_pred):
  """Print classification metrics for ``y_pred`` against ``y_test``.

  Prints accuracy (as a percentage), binary-averaged precision, recall and F1,
  then the confusion matrix and the classification report. Returns None.
  """
  score = accuracy_score(y_test, y_pred)
  # All three binary-averaged metrics are computed before anything is printed.
  precision, recall, score_f1 = [
      metric(y_test, y_pred, average='binary')
      for metric in (precision_score, recall_score, f1_score)
  ]

  print(f'Accuracy: {round(score*100,2)}%')
  for template, value in (('precision: %.3f', precision),
                          ('Recall: %.3f', recall),
                          ('F-Measure: %.3f', score_f1)):
    print(template % value)

  cm = confusion_matrix(y_test, y_pred)
  report = classification_report(y_test, y_pred)
  print('confusion_matrix:','\n',cm,'\n')
  print('classification_report:','\n',report,'\n')

In [None]:
#1-------------------------------------------------------------
def RandomForest(tfidf_train,y_train,tfidf_test,y_test):
  """Fit a 10-tree entropy random forest on the training data and print its test metrics."""
  print("RandomForestClassifier")
  forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
  forest.fit(tfidf_train, y_train)
  evaluate(y_test, forest.predict(tfidf_test))

#2-------------------------------------------------------------
def PassiveAggressive(tfidf_train,y_train,tfidf_test,y_test):
  """Fit a Passive Aggressive classifier (at most 50 passes) and print its test metrics.

  ``random_state=0`` is set so repeated runs give the same model, matching the
  fixed seed used for the random forest and the train/test split. Without it
  the reported scores changed from run to run.
  """
  print("Passive Aggressive Classifier")
  pac=PassiveAggressiveClassifier(max_iter=50, random_state=0)
  pac.fit(tfidf_train,y_train)
  y_pred=pac.predict(tfidf_test)
  evaluate(y_test,y_pred)

#3-------------------------------------------------------------
def SVMclassifier(tfidf_train,y_train,tfidf_test,y_test):
  """Fit a linear-kernel SVC with C=1.9 on the training data and print its test metrics."""
  print("svm Classifier")
  model = SVC(C=1.9, kernel='linear')
  model.fit(tfidf_train, y_train)
  evaluate(y_test, model.predict(tfidf_test))

In [None]:
# Train and evaluate each classifier on the scaled split, printing a heading first.
for heading, run_classifier in (
    ('1.RandomForest', RandomForest),
    ('2.PassiveAggressive', PassiveAggressive),
    ('3.SVM', SVMclassifier),
):
  print(heading)
  run_classifier(X_train_scl,y_train_scl,X_test_scl,y_test_scl)