In [1]:
import re
import string
from pathlib import Path
from collections import defaultdict, Counter

from sklearn.ensemble import AdaBoostClassifier

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from functools import partial

import pandas as pd
import numpy as np

### 1. Load both the datasets

In [2]:
# Load the Urdu Fake News dataset
def read_UFN_dataset(files):
    text=[]
    for i,file_path in enumerate(files):
        with open(file_path,'r',encoding='utf8') as infile:
            cleantext= infile.read() # can do any preprocessing here!
            text.append(cleantext)
    return text

# call the function to load the real and fake news
UFN_real_news = read_UFN_dataset(sorted(Path('UFN/0').glob('*.txt')))
UFN_fake_news = read_UFN_dataset(sorted(Path('UFN/1').glob('*.txt')))

In [3]:
# Load the Bend the Truth (BET) dataset
def preprocessText(text):
    #here remove text
    cleantext=re.sub("\d+", "0", text)
    return cleantext

def read_txt_files(files):
    text=[]
    topic=[]
    for i,file_path in enumerate(files):
        with open(file_path, 'r', encoding="utf8") as infile:
            file_text= preprocessText(infile.read())
            file_topic=''.join(re.findall('[A-Za-z]', file_path.stem))
            topic.append(file_topic)
            text.append(file_text)
    return text, topic

In [4]:
# convert both the datasets to dataframes for easy processing

# UFN Dataset
real = [0] * len(UFN_real_news)
fake = [1] * len(UFN_fake_news)

d = {'text': UFN_real_news, 'label':  real }
df1 = pd.DataFrame(data=d)

d = {'text': UFN_fake_news, 'label': fake}
df2 = pd.DataFrame(data=d)

UFN_df = pd.concat([df1, df2])

UFN_df.head()

Unnamed: 0,text,label
0,آپ ہلیری کے خوف سے بو آسکتے ہیں - آزادی مرکز م...,0
1,ٹرمپ کی ریلی میں عین لمحے पल ریان نے سیاسی خود...,0
2,Over € nInfernoâ € ™ اور زیادہ آبادی کا افسانہ...,0
3,رحمت کا جوبلی سال 20 نومبر کو ختم ہوگا۔ فیصلے ...,0
4,کیا امریکہ کو قومی سلامتی کی ریاست معاف کرنا چ...,0


In [5]:
def len_text_no_punc(s):
    string_w = s.translate(str.maketrans('','',string.punctuation)) 
    return len(string_w)

In [6]:
# BET Dataset
train_real_news, train_real_news_topics = read_txt_files(sorted(Path('Corpus/Train/Real/').glob('*.txt')))
train_fake_news, train_fake_news_topics = read_txt_files(sorted(Path('Corpus/Train/Fake/').glob('*.txt')))

real = [1] * len(train_real_news_topics)
fake = [0] * len(train_fake_news_topics)

d = {'text': train_real_news, 'topic': train_real_news_topics,'label':  real }
df1 = pd.DataFrame(data=d)

d = {'text': train_fake_news, 'topic': train_fake_news_topics,'label': fake}
df2 = pd.DataFrame(data=d)

Train_data = pd.concat([df1,df2])

#load test data
real_news, real_news_topics = read_txt_files(sorted(Path('Corpus/Test/Real/').glob('*.txt')))
fake_news, fake_news_topics = read_txt_files(sorted(Path('Corpus/Test/Fake/').glob('*.txt')))

real = [1] * len(real_news_topics)
fake = [0] * len(fake_news_topics)

d = {'text': real_news, 'topic': real_news_topics,'label': real}
df1 = pd.DataFrame(data=d)
d = {'text': fake_news, 'topic': fake_news_topics,'label': fake}
df2 = pd.DataFrame(data=d)

Test_data = pd.concat([df1,df2])
df = pd.concat([Train_data,Test_data])

df['lengh_text'] = df['text'].apply(len_text_no_punc)
df = df.sort_values(by=['lengh_text'], ascending=True)
df = df.reset_index(drop=True)
df = df[df['lengh_text']>=6] # only keep text with len more than 6 because n char
df.head(10)

Unnamed: 0,text,topic,label,lengh_text
0,نئی دہلی 0 ڈسمبر(ایجنسی) عالمی بازار کے مثبت س...,bus,1,308
1,﻿\nآنجہانی برطانوی رائٹر کے Jodi Picoltناول ...,sbz,0,313
2,شیئر کریں:\n\nآنجہانی برطانوی رائٹرP. L. Trav...,sbz,1,326
3,﻿\nلوکا گواڈنینو کی پروڈکشن میں بننے والی فلم ...,sbz,0,332
4,\nنئی دہلی 0 ڈسمبر(ایجنسی) عالمی منڈی اور شادی...,bus,0,334
5,لاہور: سرفراز احمد بھی مفت کے ٹکٹ مانگنے والوں...,sp,1,338
6,آئی سی سی کا بڑا اعلان\nڈھاکہ 0 دسمبر (سیاست ڈ...,sp,0,363
7,ممبئی(شوبزڈ یسک) بالی وڈ کے معروف اداکار دھرمن...,sbz,1,372
8,مصر 0 دسمبر(ایجنسی) ایک اداکارہ کے مبینہ طور س...,sbz,1,380
9,\nمصر 0 دسمبر(ایجنسی) ایک اداکارہ کے بوائے فری...,sbz,0,382


In [7]:
df = pd.concat([df, pd.get_dummies(df['topic'])], axis=1)  
classe = df['label']
df = df.drop(["label","topic"], axis=1)
BET_df = pd.concat([df,classe], axis=1)
BET_df.head(10)

Unnamed: 0,text,lengh_text,bus,hlth,sbz,sp,tch,label
0,نئی دہلی 0 ڈسمبر(ایجنسی) عالمی بازار کے مثبت س...,308,1,0,0,0,0,1
1,﻿\nآنجہانی برطانوی رائٹر کے Jodi Picoltناول ...,313,0,0,1,0,0,0
2,شیئر کریں:\n\nآنجہانی برطانوی رائٹرP. L. Trav...,326,0,0,1,0,0,1
3,﻿\nلوکا گواڈنینو کی پروڈکشن میں بننے والی فلم ...,332,0,0,1,0,0,0
4,\nنئی دہلی 0 ڈسمبر(ایجنسی) عالمی منڈی اور شادی...,334,1,0,0,0,0,0
5,لاہور: سرفراز احمد بھی مفت کے ٹکٹ مانگنے والوں...,338,0,0,0,1,0,1
6,آئی سی سی کا بڑا اعلان\nڈھاکہ 0 دسمبر (سیاست ڈ...,363,0,0,0,1,0,0
7,ممبئی(شوبزڈ یسک) بالی وڈ کے معروف اداکار دھرمن...,372,0,0,1,0,0,1
8,مصر 0 دسمبر(ایجنسی) ایک اداکارہ کے مبینہ طور س...,380,0,0,1,0,0,1
9,\nمصر 0 دسمبر(ایجنسی) ایک اداکارہ کے بوائے فری...,382,0,0,1,0,0,0


In [8]:
x = df[['lengh_text']].values.astype(float)
min_max_scaler = preprocessing.StandardScaler()
df['lengh_text'] = min_max_scaler.fit_transform(x)
df.head(10)

Unnamed: 0,text,lengh_text,bus,hlth,sbz,sp,tch
0,نئی دہلی 0 ڈسمبر(ایجنسی) عالمی بازار کے مثبت س...,-1.057498,1,0,0,0,0
1,﻿\nآنجہانی برطانوی رائٹر کے Jodi Picoltناول ...,-1.053373,0,0,1,0,0
2,شیئر کریں:\n\nآنجہانی برطانوی رائٹرP. L. Trav...,-1.042645,0,0,1,0,0
3,﻿\nلوکا گواڈنینو کی پروڈکشن میں بننے والی فلم ...,-1.037694,0,0,1,0,0
4,\nنئی دہلی 0 ڈسمبر(ایجنسی) عالمی منڈی اور شادی...,-1.036044,1,0,0,0,0
5,لاہور: سرفراز احمد بھی مفت کے ٹکٹ مانگنے والوں...,-1.032743,0,0,0,1,0
6,آئی سی سی کا بڑا اعلان\nڈھاکہ 0 دسمبر (سیاست ڈ...,-1.012113,0,0,0,1,0
7,ممبئی(شوبزڈ یسک) بالی وڈ کے معروف اداکار دھرمن...,-1.004686,0,0,1,0,0
8,مصر 0 دسمبر(ایجنسی) ایک اداکارہ کے مبینہ طور س...,-0.998085,0,0,1,0,0
9,\nمصر 0 دسمبر(ایجنسی) ایک اداکارہ کے بوائے فری...,-0.996435,0,0,1,0,0


In [9]:
UFN_df.label.value_counts(), BET_df.label.value_counts()

(0    1032
 1     968
 Name: label, dtype: int64,
 1    500
 0    400
 Name: label, dtype: int64)

### 3. Split the datasets in train and test

In [10]:
X_trainU, X_testU, y_trainU, y_testU = train_test_split(
    UFN_df.loc[:, UFN_df.columns != 'label'], 
    UFN_df.loc[:, UFN_df.columns == 'label'], 
    test_size = 0.35, 
    random_state = 0) 

In [11]:
X_trainB, X_testB, y_trainB, y_testB = train_test_split(
    BET_df.loc[:, BET_df.columns != 'label'], 
    BET_df.loc[:, BET_df.columns == 'label'], 
    test_size = 0.35, 
    random_state = 0) 

In [12]:
X_trainB.shape, X_testB.shape, y_trainB.shape, y_testB.shape

((585, 7), (315, 7), (585, 1), (315, 1))

### 4. Define the Feature Extractor Class

In [13]:
class FeatureExtractor: 
    """Feature extraction"""
    
    def __init__(self, cnvalues, wnvalues, fnvalues): 
        self.cnvalues = cnvalues
        self.wnvalues = wnvalues
        self.fnvalues = fnvalues
        self.vectorizer = CountVectorizer(lowercase=False, min_df=2, tokenizer=lambda x: x.split('&%$')) #--> we can change this
    
    def process_texts(self,texts,cn,wn,fn):
        occurrences=defaultdict(int)
        featuresList=[]
        featuresDict=Counter()
        for (text) in texts:
            features=self.extract_features(text,cn,wn,fn)
            featuresDict.update(features)
            featuresList.append('&%$'.join(features))
        return featuresList, featuresDict
       
    def fit_extract(self, train_texts):
        train_features, dicOfFeatures = self.process_texts(train_texts, self.cnvalues, self.wnvalues, self.fnvalues)
        train_data = self.vectorizer.fit_transform(train_features)
        train_data = train_data.astype(float)
        return train_data, dicOfFeatures
    
    def extract(self, test_texts): 
        test_features, dicOfFeaturesTest = self.process_texts(test_texts, self.cnvalues, self.wnvalues, self.fnvalues)
        test_data = self.vectorizer.transform(test_features)
        test_data = test_data.astype(float)
        return test_data, dicOfFeaturesTest 
    
    def wordNgrams(self,text, n):
        ngrams = []
        text = [word for word in text.split() if word not in string.punctuation]
        ngrams = [' '.join(text[i:i+n])+'' for i in range(len(text)-n+1)]
        return ngrams
    
    def charNgrams(self,text, n):
        ngrams = []
        ngrams = [text[i:i+n] for i in range(len(text)-n+1)]
        return ngrams
    #Extracts function words n-grams with a pre-loaded dictionary
    def funcNgrams(self,text, n):
        stop_words = self.load_diccionario('stop_words.txt')
        patt=r'\b(' + ('|'.join(re.escape(key) for key in stop_words)).lstrip('|') + r')\b'
        pattern = re.compile(patt)
        text = re.sub(r"(\n+|\r+|(\r\n)+)", " ", text)
        text = re.sub(r" +", " ", text)
        text = re.sub(r"’", "'", text)
        text = re.sub(r"[" + string.punctuation + "]*", "", text)
        terms = pattern.findall(text)
        n_grams=[('_'.join(terms[i:i+n])) + "_fwn" for i in range(len(terms)-n+1)]
        return n_grams
    
    def extract_features(self,text,cn,wn,fn):
        text = text.lower()
        features = []
        if(cn>0):
            features.extend(self.charNgrams(text,cn))
        if(wn>0):
            features.extend(self.wordNgrams(text,wn))
        if(fn>0):
            features.extend(self.funcNgrams(text,fn))
        return features
    
    # Extracts all features in a set of 'texts' and return as a string separated with the simbol '&%$'

    def load_diccionario(self,ruta):
        terms = set()#Dictionary of slangs
        try:
            tmp = open(ruta, "r", encoding='utf8')     
            while True :
                linea = tmp.readline()                                                                                   
                #linea = to_unicode(linea) 
                if (not linea) or (linea == ""):                                                                               
                    break;                                                                                                      
                linea = linea.rstrip()
                terms.add(linea.lower())
            return (terms)
        except IOError as e:
            print ("Error: "+ruta+" I/O error({0}): {1}".format(e.errno, e.strerror))
            exit(1)
            
    def apply_frequency_threshold(self,feature_mx, N=5):
        values=np.array(feature_mx.sum(axis=0)).ravel()
        thresholdMask=(values >= N)*1
        indices_zero = list(np.nonzero(thresholdMask == 0)[0])
        all_cols = np.arange(feature_mx.shape[1])
        cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, indices_zero)))[0]
        return cols_to_keep 

### 5. Define weighting schemes

In [14]:
def get_weighting_scheme(feature_weight_name): 
    
    if feature_weight_name == 'binary':
        #print ("feature_weight = binary")
        return preprocessing.Binarizer()
        
    elif feature_weight_name == 'tfidf':
        #print ("feature_weight = tfidf")
        return TfidfTransformer()

### 6. Feature Extraction

In [15]:
feat_extractor = FeatureExtractor(2, 1, 2)  # char-ngrams, word-ngrams, functional-ngrams

def get_features(df_train, df_test):
    train_feature, _ = feat_extractor.fit_extract(df_train['text'])
    test_feature, _ = feat_extractor.extract(df_test['text']) 

    #applying the frequency threshold
    cols_to_keep = feat_extractor.apply_frequency_threshold(train_feature)
    thresholded_train_f = train_feature[:, cols_to_keep]
    thresholded_test_f = test_feature[:, cols_to_keep]
    
    return thresholded_train_f, thresholded_test_f

### 7. Results for Binary features and feature selection on them for UFN Dataset

In [16]:
train_X, test_X = get_features(X_trainU, X_testU)
train_y, test_y = y_trainU.values.reshape(-1), y_testU.values.reshape(-1)



In [17]:
transformer = get_weighting_scheme('binary')

In [18]:
weighted_train_X = transformer.fit_transform(train_X)
weighted_test_X = transformer.transform(test_X)

In [19]:
clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
clfAB.fit(weighted_train_X, train_y)

In [20]:
y_hat = clfAB.predict(weighted_test_X)
print('Acc:', accuracy_score(test_y, y_hat))

Acc: 0.89


In [21]:
weighted_train_X.shape, train_y.shape

((1300, 22121), (1300,))

In [22]:
from feature_selection import FeatureSelection

fs = FeatureSelection()

fs_metrics = [fs.acc2, fs.ndm, fs.bns, fs.odds_ratio, fs.gini, fs.dfs, fs.IG, fs.ChiSquare ]
fs_metric_names = [x.__name__ for x in fs_metrics]

results_df = pd.DataFrame(columns=fs_metric_names) 

no_of_terms = [100, 200, 300, 400, 500, 1000, 1500, 2000]

for k_val in no_of_terms:
    row = []
    for metric in fs_metrics:
        selector = SelectKBest(score_func=partial(metric), k=k_val)
        subset_train_X = selector.fit_transform(weighted_train_X, train_y)
        subset_test_X = selector.transform(weighted_test_X)
        clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
        clfAB.fit(subset_train_X, train_y)
        y_hat = clfAB.predict(subset_test_X)
        row.append(accuracy_score(test_y, y_hat))
    
    results_df.loc[len(results_df),] = row
    print('Done for features: ', k_val)

  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  100


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  200


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  300


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  400


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  500


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  1000


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  1500


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  2000


In [23]:
results_df

Unnamed: 0,acc2,ndm,bns,odds_ratio,gini,dfs,IG,ChiSquare
0,0.805714,0.778571,0.768571,0.631429,0.547143,0.851429,0.837143,0.835714
1,0.842857,0.778571,0.802857,0.652857,0.63,0.865714,0.86,0.845714
2,0.854286,0.8,0.831429,0.644286,0.715714,0.865714,0.865714,0.864286
3,0.845714,0.802857,0.835714,0.665714,0.757143,0.862857,0.864286,0.865714
4,0.861429,0.812857,0.851429,0.668571,0.807143,0.88,0.868571,0.871429
5,0.877143,0.854286,0.858571,0.725714,0.84,0.888571,0.887143,0.884286
6,0.888571,0.858571,0.87,0.741429,0.865714,0.888571,0.881429,0.888571
7,0.895714,0.851429,0.875714,0.751429,0.88,0.878571,0.882857,0.882857


In [24]:
results_df.describe()

Unnamed: 0,acc2,ndm,bns,odds_ratio,gini,dfs,IG,ChiSquare
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
unique,8.0,7.0,8.0,8.0,8.0,6.0,8.0,8.0
top,0.805714,0.778571,0.768571,0.631429,0.547143,0.865714,0.837143,0.835714
freq,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0


### 8. Results for Binary Features and Feature Selection for BET Dataset

In [25]:
train_X, test_X = get_features(X_trainB, X_testB)
train_y, test_y = y_trainB.values.reshape(-1), y_testB.values.reshape(-1)
train_X.shape
transformer = get_weighting_scheme('binary')

weighted_train_X = transformer.fit_transform(train_X)
weighted_test_X = transformer.transform(test_X)



In [26]:
# perform classification
clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
clfAB.fit(weighted_train_X, train_y)

y_hat = clfAB.predict(weighted_test_X)
print('Acc:', accuracy_score(test_y, y_hat))

Acc: 0.8571428571428571


In [27]:
from feature_selection import FeatureSelection

fs = FeatureSelection()

fs_metrics = [fs.acc2, fs.ndm, fs.bns, fs.odds_ratio, fs.gini, fs.dfs, fs.IG, fs.ChiSquare ]
fs_metric_names = [x.__name__ for x in fs_metrics]

results_df1 = pd.DataFrame(columns=fs_metric_names)

no_of_terms = [100, 200, 300, 400, 500, 1000, 1500, 2000]

for k_val in no_of_terms:
    row = []
    for metric in fs_metrics:
        selector = SelectKBest(score_func=partial(metric), k=k_val)
        subset_train_X = selector.fit_transform(weighted_train_X, train_y)
        subset_test_X = selector.transform(weighted_test_X)
        clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
        clfAB.fit(subset_train_X, train_y)
        y_hat = clfAB.predict(subset_test_X)
        row.append(accuracy_score(test_y, y_hat))
    
    results_df1.loc[len(results_df1),] = row
    print('Done for features: ', k_val)

  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  100


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  200


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  300


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  400


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  500


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  1000


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  1500


  tn = self.tn / (self.tn + self.fn)
  fn = self.fn / (self.tn + self.fn)
  return (self.tp * self.tn - self.fn * self.fp)**2 / den


Done for features:  2000


In [28]:
results_df1

Unnamed: 0,acc2,ndm,bns,odds_ratio,gini,dfs,IG,ChiSquare
0,0.892063,0.692063,0.847619,0.742857,0.507937,0.87619,0.879365,0.87619
1,0.873016,0.809524,0.777778,0.8,0.803175,0.860317,0.84127,0.847619
2,0.88254,0.819048,0.755556,0.761905,0.812698,0.853968,0.850794,0.850794
3,0.873016,0.831746,0.730159,0.784127,0.838095,0.866667,0.831746,0.863492
4,0.88254,0.847619,0.765079,0.787302,0.863492,0.866667,0.869841,0.873016
5,0.87619,0.84127,0.853968,0.822222,0.863492,0.869841,0.869841,0.869841
6,0.860317,0.847619,0.853968,0.834921,0.866667,0.847619,0.873016,0.857143
7,0.863492,0.84127,0.84127,0.838095,0.850794,0.853968,0.869841,0.869841


In [29]:
results_df1.describe()

Unnamed: 0,acc2,ndm,bns,odds_ratio,gini,dfs,IG,ChiSquare
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
unique,6.0,6.0,7.0,8.0,7.0,6.0,6.0,7.0
top,0.873016,0.847619,0.853968,0.742857,0.863492,0.853968,0.869841,0.869841
freq,2.0,2.0,2.0,1.0,2.0,2.0,3.0,2.0


### 9. Results on the Proposed Approach

In [30]:
def effect_size_score(X, y):
    
        classes = list(np.unique(y))

        positive = X[y==classes[0]].copy()
        negative = X[y==classes[1]].copy()

        positive = positive.toarray()
        negative = negative.toarray()
        
        pos_mean = positive.mean(axis=0)
        neg_mean = negative.mean(axis=0)
        
        pos_std = positive.std(axis=0)
        neg_std = negative.std(axis=0)

        es_val = np.abs(pos_mean - neg_mean) / (pos_std + neg_std)
        
        return es_val

In [31]:
# Urdu Fake News Dataset
train_X, test_X = get_features(X_trainU, X_testU)
train_y, test_y = y_trainU.values.reshape(-1), y_testU.values.reshape(-1)
transformer = get_weighting_scheme('tfidf')
weighted_train_X = transformer.fit_transform(train_X)
weighted_test_X = transformer.transform(test_X)



In [32]:
# perform classification
clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
clfAB.fit(weighted_train_X, train_y)

y_hat = clfAB.predict(weighted_test_X)
print('Acc:', accuracy_score(test_y, y_hat))

Acc: 0.8928571428571429


In [33]:
# perform classification

no_of_terms = [100, 200, 300, 400, 500, 1000, 1500, 2000]

for k_val in no_of_terms:
    selector = SelectKBest(score_func=partial(effect_size_score), k=k_val)
    subset_train_X = selector.fit_transform(weighted_train_X, train_y)
    subset_test_X = selector.transform(weighted_test_X)

    clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
    clfAB.fit(subset_train_X, train_y)

    y_hat = clfAB.predict(subset_test_X)
    print('K Val:', k_val, 'Acc:', accuracy_score(test_y, y_hat))

K Val: 100 Acc: 0.8785714285714286
K Val: 200 Acc: 0.8942857142857142
K Val: 300 Acc: 0.9014285714285715
K Val: 400 Acc: 0.8857142857142857
K Val: 500 Acc: 0.8928571428571429
K Val: 1000 Acc: 0.8914285714285715
K Val: 1500 Acc: 0.8885714285714286
K Val: 2000 Acc: 0.8885714285714286


In [34]:
# Bend the Truth Dataset
train_X, test_X = get_features(X_trainB, X_testB)
train_y, test_y = y_trainB.values.reshape(-1), y_testB.values.reshape(-1)
transformer = get_weighting_scheme('tfidf')
weighted_train_X = transformer.fit_transform(train_X)
weighted_test_X = transformer.transform(test_X)



In [35]:
# perform classification
clfAB = AdaBoostClassifier(n_estimators=300, learning_rate=.1,  random_state=0)
clfAB.fit(weighted_train_X, train_y)

y_hat = clfAB.predict(weighted_test_X)
print('Acc:', accuracy_score(test_y, y_hat))

Acc: 0.8634920634920635


In [36]:
no_of_terms = [100, 200, 300, 400, 500, 1000, 1500, 2000, 2500, 3000]

for k_val in no_of_terms:
    selector = SelectKBest(score_func=partial(effect_size_score), k=k_val)
    subset_train_X = selector.fit_transform(weighted_train_X, train_y)
    subset_test_X = selector.transform(weighted_test_X)

    clfAB = AdaBoostClassifier(n_estimators=200, learning_rate=.1,  random_state=0)
    clfAB.fit(subset_train_X, train_y)

    y_hat = clfAB.predict(subset_test_X)
    print('K Val:', k_val, 'Acc:', accuracy_score(test_y, y_hat))

K Val: 100 Acc: 0.873015873015873
K Val: 200 Acc: 0.8793650793650793
K Val: 300 Acc: 0.8603174603174604
K Val: 400 Acc: 0.8634920634920635
K Val: 500 Acc: 0.8698412698412699
K Val: 1000 Acc: 0.8507936507936508
K Val: 1500 Acc: 0.8539682539682539
K Val: 2000 Acc: 0.8507936507936508
K Val: 2500 Acc: 0.8634920634920635
K Val: 3000 Acc: 0.8698412698412699
