In [66]:
import csv
import pandas as pd
import random
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import re
import nltk
import string
from BeautifulSoup import BeautifulSoup
import urllib, urllib2
from nltk.util import ngrams

from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

from scipy.sparse import coo_matrix, hstack

In [106]:

class FeatureExtractor:
    @classmethod
    def get_shallow_POS(self, data):
        """
        Get shallow part of speech tags to identify the sentence's grammatical structure
        """
        pos_tags = []
        for sentence in data:
            pos_tags.append(Counter(nltk.pos_tag(sentence)))
        
        return pos_tags
    
    @classmethod
    def get_ngrams(self, data, n):
        """
        Get ngram feature vector which helps to get the context of the sentence 
        """
        ngram_vect = []
        for sentence in data:
            ngram_vect.append(ngrams(Utilities.stem_text(sentence), n))
        return ngram_vect
    
    @classmethod
    def get_tf_idf(self, data):
        """
        Get tf_idf which represents the characteristic words in fake and true news classes
        """
        data_tokens = Utilities.stem_text(data)
        tfidf = TfidfVectorizer(min_df=1)
        tfidf_vect = tfidf.fit(Counter(data_tokens)).transform(data)
        return tfidf_vect
    
    @classmethod
    def get_online_relevance_score(self, data):
        """
        Get online relevance score which is the jaccardian similarity between the given data item and bing search results
        """
        online_relevance_score = []
        for sentence in data:
            bing_results = Scraper.bing_search(sentence)
            similarity_sum = 0
            for i in range(len(bing_results)) :
                results_text = Utilities.stem_text(nltk.word_tokenize(bing_results[i]))
                similarity_sum += Algorithms.get_jaccard_similarity(sentence,results_text)
            similarity_sum = similarity_sum*1.0/(len(bing_results))
            online_relevance_score.append(similarity_sum)
        return online_relevance_score
        
    @classmethod    
    def get_feature_vectors(self, data):
        """
        Translates data to feature vectors
        """
        ngram_vector = FeatureExtractor.get_ngrams(data, n=2)
        tfidf_vector = FeatureExtractor.get_tf_idf(data)
        shallow_pos_vector = FeatureExtractor.get_shallow_POS(data)
        online_relevance_score_vector = FeatureExtractor.get_online_relevance_score(data)
        features = hstack([ngram_vector, tfidf_vector, shallow_pos_vector, online_relevance_score_vector])
        return features
    

In [100]:
class RandomForest():
    def __init__(self, nof_trees=10):
        self.model = RandomForestClassifier(n_estimators=nof_trees)
        
    def train(self, training_data, training_label):
        """
        Trains the RF model using the features defined under FeatureExtractor
        """
        training_feature_vector = FeatureExtractor.get_feature_vectors(training_data)
        self.model.fit(training_feature_vector, training_label) 
        return self.model
    
    def predict(self, test_data):
        """
        Predicts the output (fake or not) using the trained RF model 
        """
        test_feature_vector = FeatureExtractor.get_feature_vectors(test_data)
        predict_test = self.model.predict(test_data)
        return predict_test
    
    def get_overall_accuracy(self, prediction, test_label):
        """
        Computes the overall accuracy of the model
        """
        accuracy = 0
        for i in xrange(len(prediction)):
            if prediction[i] == test_label[i]:
                accuracy += 1
        return accuracy*1.0/len(prediction)
    
    def get_classification_accuracy(self, prediction, test_label):
        """
        Computes classification accuracy which explains the accuracy of the model for each class - Fake news and True news
        """
        TP = 0
        TN = 0
        POS = 0
        NEG = 0

        for i in xrange(len(prediction)):
            if prediction[i] == test_label[i] and prediction[i] == 1:
                TP += 1
            elif prediction[i] == test_label[i] and prediction[i] == 0:
                TN += 1
            if prediction[i] == 1:
                POS += 1
            if prediction[i] == 0:
                NEG += 1

        TPR = TP*1.0/(POS)
        TNR = TN*1.0/(NEG)

        class_accuracy = (TP+TN)*1.0/(len(prediction))
        return (TPR, TNR, class_accuracy)
    
    

In [101]:
class Utilities:
    @classmethod
    def is_ascii(self, s):
        """
        Checks if the string is ASCII or not
        """
        return all(ord(c) < 128 for c in s)
    
    @classmethod
    def read_file(self, file, label):
        data = []
        with open(file) as f:
            for line in f:
                data.append({"title":line, "label": label})
        return pd.DataFrame(data)
    
    @classmethod
    def read_CSV(self, file, delimit, label):
        df = pd.read_csv(file, delimiter=delimit, encoding="utf-8")
        df['label'] = label
        return df[["title", "label"]]
    
    @classmethod
    def stem_text(self, text):
        text = text.replace('\n','')
        for punc in string.punctuation:
            text = text.replace(punc, '')
        for num in "0123456789":
            text = text.replace(num, "")
        tokenized_text = nltk.word_tokenize(text)
        stemmed_text = [x.lower() for x in tokenized_text if x.lower() not in stopwords.words("english")]
        return stemmed_text
            
    
    

In [102]:

class DatasetProcessor:
    """
    Processes the true and fake news files and generate training and test data set as specified by "SPLIT"
    """
    SPLIT = 0.20
    
    @classmethod
    def get_cleaned_data(self, data, index):
        """
        Data cleaning to remove irrelevant data items for example,
        To remove all the float type data which gives no information on textual features
        """
        if type(data['title'].iloc[index]) != type(0.5) and Utilities.is_ascii(data['title'].iloc[index]):
            text = ''.join(k for k in data['title'].iloc[index] if not k.isdigit() and type(k) != type(0.5))
            if len(text) > 10:
                return text
        return ""
    
    @classmethod
    def get_dataset(self, true_news_file, fake_news_file):
        true_dataset = []
        fake_dataset = []
        
        for filename, is_csv in true_news_file:
            #True news label is 0
            if is_csv:
                true_dataset.append(Utilities.read_CSV(file=filename, delimit=',', label=0))
            else:
                true_dataset.append(Utilities.read_file(file=filename, label=0))

        for filename, is_csv in fake_news_file:
            #Fake news label is 1
            if is_csv:
                fake_dataset.append(Utilities.read_CSV(file=filename, delimit=',', label=1))
            else:
                fake_dataset.append(Utilities.read_file(file=filename, label=1))
        
        #Generate equal samples of both classes for training to avoid bias towards a particular class
        min_count_in_each_class = min(len(true_dataset), len(fake_dataset))
        dataset = [true_dataset[i] for i in sorted(random.sample(xrange(len(true_dataset)), min_count_in_each_class))]
        dataset += [fake_dataset[i] for i in sorted(random.sample(xrange(len(fake_dataset)), min_count_in_each_class))]
        
        data = pd.concat(dataset)
        data = data.sample(frac=1).reset_index(drop=True)

        training_data = []
        training_label = []
        test_data = []
        test_label = []


        for i in xrange(int(len(data)*DatasetProcessor.SPLIT)+1):
            text = self.get_cleaned_data(data, i)
            if text != "":
                test_data.append(text)
                test_label.append(data['label'].iloc[i])

        for i in xrange(int(len(data)*DatasetProcessor.SPLIT)+1, len(data)):
            text = self.get_cleaned_data(data, i)
            if text != "":
                training_data.append(text)
                training_label.append(data['label'].iloc[i])

        return (training_data, training_label, test_data, test_label)


In [103]:
class Algorithms:
    @classmethod
    def get_jaccard_similarity(self, text1, text2):
        intersection_cardinality = len(set.intersection(*[set(text1), set(text2)]))
        union_cardinality = len(set.union(*[set(text1), set(text2)]))
        return intersection_cardinality/float(union_cardinality)

In [104]:
class Scraper:
    @classmethod
    def bing_search(self, query):
        address = "http://www.bing.com/search?q=%s" % (urllib.quote_plus(query))
        request = urllib2.Request(address, None, {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
        urlfile = urllib2.urlopen(request)
        page = urlfile.read()
        soup = BeautifulSoup(page)
        data = []
        for li in soup.findAll('div', attrs={'class':'b_caption'}):
            p = li.find('p')
            while p.span:
                p.span.extract()
            while p.a:
                p.a.extract()
            while p.p:
                p.p.extract()
            p = str(p)
            p = str(p.decode('ascii',errors='ignore'))
            p = re.sub('<p>&nbsp;&#0183;&#32;' , '',p)
            p = re.sub('<strong>' , '',p)
            p = re.sub('</strong>' , '',p)
            p = re.sub('...</p>','',p)
            p = re.sub('<p>','',p)
            p = re.sub('&quot;' , '' ,p)
            data.append(p)
        return data

In [92]:
class SVM():
    def __init__(self):
        self.model = svm.SVC(gamma=0.001, C=100, kernel='rbf')
    
    def train_SVM(training_data, training_label):
        ngram_train = ngram_vect.fit(Counter(tokens_counter)).transform(training_data)
        tfidf_train = tfidf.fit(Counter(tokens_counter)).transform(training_data)
        X_train = hstack([ngram_train, tfidf_train])
        pos_train = pos_vect.fit(pos_counter).transform(training_data)
        X_train = hstack([X_train, pos_train])
        self.model.fit(X_train, training_label) 
        return self.model
    
    def predict(self, test_data):
        test_feature_vector = FeatureExtractor.get_feature_vectors(test_data)
        predict_test = self.model.predict(test_data)
        return predict_test
    
    def get_overall_accuracy(self, prediction, test_label):
        accuracy = 0
        for i in xrange(len(prediction)):
            if prediction[i] == test_label[i]:
                accuracy += 1
        return accuracy*1.0/len(prediction)
    
    

    


In [93]:
if __name__ == '__main__':
    #Add the files in the following format - (filename, True)  - if its CSV 
    #otherwise (filename, False)
    TRUE_NEWS = [("../data/true-headlines",False)]
    FAKE_NEWS = [("../data/fake.csv", True), ("../data/fake-news", False)]
    
    (training_data, training_label, test_data, test_label) = DatasetProcessor.get_dataset(TRUE_NEWS, FAKE_NEWS)
    
    


In [111]:
    count_1 = 0
    count_0 = 0
    
    for i in training_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Training label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    count_1 = 0
    count_0 = 0
    
    for i in test_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Test label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    
    

Training label: Count_0 =  39863  and count_1 =  32723
Test label: Count_0 =  8045  and count_1 =  10104


In [None]:
    rf = RandomForest()
    rf.train(training_data, training_label)
    prediction = rf.predict(test_data)

In [109]:
    TPR, TNR, class_accuracy = get_classification_accuracy(test_label, prediction)
    print "TPR = ", TPR, " TNR = ", TNR, " Classification accuracy = ", class_accuracy

TPR =  0.846013404206  TNR =  0.986572438163  Classification accuracy =  0.945835942392
