In [104]:
import csv
import pandas as pd
import random
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
import codecs



In [105]:
SPLIT = 0.20

In [106]:
def extract_lines(file, label):
    data = []
    with open(file) as f:
        for line in f:
            #Legitimate news label is 0
            data.append({"text":line, "label": label})
    return pd.DataFrame(data)



In [107]:
def extract_CSV(file, delimit, label):
    df = pd.read_csv(file, delimiter=delimit, encoding="utf-8")
    #Fake News label is 1
    df['label'] = label
    return df[["text", "label"]]
    

In [108]:
class DataSet:
    SPLIT = 0.20
    def __init__(self, fake_news_file, true_news_file, true_news_file2, fake_news_file2):
        self.data = pd.concat([extract_CSV(fake_news_file, ',', 1), extract_lines(true_news_file, 0), 
                               extract_lines(true_news_file2, 0), extract_lines(fake_news_file2, 1)])
        
        self.data = self.data.sample(frac=1).reset_index(drop=True)
        self.training_data = []
        self.training_label = []
        self.test_data = []
        self.test_label = []
    
    def split_dataset(self):
        """
        Remove all irregularities in the data set and split to training and test data
        """
        print "length = ", len(self.data)
        for i in xrange(int(len(self.data)*DataSet.SPLIT)+1):
            # To remove all the float type data which gives no information on textual features
            if type(self.data['text'].iloc[i]) != type(0.5):
                text = ''.join(k for k in self.data['text'].iloc[i] if not k.isdigit() and type(k) != type(0.5))
                if len(text) > 10:
                    self.test_data.append(text)
                    self.test_label.append(self.data['label'].iloc[i])
        
        for i in xrange(int(len(self.data)*DataSet.SPLIT)+1, len(self.data)):
            # To remove all the float type data which gives no information on textual features
            if type(self.data['text'].iloc[i]) != type(0.5):
                text = ''.join(k for k in self.data['text'].iloc[i] if not k.isdigit() and type(k) != type(0.5))
                if len(text) > 10: 
                    self.training_data.append(text)
                    self.training_label.append(self.data['label'].iloc[i])

        

In [109]:
def get_dataset(fake_news_file, true_news_file, true_news_file2, fake_news_file2):
    #data = pd.concat([extract_CSV(fake_news_file, ',', 1), extract_lines(true_news_file, 0), 
    #                           extract_lines(true_news_file2, 0), extract_lines(fake_news_file2, 1)])
    data = pd.concat([extract_lines(true_news_file2, 0), extract_lines(fake_news_file2, 1)])
    data = data.sample(frac=1).reset_index(drop=True)
    
    training_data = []
    training_label = []
    test_data = []
    test_label = []
    
    
    for i in xrange(int(len(data)*SPLIT)+1):
        # To remove all the float type data which gives no information on textual features
        if type(data['text'].iloc[i]) != type(0.5):
            text = ''.join(k for k in data['text'].iloc[i] if not k.isdigit() and type(k) != type(0.5))
            if len(text) > 10:
                test_data.append(text)
                test_label.append(data['label'].iloc[i])
        
    for i in xrange(int(len(data)*SPLIT)+1, len(data)):
        # To remove all the float type data which gives no information on textual features
        if type(data['text'].iloc[i]) != type(0.5):
            text = ''.join(k for k in data['text'].iloc[i] if not k.isdigit() and type(k) != type(0.5))
            if len(text) > 10: 
                training_data.append(text)
                training_label.append(data['label'].iloc[i])
    
    return (training_data, training_label, test_data, test_label)





In [121]:
def train_SVM(count_vect, training_data, training_label):
    X_train = count_vect.fit_transform(training_data)
    svm_instance = svm.SVC(gamma=0.001, C=100)
    svm_instance.fit(X_train, training_label) 
    return svm_instance



In [122]:
def test_SVM(count_vect, clf, test_data):
    X_test = count_vect.transform(test_data) 
    predict_test = clf.predict(X_test)
    return predict_test

In [128]:
def accuracy(predict, test_label):
    accuracy = 0
    for i in xrange(len(predict)):
        if predict[i] == test_label[i]:
            accuracy += 1
    return accuracy*1.0/len(predict)

In [138]:
def get_classification_accuracy(predict, test_label):
    TP = 0
    TN = 0
    
    for i in xrange(len(predict)):
        if predict[i] == test_label[i] and predict[i] == 1:
            TP += 1
        elif predict[i] == test_label[i] and predict[i] == 0:
            TN += 1
    TPR = TP*1.0/(TP+TN)
    TNR = TN*1.0/(TP+TN)
    
    class_accuracy = (TP+TN)*1.0/(len(predict))
    return (TPR, TNR, class_accuracy)



In [124]:
class SVM:
    def __init__(self, dataset):
        self.X_train = []
        self.X_test = []
        self.dataset = dataset
        self.count_vect = CountVectorizer(min_df=1)
        
    def train(self):   
        self.X_train = self.count_vect.fit_transform(self.dataset.training_data)
        clf = svm.SVC(gamma=0.001, C=100)
        clf.fit(self.X_train, self.dataset.training_label) 
    
    def test(self):
        self.X_test = count_vect.transform(self.dataset.test_data)
        #self.X_test = count_vect.transform(["Bala is a sore loser who sucks at playing Flappy Bird but still better than Kumaran"]) 
        predict_test = clf.predict(self.X_test)
        return predict_test

    def accuracy(self, predict):
        accuracy = 0
        for i in xrange(len(predict)):
            if predict[i] == self.dataset.test_label[i]:
                accuracy += 1
        return accuracy*1.0/len(predict)

    

In [125]:
if __name__ == '__main__':
    FAKE_NEWS_FILE = "../data/fake.csv"
    TRUE_NEWS_FILE = "../data/trueCorpora.txt"
    TRUE_NEWS_FILE2 = "../data/real-news"
    FAKE_NEWS_FILE2 = "../data/fake-news"
    
    (training_data, training_label, test_data, test_label) = get_dataset(FAKE_NEWS_FILE, TRUE_NEWS_FILE, 
                                                                         TRUE_NEWS_FILE2, FAKE_NEWS_FILE2)
    


In [126]:
    #Bag of words and SVM
    count_vect = CountVectorizer(min_df=1)
    clf = train_SVM(count_vect, training_data, training_label)
    

In [132]:
    count_1 = 0
    count_0 = 0
    
    for i in training_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Training label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    count_1 = 0
    count_0 = 0
    
    for i in test_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Test label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    
    

Training label: Count_0 =  9206  and count_1 =  10879
Test label: Count_0 =  2319  and count_1 =  2707


In [154]:
    prediction = test_SVM(count_vect, clf, [""])
    print prediction
    
    

[1]


In [None]:
    count_1 = 0
    count_0 = 0
    
    for i in prediction:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    
    print "Prediction label: Count_0 = ", count_0, " and count_1 = ", count_1

In [139]:
    TPR, TNR, class_accuracy = get_classification_accuracy(test_label, prediction)
    print "TPR = ", TPR, " TNR = ", TNR, " Classification accuracy = ", class_accuracy

TPR =  0.553359683794  TNR =  0.446640316206  Classification accuracy =  0.906088340629
