In [226]:
import csv
import pandas as pd
import random
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import re
import nltk
import string

from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

from scipy.sparse import coo_matrix, hstack

In [227]:
# Fraction of the shuffled dataset that get_dataset() reserves for the test split.
SPLIT = 0.20

In [228]:
def extract_lines(file, label):
    """Read a text file into a DataFrame with one row per line.

    Every row stores the raw line (its trailing newline is kept) under
    ``title`` and the supplied ``label`` value under ``label``.
    """
    with open(file) as handle:
        # The caller decides the class: the notebook passes 0 for legitimate
        # news files and 1 for fake news files.
        records = [{"title": raw_line, "label": label} for raw_line in handle]
    return pd.DataFrame(records)



In [229]:
def extract_CSV(file, delimit, label):
    """Load a UTF-8 delimited file and return its titles tagged with ``label``.

    The file is parsed with ``delimit`` as the field separator.  The result
    has exactly two columns, ``title`` (taken from the file) and ``label``
    (the supplied value repeated on every row).
    """
    frame = pd.read_csv(file, delimiter=delimit, encoding="utf-8")
    # The notebook passes label 1 here to mark the rows as fake news.
    frame["label"] = label
    wanted_columns = ["title", "label"]
    return frame[wanted_columns]
    

In [230]:
def get_tokens(training_data):
    """Return the cleaned word tokens of every sentence in ``training_data``.

    Each sentence is split on single spaces.  Every word is lower-cased and
    stripped of newlines, ASCII punctuation and digits.  A word is kept when
    the result is non-empty, is not an English stop word and contains only
    ASCII characters.

    Returns a flat list of ``str`` tokens in input order (duplicates kept).
    """
    # Build the stop-word set once: calling stopwords.words() for every word
    # repeats the lookup and scans a list each time.
    stop_words = set(stopwords.words('english'))
    # Characters removed from every word.
    dropped_chars = set('\n' + string.punctuation + "0123456789")

    tokens = []
    for sentence in training_data:
        for word in sentence.split(' '):
            lowers = ''.join(ch for ch in word.lower() if ch not in dropped_chars)
            if lowers and lowers not in stop_words and is_ascii(lowers):
                tokens.append(str(lowers))

    return tokens

In [231]:
def get_shallow_POS(training_data):
    """Count part-of-speech tags over every sentence in ``training_data``.

    Each sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``.  A tag that occurs as a substring of ``string.punctuation``
    (for example ``.``, ``,`` or ``:``) is counted under the single key
    ``"PUNCT"``; every other tag is counted under its own name.

    Returns a ``Counter`` mapping tag -> total occurrences.
    """
    tag_counts = Counter()
    for sentence in training_data:
        # Bug fix: tag the sentence being iterated.  The original passed the
        # name ``text``, which is not defined in this function.
        tagged = pos_tag(word_tokenize(sentence))
        # Accumulate in place instead of summing a list of per-sentence Counters.
        tag_counts.update(
            tag if tag not in string.punctuation else "PUNCT"
            for _, tag in tagged
        )

    return tag_counts

In [232]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

def _collect_clean_rows(frame, start, stop):
    """Return ``(texts, labels)`` for the usable rows of ``frame`` in ``[start, stop)``."""
    texts = []
    labels = []
    for i in range(start, stop):
        title = frame['title'].iloc[i]
        # Float titles carry no textual information, and non-ASCII titles
        # are excluded as well.
        if isinstance(title, float) or not is_ascii(title):
            continue
        text = ''.join(ch for ch in title if not ch.isdigit())
        # Very short titles (10 characters or fewer once digits are removed)
        # are dropped.
        if len(text) > 10:
            texts.append(text)
            labels.append(frame['label'].iloc[i])
    return texts, labels


def get_dataset(fake_news_file, true_news_file, true_news_file2, fake_news_file2, random_state=None):
    """Load, shuffle and split the headline corpora into train and test sets.

    Parameters:
        fake_news_file:  comma-delimited CSV with a ``title`` column, labelled 1.
        true_news_file:  text file with one headline per line, labelled 0.
        true_news_file2: accepted for interface compatibility; it is not read
                         by the current source combination.
        fake_news_file2: text file with one headline per line, labelled 1.
        random_state:    optional seed forwarded to ``DataFrame.sample`` so the
                         shuffle can be reproduced.  The default ``None`` keeps
                         the original unseeded behaviour.

    The first ``int(len(data) * SPLIT) + 1`` shuffled rows (capped at the
    number of rows) form the test split and the remaining rows the training
    split.  Within each split, float titles, non-ASCII titles and titles of
    10 characters or fewer after digit removal are discarded.

    Returns ``(training_data, training_label, test_data, test_label)``.
    """
    data = pd.concat([
        extract_CSV(fake_news_file, ',', 1),
        extract_lines(true_news_file, 0),
        extract_lines(fake_news_file2, 1),
    ])
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Cap the boundary so an empty dataset yields empty splits instead of an
    # IndexError on row 0.
    n_test = min(int(len(data) * SPLIT) + 1, len(data))

    test_data, test_label = _collect_clean_rows(data, 0, n_test)
    training_data, training_label = _collect_clean_rows(data, n_test, len(data))

    return (training_data, training_label, test_data, test_label)





In [233]:
def train_SVM(ngram_vect, tokens_counter, tfidf, pos_vect, pos_counter, training_data, training_label):
    """Fit the three vectorizers and train an SVM on their stacked features.

    ``ngram_vect`` and ``tfidf`` are each fitted on ``Counter(tokens_counter)``
    and ``pos_vect`` on ``pos_counter``.  The vectorizers are fitted in place,
    so the same objects can later transform test data.  ``training_data`` is
    transformed by all three and the matrices are stacked column-wise in the
    order n-gram, tf-idf, POS.

    Returns the ``svm.SVC(gamma=0.001, C=100)`` instance fitted on the stacked
    matrix and ``training_label``.
    """
    ngram_features = ngram_vect.fit(Counter(tokens_counter)).transform(training_data)
    tfidf_features = tfidf.fit(Counter(tokens_counter)).transform(training_data)
    pos_features = pos_vect.fit(pos_counter).transform(training_data)

    stacked_features = hstack([ngram_features, tfidf_features, pos_features])

    classifier = svm.SVC(gamma=0.001, C=100)
    classifier.fit(stacked_features, training_label)
    return classifier



In [234]:
def test_SVM(ngram_vect, tfidf, pos_vect, clf, test_data):
    
    ngram_test = ngram_vect.transform(test_data)
    tfidf_test = tfidf.transform(test_data)
    X_test = hstack([ngram_test, tfidf_test])
    pos_test = pos_vect.transform(test_data)
    X_test = hstack([X_test, pos_test])
    predict_test = clf.predict(X_test)
    return predict_test

In [235]:
def accuracy(predict, test_label):
    """Return the fraction of positions where ``predict`` equals ``test_label``.

    Both arguments are indexed by position ``0 .. len(predict) - 1``.
    Returns a float in ``[0, 1]``.  An empty ``predict`` yields ``0.0``
    instead of raising ``ZeroDivisionError``.
    """
    total = len(predict)
    if total == 0:
        return 0.0
    matches = sum(1 for i in range(total) if predict[i] == test_label[i])
    return matches * 1.0 / total

In [236]:
def get_classification_accuracy(predict, test_label):
    """Return ``(TPR, TNR, class_accuracy)`` for two aligned 0/1 label sequences.

    ``TP`` / ``TN`` count the positions where both sequences agree on 1 / 0.
    The rates divide by how often 1 / 0 occurs in the FIRST argument:

        TPR = TP / count(predict == 1)
        TNR = TN / count(predict == 0)

    So when the first argument holds the ground-truth labels (which is how the
    notebook calls this function) the values are the true-positive and
    true-negative rates.  With model predictions as the first argument they
    are the precision and the negative predictive value instead.

    ``class_accuracy`` is ``(TP + TN) / len(predict)``.  Any ratio whose
    denominator is zero is reported as ``0.0`` rather than raising
    ``ZeroDivisionError`` (for example when one class never occurs).
    """
    TP = 0
    TN = 0
    POS = 0
    NEG = 0

    total = len(predict)
    for i in range(total):
        first = predict[i]
        if first == test_label[i]:
            if first == 1:
                TP += 1
            elif first == 0:
                TN += 1
        if first == 1:
            POS += 1
        if first == 0:
            NEG += 1

    TPR = TP * 1.0 / POS if POS else 0.0
    TNR = TN * 1.0 / NEG if NEG else 0.0
    class_accuracy = (TP + TN) * 1.0 / total if total else 0.0
    return (TPR, TNR, class_accuracy)



In [103]:
if __name__ == '__main__':
    # Input corpora, relative to the notebook's working directory.
    FAKE_NEWS_FILE = "../data/fake.csv"
    #TRUE_NEWS_FILE = "../data/trueCorpora.txt"
    # NOTE(review): TRUE_NEWS_FILE2 is passed to get_dataset() below, but the
    # active pd.concat in get_dataset() does not read that parameter.
    TRUE_NEWS_FILE2 = "../data/real-news"
    TRUE_NEWS_FILE = "../data/true-headlines"
    FAKE_NEWS_FILE2 = "../data/fake-news"
    
    # Load, shuffle and split the corpora; the shuffle is unseeded, so the
    # split differs between runs.
    (training_data, training_label, test_data, test_label) = get_dataset(FAKE_NEWS_FILE, TRUE_NEWS_FILE, 
                                                                         TRUE_NEWS_FILE2, FAKE_NEWS_FILE2, 
                                                                         )
    


In [237]:
    # Bag of words and SVM.
    # tokens_counter is the flat list of cleaned tokens returned by
    # get_tokens(); train_SVM() wraps it in a Counter before fitting.
    tokens_counter = get_tokens(training_data)
    # pos_counter is the Counter of POS tags returned by get_shallow_POS().
    pos_counter = get_shallow_POS(training_data)
    # The three vectorizers are fitted inside train_SVM() and reused later by
    # test_SVM() to transform the test split.
    ngram_vect = CountVectorizer(min_df=1)
    tfidf = TfidfVectorizer(min_df=1)
    pos_vect = CountVectorizer(min_df=1)
    clf = train_SVM(ngram_vect, tokens_counter, tfidf, pos_vect, pos_counter, training_data, training_label)
    

In [238]:
    count_1 = 0
    count_0 = 0
    
    for i in training_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Training label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    count_1 = 0
    count_0 = 0
    
    for i in test_label:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    print "Test label: Count_0 = ", count_0, " and count_1 = ", count_1
    
    
    

Training label: Count_0 =  84771  and count_1 =  17235
Test label: Count_0 =  21298  and count_1 =  4282


In [239]:
    # Predict labels for the test split with the vectorizers fitted in train_SVM().
    prediction = test_SVM(ngram_vect, tfidf, pos_vect, clf, test_data)
    
    
    

In [240]:
    count_1 = 0
    count_0 = 0
    
    for i in prediction:
        if i == 1:
            count_1 += 1
        else:
            count_0 += 1
    
    print "Prediction label: Count_0 = ", count_0, " and count_1 = ", count_1

Prediction label: Count_0 =  21857  and count_1 =  3723


In [242]:
    # The ground-truth labels are passed as the function's first parameter
    # (named `predict` in its signature).  Its per-class denominators count
    # over that first argument, so here they count the actual positives and
    # negatives.
    TPR, TNR, class_accuracy = get_classification_accuracy(test_label, prediction)
    print "TPR = ", TPR, " TNR = ", TNR, " Classification accuracy = ", class_accuracy
    print "F1 score = ",f1_score(test_label, prediction)

TPR =  0.807566557683  TNR =  0.987557517138  Classification accuracy =  0.957427677873
F1 score =  0.863960024984


In [20]:
# Read the raw news-corpus file into memory, one string per line (newlines kept).
lines = []
with open('../data/newsCorpora.csv') as f:
    lines = f.readlines()

In [None]:
def normalize_text(s):
    """Lower-case ``s`` and strip punctuation that touches whitespace.

    A non-word character directly after or directly before a whitespace
    character is replaced, together with that whitespace, by a single space.
    Word-internal punctuation such as hyphens and apostrophes is kept.  Each
    substitution is a single pass, so only one character per match is removed.
    Runs of whitespace are finally collapsed to one space.

    Returns the normalized string.
    """
    s = s.lower()

    # Raw-string patterns: '\s' and '\W' are invalid escape sequences in a
    # plain string literal, which newer Python versions warn about.
    # remove punctuation that is not word-internal (e.g., hyphens, apostrophes)
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)

    # make sure we didn't introduce any double spaces
    s = re.sub(r'\s+', ' ', s)

    return s

In [None]:
# Keep the second comma-separated field of each corpus line as a headline
# when it has at least 10 words and contains no '...'.
headlines = []
for line in lines:
    parts = line.split(',')
    if len(parts) < 2:
        continue
    h = normalize_text(parts[1])
    if len(h.split()) < 10 or '...' in h:
        continue
    # Drop any non-ASCII bytes before storing the headline.
    headlines.append(str(h.decode('ascii',errors='ignore')))

In [None]:
# Write one headline per line.  The context manager flushes and closes the
# file even if a write fails; the original handle was never closed.
with open('../data/true-headlines', 'w') as f:
    for h in headlines:
        f.write(h)
        f.write('\n')

In [None]:
# Spot-check one stored headline (raises IndexError if fewer than 101 were kept).
headlines[100]

In [70]:


def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

def get_tokens(training_data):
    """Return the cleaned word tokens of every sentence in ``training_data``.

    Each sentence is split on single spaces.  Every word is lower-cased and
    stripped of newlines, ASCII punctuation and digits.  A word is kept when
    the result is non-empty, is not an English stop word and contains only
    ASCII characters.

    Returns a flat list of ``str`` tokens in input order (duplicates kept).
    """
    # Build the stop-word set once: calling stopwords.words() for every word
    # repeats the lookup and scans a list each time.
    stop_words = set(stopwords.words('english'))
    # Characters removed from every word.
    dropped_chars = set('\n' + string.punctuation + "0123456789")

    tokens = []
    for sentence in training_data:
        for word in sentence.split(' '):
            lowers = ''.join(ch for ch in word.lower() if ch not in dropped_chars)
            if lowers and lowers not in stop_words and is_ascii(lowers):
                tokens.append(str(lowers))

    return tokens

# Token frequencies over the training split; show the 20 most frequent tokens.
# Relies on `training_data` produced by the get_dataset() cell.
tokens = get_tokens(training_data)
count2 = Counter(tokens)
print count2.most_common(20)


[('new', 7757), ('us', 5359), ('video', 3609), ('first', 2775), ('google', 2575), ('one', 2463), ('apple', 2134), ('may', 2124), ('says', 2103), ('galaxy', 2036), ('samsung', 2028), ('watch', 2000), ('day', 1908), ('game', 1789), ('season', 1737), ('star', 1723), ('report', 1721), ('time', 1691), ('get', 1670), ('million', 1660)]


In [71]:
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

# Stem the token list built in the previous cell with NLTK's Porter stemmer.
stemmer = PorterStemmer()
stemmed = stem_tokens(tokens, stemmer)



In [89]:
test_count_vect = CountVectorizer(min_df=1)
test_count_vect.fit(count2)
arr = test_count_vect.transform(training_data)
print arr[0]
print training_data[0]
print test_count_vect.get_feature_names()[5238]
print test_count_vect.get_feature_names()[18677]
print test_count_vect.get_feature_names()[23355]
print test_count_vect.get_feature_names()[30793]
print test_count_vect.get_feature_names()[35858]
print test_count_vect.get_feature_names()[38200]
print test_count_vect.get_feature_names()[39992]

  (0, 5238)	1
  (0, 18677)	1
  (0, 23355)	1
  (0, 30793)	1
  (0, 35858)	1
  (0, 38200)	1
  (0, 39992)	1
JUST IN: Republicans Sued Over Trump’s Call To Intimidate Minority Voters
call
intimidate
minority
republicans
sued
trump
voters


Counter({'NN': 34, 'JJ': 12, 'NNS': 7, 'IN': 7, 'VB': 5, 'DT': 4, 'CC': 4, 'VBG': 3, 'PUNCT': 3, 'TO': 3, 'VBN': 2, 'VBZ': 2, 'POS': 2, 'PRP': 2, 'NNP': 2, 'VBD': 1, 'VBP': 1, 'RP': 1, 'RB': 1, 'CD': 1, 'MD': 1, 'JJS': 1})


In [114]:
# NOTE(review): `X_test` is never assigned at notebook level in the visible
# source (it is only a local inside test_SVM), so this cell depends on hidden
# kernel state and would fail on a fresh Restart & Run All.
tf_idf_matrix = tfidf.transform(X_test)

In [116]:
# Show the non-zero tf-idf entries of the first transformed row.
print tf_idf_matrix[0]

  (0, 39630)	0.434799999722
  (0, 35772)	0.404913551219
  (0, 28808)	0.389363320976
  (0, 19413)	0.33061407499
  (0, 12528)	0.450020335376
  (0, 5863)	0.428446276358


In [167]:
# Scratch cell: count vectorizer configured for unigrams through trigrams.
# NOTE(review): `count` is not defined anywhere in the visible source
# (presumably an earlier token Counter) - confirm before re-running.
vect = CountVectorizer(min_df=1, ngram_range=(1,3))
# The result of fit() is stored under the name X_train; the transformed
# feature matrix is X_train2.
X_train = vect.fit(count)
X_train2 = vect.transform(training_data[0:10])
print X_train2.shape

(10, 40107)


In [168]:
# NOTE(review): neither `pipe` nor `count` is defined in the visible source;
# this cell depends on hidden kernel state.
X_train3 = pipe.fit(count).transform(training_data[0:10])
print X_train3.shape

(10, 40107)


In [174]:

# Stack the two feature matrices column-wise and show the first row.
new_matrix = hstack([X_train2, X_train3])
print new_matrix.getrow(0)

  (0, 78912)	0.408248290464
  (0, 74410)	0.408248290464
  (0, 63784)	0.408248290464
  (0, 60369)	0.408248290464
  (0, 58217)	0.408248290464
  (0, 51375)	0.408248290464
  (0, 38805)	1.0
  (0, 34303)	1.0
  (0, 23677)	1.0
  (0, 20262)	1.0
  (0, 18110)	1.0
  (0, 11268)	1.0


In [215]:
# Dump the frequency of every token returned by get_tokens().
print Counter(tokens_counter)




In [220]:
# Scratch check: fit on the raw token list (no Counter wrapper here, unlike
# train_SVM) and show the features of the first 10 training texts.
# This re-fits the shared `ngram_vect` object in place.
ngram_train = ngram_vect.fit(tokens_counter).transform(training_data[0:10])
print ngram_train


  (0, 11268)	1
  (0, 18110)	1
  (0, 20262)	1
  (0, 23677)	1
  (0, 34303)	1
  (0, 38805)	1
  (1, 12415)	1
  (1, 26150)	1
  (1, 26752)	1
  (1, 31077)	1
  (1, 34498)	1
  (1, 38302)	1
  (1, 38574)	1
  (1, 39519)	1
  (2, 9400)	1
  (2, 18689)	1
  (2, 19129)	1
  (2, 33683)	1
  (2, 34830)	1
  (2, 34886)	1
  (2, 35158)	1
  (2, 36310)	1
  (2, 37777)	1
  (3, 2316)	1
  (3, 6838)	1
  :	:
  (6, 36043)	1
  (7, 7384)	1
  (7, 8769)	1
  (7, 20225)	1
  (7, 33899)	1
  (8, 312)	1
  (8, 1601)	1
  (8, 1649)	1
  (8, 3285)	1
  (8, 14457)	1
  (8, 18110)	1
  (8, 23854)	1
  (8, 26671)	1
  (8, 32219)	1
  (8, 38201)	1
  (9, 11144)	1
  (9, 14318)	1
  (9, 21680)	1
  (9, 25749)	1
  (9, 28785)	1
  (9, 35841)	1
  (9, 36571)	1
  (9, 37013)	1
  (9, 37821)	1
  (9, 39829)	1


In [225]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Scratch check of the tf-idf features for the first 10 training texts.
# This rebinds the notebook-level name `tfidf` to a new vectorizer.
tfidf = TfidfVectorizer(min_df=1)
tfidf_train = tfidf.fit(Counter(tokens_counter)).transform(training_data[0:10])
print tfidf_train

  (0, 38805)	0.408248290464
  (0, 34303)	0.408248290464
  (0, 23677)	0.408248290464
  (0, 20262)	0.408248290464
  (0, 18110)	0.408248290464
  (0, 11268)	0.408248290464
  (1, 39519)	0.353553390593
  (1, 38574)	0.353553390593
  (1, 38302)	0.353553390593
  (1, 34498)	0.353553390593
  (1, 31077)	0.353553390593
  (1, 26752)	0.353553390593
  (1, 26150)	0.353553390593
  (1, 12415)	0.353553390593
  (2, 37777)	0.333333333333
  (2, 36310)	0.333333333333
  (2, 35158)	0.333333333333
  (2, 34886)	0.333333333333
  (2, 34830)	0.333333333333
  (2, 33683)	0.333333333333
  (2, 19129)	0.333333333333
  (2, 18689)	0.333333333333
  (2, 9400)	0.333333333333
  (3, 30888)	0.333333333333
  (3, 30612)	0.333333333333
  :	:
  (6, 12411)	0.377964473009
  (7, 33899)	0.5
  (7, 20225)	0.5
  (7, 8769)	0.5
  (7, 7384)	0.5
  (8, 38201)	0.316227766017
  (8, 32219)	0.316227766017
  (8, 26671)	0.316227766017
  (8, 23854)	0.316227766017
  (8, 18110)	0.316227766017
  (8, 14457)	0.316227766017
  (8, 3285)	0.316227766017
  (8, 