In [10]:

import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import random
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

 
 
def load_data(filename, filename2, seed=None):
    """Load real/fake headlines and build shuffled bag-of-words splits.

    Every line of each input file is treated as one headline (tab-separated
    fields on a line are joined with single spaces). A CountVectorizer is
    fitted on all headlines, and each headline becomes one row of token
    counts with a label appended as the last column (1: real, 0: fake).

    Parameters
    ----------
    filename : str
        Path of the UTF-8 text file holding the real headlines.
    filename2 : str
        Path of the UTF-8 text file holding the fake headlines.
    seed : int or None, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When None (default) the module-level ``random``
        state is used, as before.

    Returns
    -------
    subset_train, subset_val, subset_test : numpy.ndarray
        Row subsets of the (counts + label) matrix: the first 70% of the
        shuffled rows, the next 15%, and the remainder (sizes are floored).
    feature_name : list of str
        Vocabulary terms ordered by their column index in the count matrix.
    """

    def read_headlines(path):
        # The context manager closes the file even if reading fails part-way.
        with open(path, 'r', encoding='UTF8') as handle:
            reader = csv.reader(handle, delimiter='\t', quoting=csv.QUOTE_NONE)
            # One document per line. The earlier code called .split(' |,') on
            # each line, which is a literal separator (not a regex), so lines
            # were already kept whole in practice.
            return [' '.join(row) for row in reader]

    data_real = read_headlines(filename)
    data_fake = read_headlines(filename2)

    # Make Full Feature Dictionary (Vectorizer)
    data_full = data_real + data_fake
    vect = CountVectorizer()
    result = vect.fit_transform(data_full).toarray()  # token counts per sentence
    feature_full = vect.vocabulary_                   # term -> column index

    # Feature names ordered by column index, for visualizing the decision tree.
    feature_name = sorted(feature_full, key=feature_full.get)

    # Real headlines come first in data_full, so they take label 1.
    n_samples = result.shape[0]
    label_all = np.zeros((n_samples, 1))
    label_all[0:len(data_real)] = 1

    # Token counts for each sentence + label (1: Real, 0: Fake)
    dictionary_final = np.concatenate((result, label_all), axis=1)

    # Split Dataset Randomly
    rng = random if seed is None else random.Random(seed)
    idx_all = rng.sample(range(n_samples), n_samples)

    idx_train = math.floor(n_samples * 0.7)
    idx_val = idx_train + math.floor(n_samples * 0.15)

    # 70%: Train, 15%: Validate, 15%: Test
    subset_train = dictionary_final[idx_all[0:idx_train]]
    subset_val = dictionary_final[idx_all[idx_train:idx_val]]
    subset_test = dictionary_final[idx_all[idx_val:]]

    return subset_train, subset_val, subset_test, feature_name


def select_model(train_data, validate_data, test_data):
    """Print validation accuracy for a grid of decision-tree settings.

    For every combination of split criterion and maximum depth, a
    DecisionTreeClassifier is fitted on ``train_data`` and scored on
    ``validate_data``. In both arrays the last column is the label and all
    other columns are features. ``test_data`` is accepted but not used here.
    Nothing is returned; results are printed.
    """
    train_features, train_labels = train_data[:, 0:-1], train_data[:, -1]
    val_features, val_labels = validate_data[:, 0:-1], validate_data[:, -1]

    for split_rule in ("gini", "entropy"):          # Criterion
        for depth in (1, 10, 50, 100, 200, 500):    # Max_depth
            classifier = DecisionTreeClassifier(criterion=split_rule, max_depth=depth)
            classifier = classifier.fit(train_features, train_labels)
            predicted = classifier.predict(val_features)

            # A zero difference means the prediction matches the label.
            n_correct = sum((val_labels - predicted) == 0)
            accuracy = n_correct / len(val_labels) * 100

            print(split_rule + "," + str(depth) + " depth")
            print("Validating Accuracy: " + str(accuracy) + "%\n")

        
# Main: build the shuffled train/validation/test splits (plus the ordered
# feature names) from the real and fake headline files, then print the
# validation accuracy of every criterion/max_depth combination.
# The resulting names are module-level and are reused by later cells.
train_data, validate_data, test_data, name_feature = load_data('clean_real.txt', 'clean_fake.txt')
select_model(train_data, validate_data, test_data)




gini,1 depth
Validating Accuracy: 67.89366053169734%

gini,10 depth
Validating Accuracy: 72.1881390593047%

gini,50 depth
Validating Accuracy: 75.4601226993865%

gini,100 depth
Validating Accuracy: 76.27811860940696%

gini,200 depth
Validating Accuracy: 76.89161554192229%

gini,500 depth
Validating Accuracy: 75.86912065439672%

entropy,1 depth
Validating Accuracy: 67.89366053169734%

entropy,10 depth
Validating Accuracy: 69.73415132924336%

entropy,50 depth
Validating Accuracy: 74.84662576687117%

entropy,100 depth
Validating Accuracy: 76.89161554192229%

entropy,200 depth
Validating Accuracy: 77.91411042944786%

entropy,500 depth
Validating Accuracy: 75.86912065439672%



In [14]:
""" 
Run DecisionTreeClassifier(highest validation accuracy) again to visualize

"""

# Refit the chosen configuration (entropy criterion, max_depth=200); the last
# column of each split is the label, everything before it is a feature.
DYTree = DecisionTreeClassifier(criterion='entropy', max_depth=200).fit(
    train_data[:, 0:-1],
    train_data[:, -1],
)
validate_predict = DYTree.predict(validate_data[:, 0:-1])

# Report the share of validation rows whose prediction equals the label.
print("entropy, 200 depth")
print(
    "Validating Accuracy: "
    + str(sum((validate_data[:, -1] - validate_predict) == 0) / len(validate_data[:, -1]) * 100)
    + "%\n"
)

# Export only the top levels of the tree (max_depth=2) as a Graphviz .dot file.
my_class_names = ["Fake", "Real"]
tree.export_graphviz(
    DYTree,
    max_depth=2,
    out_file="TreeVisualize.dot",
    feature_names=name_feature,
    class_names=my_class_names,
)



entropy, 200 depth
Validating Accuracy: 77.0961145194274%



In [16]:
def _binary_entropy(fake_count, real_count):
    """Return the entropy (in bits) of a fake/real count pair.

    A class with zero members contributes 0 (the 0*log2(0) = 0 convention),
    and an empty pair has entropy 0.0, so math.log2 is never called with 0.
    """
    total = fake_count + real_count
    entropy = 0.0
    for count in (fake_count, real_count):
        if count:
            share = count / total
            entropy -= share * math.log2(share)
    return entropy


def compute_information_gain(dataset, standard, name_features):
    """Print the information gain of splitting ``dataset`` on one feature.

    The split threshold is 0.5: rows whose feature value is <= 0.5 form the
    "True" (left) branch and all other rows form the "False" (right) branch.
    The last column of ``dataset`` is the label (0: fake, anything else: real).

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of feature columns followed by one label column.
    standard : str
        Feature name to split on; must be present in ``name_features``.
    name_features : list of str
        Feature names, where position i names column i of ``dataset``.

    Returns
    -------
    None
        Branch counts, the three entropies and the gain (rounded to three
        decimals) are printed.

    Raises
    ------
    ValueError
        If ``standard`` is not in ``name_features`` or ``dataset`` has no rows.
    """
    loc_feature = name_features.index(standard)
    data_from_loc = dataset[:, loc_feature]

    is_fake = dataset[:, -1] == 0
    in_true_branch = data_from_loc <= 0.5

    # Plain Python ints keep the printed numbers formatted exactly as before.
    true_fake_number = int(np.count_nonzero(in_true_branch & is_fake))
    true_real_number = int(np.count_nonzero(in_true_branch & ~is_fake))
    false_fake_number = int(np.count_nonzero(~in_true_branch & is_fake))
    false_real_number = int(np.count_nonzero(~in_true_branch & ~is_fake))
    true_number = true_fake_number + true_real_number
    false_number = false_fake_number + false_real_number

    root_real = true_real_number + false_real_number
    root_fake = true_fake_number + false_fake_number
    root_total = root_real + root_fake
    if root_total == 0:
        raise ValueError("dataset is empty; information gain is undefined")

    print("True number: {}, where real: {}, fake: {}".format(true_number, true_real_number, true_fake_number))
    print("False number: {}, where real: {}, fake: {}".format(false_number, false_real_number, false_fake_number))

    if(root_fake > root_real):
        print("Headline(Class) is Fake")
    else:
        print("Headline(Class) is Real")

    entropy_root = _binary_entropy(root_fake, root_real)
    print("Entropy root is {}".format(entropy_root))
    entropy_left_leaf = _binary_entropy(true_fake_number, true_real_number)
    print("Entropy left leaf is {}".format(entropy_left_leaf))
    entropy_right_leaf = _binary_entropy(false_fake_number, false_real_number)
    print("Entropy right leaf is {}".format(entropy_right_leaf))

    # An empty branch has weight 0, so it drops out of the weighted average.
    IG = entropy_root - ((true_number / root_total) * entropy_left_leaf
                         + (false_number / root_total) * entropy_right_leaf)

    print("Information Gain is {}\n".format(round(IG, 3)))
    
        
# Report the information gain of the tree's topmost split word and of a few
# alternative candidate words, all measured on the training split.
for heading, keyword in (
    ("Topmost split from the previous: the", "the"),
    ("Try other one: hillary", "hillary"),
    ("Try other one: trumps", "trumps"),
    ("Try other one: donald", "donald"),
):
    print(heading)
    compute_information_gain(train_data, keyword, name_feature)








Topmost split from the previous: the
True number: 1917, where real: 1260, fake: 657
False number: 369, where real: 114, fake: 255
Headline(Class) is Real
Entropy root is 0.9703331467646472
Entropy left leaf is 0.9274006288148684
Entropy right leaf is 0.8919496672696391
Information Gain is 0.049

Try other one: hillary
True number: 2156, where real: 1356, fake: 800
False number: 130, where real: 18, fake: 112
Headline(Class) is Real
Entropy root is 0.9703331467646472
Entropy left leaf is 0.9514806205295638
Entropy right leaf is 0.5801954953637369
Information Gain is 0.04

Try other one: trumps
True number: 2126, where real: 1216, fake: 910
False number: 160, where real: 158, fake: 2
Headline(Class) is Real
Entropy root is 0.9703331467646472
Entropy left leaf is 0.9850041763214559
Entropy right leaf is 0.09694460606247315
Information Gain is 0.047

Try other one: donald
True number: 1542, where real: 796, fake: 746
False number: 744, where real: 578, fake: 166
Headline(Class) is Real
Ent