Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy, matplotlib.pyplot as plt, scipy
from scipy.stats import entropy
from sklearn import tree

2 a) Data Loader

In [None]:
def load_data(clean_real_text,clean_fake_text):
    """Read both text files, split them, and vectorize with word counts.

    Every line of ``clean_real_text`` is labelled 1 and every line of
    ``clean_fake_text`` is labelled 0 (lines are stripped of surrounding
    whitespace). The combined data is split into test (15%), and the
    remainder is split again so that validation is 17.65% of what is left
    (about 15% of the whole), leaving roughly 70% for training.

    The ``CountVectorizer`` vocabulary is fitted on the training split only.

    Returns:
        A list ``[vector_training, Y_train, vector_validation, Y_validation,
        vector_test, Y_test, feature_names]``. The training matrix is
        converted with ``.toarray()``; the validation and test matrices are
        left exactly as ``CountVectorizer.transform`` returns them.
        ``feature_names`` is a plain Python list.
    """
    vectorizer = CountVectorizer()
    documents = []
    targets = []

    # Real lines first (label 1), then fake lines (label 0).
    for path, target in ((clean_real_text, 1), (clean_fake_text, 0)):
        with open(path) as handle:
            stripped = [raw.strip() for raw in handle.readlines()]
        documents.extend(stripped)
        targets.extend([target] * len(stripped))

    # First carve off the test set, then split the rest into train/validation.
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        documents, targets, test_size=0.15, train_size=0.85
    )
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X_rest, Y_rest, test_size=0.1765, train_size=0.8235
    )

    # Fit the vocabulary on training text only; reuse it for the other splits.
    vector_training = vectorizer.fit_transform(X_train).toarray()
    vector_validation = vectorizer.transform(X_validation)
    vector_test = vectorizer.transform(X_test)
    feature_names = vectorizer.get_feature_names_out().tolist()

    return [
        vector_training,
        Y_train,
        vector_validation,
        Y_validation,
        vector_test,
        Y_test,
        feature_names,
    ]

Optimal Decision Tree Classifier

In [None]:
def select_model(data, max_depths, split_criteria):
    """Train a decision tree per (criterion, depth) pair and return the best.

    Each tree is fitted on ``data[0]`` / ``data[1]`` (training features and
    labels) and scored by accuracy on ``data[2]`` / ``data[3]`` (validation
    features and labels). One validation-accuracy-vs-depth plot is shown per
    split criterion, and the full list of accuracies is printed.

    Args:
        data: Sequence whose first four items are training features,
            training labels, validation features and validation labels.
        max_depths: Iterable of ``max_depth`` values to try.
        split_criteria: Iterable of ``criterion`` values to try.

    Returns:
        ``[depth, criterion, validation_accuracy, fitted_model]`` for the
        tree with the highest validation accuracy (first one wins on ties).

    Raises:
        ValueError: If ``max_depths`` or ``split_criteria`` is empty.
    """
    # Human-readable names used in the plot axis labels.
    criterion_labels = {
        "entropy": "Information Gain",
        "gini": "Gini",
        "log_loss": "Log_Loss",
    }

    # Materialize so the inputs can be iterated several times and measured.
    max_depths = list(max_depths)
    split_criteria = list(split_criteria)
    if not max_depths or not split_criteria:
        raise ValueError("max_depths and split_criteria must both be non-empty")

    X_train, Y_train, X_validation, Y_validation = data[0], data[1], data[2], data[3]

    model_characteristics = []  # one [depth, criterion, accuracy, model] per tree
    for criteria in split_criteria:
        for depth in max_depths:
            model = DecisionTreeClassifier(criterion=criteria, max_depth=depth)
            model.fit(X_train, Y_train)

            predictions = model.predict(X_validation)
            model_accuracy = accuracy_score(Y_validation, predictions)

            model_characteristics.append([depth, criteria, model_accuracy, model])

    max_depth_accuracies = [entry[2] for entry in model_characteristics]

    # max() returns the first maximal entry, matching list.index(max(...)).
    selected_model_characteristics = max(
        model_characteristics, key=lambda entry: entry[2]
    )

    print("Accuracies for each model =", max_depth_accuracies)

    # Accuracies are stored criterion-major, so each criterion owns one
    # contiguous run of len(max_depths) values.
    depths_per_criterion = len(max_depths)
    for position, criteria in enumerate(split_criteria):
        start = position * depths_per_criterion
        label = criterion_labels.get(criteria, criteria)
        plt.plot(max_depths, max_depth_accuracies[start:start + depths_per_criterion])
        plt.xlabel('Max Depth')
        plt.ylabel(f'Validation Accuracy ({label} Criteria)')
        plt.show()

    return selected_model_characteristics

Decision Tree Visualiser

In [None]:
from sklearn import tree
import graphviz

def visualizer(model, feature_names):
    """Build a graphviz rendering of a fitted decision tree.

    The tree is exported to DOT text with ``tree.export_graphviz`` (node
    splits labelled using ``feature_names``) and wrapped in a
    ``graphviz.Source`` object, which is returned.
    """
    return graphviz.Source(
        tree.export_graphviz(model, feature_names=feature_names)
    )

# Example usage. The original cell called visualizer(Decision_Tree), but
# Decision_Tree is never defined and the required feature_names argument was
# missing, so it could not run. main() makes the real call with a fitted model:
# visualizer(model, feature_names)

Information Gain Function

In [None]:
def compute_information_gain(dataset, keyword_xi, feature_names):
    """Return the information gain (in bits) of splitting on a keyword.

    The split is "keyword present in the line" (count != 0) versus "keyword
    absent" (count == 0), evaluated on the training portion of ``dataset``:

        IG = H(Y) - P(present) * H(Y | present) - P(absent) * H(Y | absent)

    Args:
        dataset: Sequence whose item 0 is the dense training count matrix
            (rows = lines, columns = vocabulary) and item 1 the training
            labels, where label 1 marks the "real" class.
        keyword_xi: The word to split on; must appear in ``feature_names``.
        feature_names: List of vocabulary words, in column order.

    Returns:
        The information gain as a float; ``0.0`` when the keyword is present
        in every line or in none (the split separates nothing).

    Raises:
        ValueError: If ``keyword_xi`` is not in ``feature_names``.
        ZeroDivisionError: If the training set is empty.
    """
    counts = numpy.asarray(dataset[0])
    labels = numpy.asarray(dataset[1])
    is_real = labels == 1

    # Entropy of the label distribution over the whole training set.
    probability_real = numpy.count_nonzero(is_real) / len(labels)
    h = entropy([probability_real, 1 - probability_real], base=2)

    # Column of the keyword in the count matrix, then presence per line.
    keyword_index = feature_names.index(keyword_xi)
    is_present = counts[:, keyword_index] != 0
    is_absent = ~is_present

    n_present = numpy.count_nonzero(is_present)
    n_absent = len(is_present) - n_present
    if n_present == 0 or n_absent == 0:
        # A one-sided split carries no information (and would divide by zero).
        return 0.0

    probability_present = n_present / len(is_present)
    probability_absent = n_absent / len(is_present)

    prob_real_given_present = numpy.count_nonzero(is_real & is_present) / n_present
    prob_real_given_absent = numpy.count_nonzero(is_real & is_absent) / n_absent

    # Each conditional entropy must use its own complement; mixing them would
    # be silently renormalized by scipy's entropy() and give a wrong value.
    h_real_given_present = entropy(
        [prob_real_given_present, 1 - prob_real_given_present], base=2
    )
    h_real_given_absent = entropy(
        [prob_real_given_absent, 1 - prob_real_given_absent], base=2
    )

    info_gain = (
        h
        - probability_present * h_real_given_present
        - probability_absent * h_real_given_absent
    )
    return info_gain


Main Function

In [None]:
def main():
    """Run the full pipeline: load data, pick a tree, report keyword gains.

    Loads and vectorizes 'clean_real.txt' / 'clean_fake.txt', selects the
    best decision tree over a fixed grid of depths and split criteria,
    builds its graphviz diagram, then prints the information gain of a few
    fixed keywords on the training data.
    """
    depth_grid = [1, 5, 20, 50, 100]
    criteria_grid = ["entropy", "gini", "log_loss"]
    words = ["the", "hillary", "donald"]

    data = load_data('clean_real.txt','clean_fake.txt')
    feature_names = data[6]

    # select_model returns [depth, criterion, accuracy, model]; keep the model.
    best_tree = select_model(data, depth_grid, criteria_grid)[3]

    # The diagram is built here but not used further in this function.
    diagram = visualizer(best_tree, feature_names)

    information_gained = [
        compute_information_gain(data, word, feature_names) for word in words
    ]

    print("Information Gained = ", information_gained)

In [None]:
# Run the pipeline only when executed directly; `==` (comparison) is required
# here, since `=` is an assignment and makes this line a SyntaxError.
if __name__ == "__main__":
    main()