## Imports

In [None]:
import math
import sys
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
import graphviz
from IPython.display import Image
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

np.set_printoptions(threshold=sys.maxsize)

from sklearn.model_selection import train_test_split

In [None]:
# Shared bag-of-words vectorizer. It lives at module level because load_data()
# fits it on the headlines and the driver cell later reads the learned
# vocabulary back from it to label the exported tree.
vectorizer = CountVectorizer()

In [None]:
def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with whitespace stripped."""
    # The context manager closes the handle even if reading raises.
    with open(path) as handle:
        return [line.strip() for line in handle]


def load_data(real_path='clean_real.txt', fake_path='clean_fake.txt',
              random_state=None):
    """Load the headlines, vectorize them and split into train/test/val sets.

    Each line of the two input files is treated as one document. Real
    headlines are labelled ``True`` and fake ones ``False``. The module-level
    ``vectorizer`` is fitted on the combined corpus as a side effect, so its
    vocabulary can be read afterwards.

    Args:
        real_path: Path of the file holding one real headline per line.
        fake_path: Path of the file holding one fake headline per line.
        random_state: Forwarded to both ``train_test_split`` calls. The
            default ``None`` keeps the split random on every call.

    Returns:
        A dict with the keys ``'X_train'``, ``'X_test'``, ``'X_val'``,
        ``'y_train'``, ``'y_test'`` and ``'y_val'``. 70% of the samples go to
        the training set; the remaining 30% is halved into test and
        validation sets.
    """
    real = _read_stripped_lines(real_path)
    fake = _read_stripped_lines(fake_path)

    all_news = real + fake
    y = [True] * len(real) + [False] * len(fake)

    # No `global` statement is needed: the shared vectorizer is only mutated
    # through a method call, never rebound.
    # NOTE(review): the vocabulary is fitted on *all* headlines before the
    # split, so words that only occur in validation/test rows still become
    # features. Kept as in the original; confirm this is intended.
    X = vectorizer.fit_transform(all_news).toarray()

    # First carve off 30% of the data ...
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.3, random_state=random_state)

    # ... then split that hold-out evenly into test and validation sets.
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=random_state)

    return {
        'X_train': X_train,
        'X_test': X_test,
        'X_val': X_val,
        'y_train': y_train,
        'y_test': y_test,
        'y_val': y_val
    }

In [None]:
def train_tree(X_train, y_train, max_depth, criterion):
    """Build a decision tree with the given settings and fit it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, one per row of ``X_train``.
        max_depth: Depth limit passed to ``DecisionTreeClassifier``.
        criterion: Split-quality criterion passed to
            ``DecisionTreeClassifier``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    settings = {'criterion': criterion, 'max_depth': max_depth}
    model = DecisionTreeClassifier(**settings)
    model.fit(X_train, y_train)
    return model

# Thin wrapper around the classifier's own predict() method
def prediction(X_test, clf_object):
    """Return whatever ``clf_object.predict`` yields for ``X_test``."""
    return clf_object.predict(X_test)

def validate_tree(clf, X_test, y_test):
    """Score ``clf`` on the given samples, print the accuracy and return it.

    Args:
        clf: Fitted classifier, handed to ``prediction``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy as a percentage (``accuracy_score`` times 100).
    """
    predicted = prediction(X_test, clf)
    accuracy_percent = 100 * accuracy_score(y_test, predicted)
    print("Accuracy : ", accuracy_percent)
    return accuracy_percent

In [None]:
def select_tree_model(leaning_data):
    """Train several decision trees and return the best one on validation data.

    For each criterion in ``'gini'`` and ``'entropy'`` and for ``i`` in 1..4,
    a tree is trained with ``max_depth = int(1.5 * i * d)``, where ``d`` is
    ``int(log2(number of training rows))``. Every tree is scored on the
    validation split and the first tree reaching the highest accuracy wins.

    Args:
        leaning_data: Dict with the keys ``'X_train'``, ``'y_train'``,
            ``'X_val'`` and ``'y_val'``. The misspelled parameter name is
            kept so that existing keyword callers keep working.

    Returns:
        A ``(max_accuracy, best_clf)`` tuple: the best validation accuracy in
        percent and the classifier that achieved it.
    """
    # The original body read a global named `learning_data` and ignored its
    # own argument, so it only worked when that global happened to exist.
    # Use the argument that was actually passed in.
    data = leaning_data
    X_train, y_train = data['X_train'], data['y_train']
    X_val, y_val = data['X_val'], data['y_val']

    # Base unit for the candidate depths: floor(log2(#training rows)).
    depth_unit = int(math.log2(len(X_train)))

    max_accuracy = 0
    best_clf = None

    for criterion in ('gini', 'entropy'):
        for i in range(1, 5):
            # Clamp to 1 so a tiny training set cannot request an invalid
            # depth of 0.
            max_depth = max(1, int(1.5 * i * depth_unit))
            clf = train_tree(X_train, y_train, max_depth, criterion)
            print("Results for ", criterion, " index, max depth of ", max_depth, " :")
            accuracy = validate_tree(clf, X_val, y_val)
            # `best_clf is None` makes sure a model is returned even when
            # every candidate scores 0% on the validation set.
            if best_clf is None or accuracy > max_accuracy:
                max_accuracy = accuracy
                best_clf = clf
    return max_accuracy, best_clf


In [None]:
# Load and split the corpus; this also fits the module-level `vectorizer`.
learning_data = load_data()

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(). Prefer the new method and fall back to
# the old one on releases that predate it. Convert to a plain list so
# export_graphviz gets a list of str on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Pick the tree that does best on the validation split.
accuracy, clf = select_tree_model(learning_data)

print("Max accuracy on validation sets: {}".format(accuracy))
# The score below is computed on the held-out test split, so the message now
# says "test set" (it previously said "training set").
print("-----\n Best tree gets this accuracy on the test set:\n")
validate_tree(clf, learning_data['X_test'], learning_data['y_test'])

# Labels are False (fake) / True (real); class_names follows that ascending
# order. Render the tree to files named after 'my-tree'.
dot_data = tree.export_graphviz(clf, feature_names=feature_names, filled=True,
                                class_names=['Fake', 'Real'])
graph = graphviz.Source(dot_data)
graph.render('my-tree')