In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

In [107]:
# Read in data from CSV files
train = pd.read_csv("Dataset/train.csv")
test = pd.read_csv("Dataset/test.csv")

In [4]:
# Data preprocessing

# Drop subject/date and concatenating text and title
train["text"] = train["title"] + " " + train["text"]
test["text"] = test["title"] + " " + test["text"]

train = train.drop(["subject", "date", "title"], axis = 1)
test = test.drop(["subject", "date", "title"], axis = 1)

def clean_text_data(data_point):
    review_soup = BeautifulSoup(data_point)
    review_text = review_soup.get_text()
    review_letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    review_lower_case = review_letters_only.lower()  
    review_words = review_lower_case.split() 
    stop_words = stopwords.words("english")
    meaningful_words = [x for x in review_words if x not in stop_words]
        
    return(" ".join(meaningful_words)) 

train["text"] = train["text"].apply(clean_text_data)
test["text"] = test["text"].apply(clean_text_data)



In [106]:
# View cleaned train dataset
train

Unnamed: 0,text,label
0,clinton faces pressure pick vp tough trade wal...,real
1,ryan trump cite positive step toward republica...,real
2,watch president obama dares republicans suppor...,fake
3,hariri warns lebanon faces arab sanctions risk...,real
4,poem twas night cnn christmas acr boiler room ...,fake
...,...,...
39893,lol photo accompanying google search pathologi...,fake
39894,trump wants children would break law donald tr...,fake
39895,gay activists march serb capital behind police...,real
39896,boiler room smoking gunz tune alternate curren...,fake


In [6]:
# View cleaned test dataset
test

Unnamed: 0,text,label
0,factbox taxes budget u congress calendar tight...,real
1,breaking israel worst fears confirmed says isr...,fake
2,u drug enforcement chief step agency reuters u...,real
3,factbox trump twitter oct rex tillerson puerto...,real
4,fcc chief plans ditch u net neutrality rules w...,real
...,...,...
4995,republicans told stop talking healthcare repea...,fake
4996,texas bill restricting insurance coverage abor...,real
4997,montana dems hilariously troll reporter slammi...,fake
4998,trump says gave classified info russia humanit...,fake


In [7]:
# Split data into Pandas Series for X and Y
X_train = pd.DataFrame(train, columns = ['text']).squeeze()
X_test = pd.DataFrame(test, columns = ['text']).squeeze()
Y_train = pd.DataFrame(train, columns = ['label']).squeeze()
Y_test = pd.DataFrame(test, columns = ['label']).squeeze()

In [8]:
# Convert the text into bag-of-words features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [185]:
class DecisionTree:
    # Constructor, initialize max depth
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
    
    # Fit tree, calls _build tree helper function
    def fit(self, X, Y):
        self.tree = self._build_tree(X, Y)
    
    # Make predictions on input data using the trained decision tree model
    def predict(self, X):
        # Check if X is csr_matrix
        if(isinstance(X, scipy.sparse._csr.csr_matrix)):
            predictions = []
            num_rows, _ = X.shape
            for idx in range(num_rows):
                prediction = self._traverse_tree(X.getrow(idx).toarray()[0], self.tree)
                predictions.append(prediction)
        else:
            # Slight modification for testing purposes
            predictions = []
            num_rows, _ = X.shape
            for idx in range(num_rows):
                prediction = self._traverse_tree(X.iloc[idx], self.tree)
                predictions.append(prediction)
        return predictions

    # Build tree by recursively splitting input data on best feature given information gain criterion
    def _build_tree(self, X, Y, depth=0):
        # Initialize variables
        num_samples, num_features = X.shape
        num_classes = len(set(Y))
        best_gain = -1
        best_split_feature = None
        best_split_val = None
        best_left_X= None
        best_right_X = None
        best_left_Y = None
        best_right_Y = None
        
        # If maximum depth has been reached or all samples belong to the same class, return a leaf node
        if depth == self.max_depth or num_classes == 1:
            return Leaf(Y)
        
        for feature_idx in range(num_features): 
            # Check if X is csr_matrix
            if(isinstance(X, scipy.sparse._csr.csr_matrix)):
                feature_vals = X.getcol(feature_idx).toarray().transpose()[0]
            else:
                feature_vals = X.iloc[:, feature_idx]
    
            possible_vals = np.unique(feature_vals)
            
            for split_val in possible_vals:
                # Calculate index values
                left_idx = feature_vals <= split_val
                right_idx = feature_vals > split_val
    
                # If either the left or right split is empty, skip this split
                if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
                    continue
                
                if(isinstance(X, scipy.sparse._csr.csr_matrix)):
                    # Calculate left and right 
                    left_X = X[left_idx, :]
                    right_X = X[right_idx, :]
                    left_Y = Y[left_idx]
                    right_Y = Y[right_idx]
                else:
                    # Calculate left and right 
                    left_X = X.loc[left_idx, :]
                    right_X = X.loc[right_idx, :]
                    left_Y = Y.loc[left_idx]
                    right_Y = Y.loc[right_idx]
                
                # Calcuate gain using information gain helper function
                gain = self._information_gain(Y, left_Y, right_Y)
                
                # If gain is greater than best_gain, update 
                if gain > best_gain:
                    best_gain = gain
                    best_split_feature = feature_idx
                    best_split_val = split_val
                    best_left_X = left_X
                    best_right_X = right_X
                    best_left_Y = left_Y
                    best_right_Y = right_Y
                    
        # If best_gain is zero, return a leaf
        if best_gain == 0:
            return Leaf(Y)
        
        # Add by 0.5 to mimmic sklearn library
        best_split_val = best_split_val + 0.5
        
        # If best gain > zero, build left and right tree and return node with new best_split_feature
        left_tree = self._build_tree(best_left_X, best_left_Y, depth+1)
        right_tree = self._build_tree(best_right_X, best_right_Y, depth+1)
        return Node(best_split_feature, best_split_val, left_tree, right_tree)

    # Traverse the decision tree to predict the class of a test sample
    def _traverse_tree(self, x, node):
        # If node is a leaf, return the predicted class
        if isinstance(node, Leaf):
            return node.predicted_class
        
        # Check against split_val, traverse left or right tree accordingly
        if x[node.split_feature] < node.split_val:
            return self._traverse_tree(x, node.left_tree)
        else:
            return self._traverse_tree(x, node.right_tree)

    def _information_gain(self, Y, left_Y, right_Y):
        # Initialize variables
        num_samples = len(Y)
        num_left = len(left_Y)
        num_right = len(right_Y)
        
        # Calculate entropy before split
        entropy_before_split = self._entropy(Y)
        
        # Calculate entropy after split
        entropy_after_split = ((num_left / num_samples) * self._entropy(left_Y)
                               + (num_right / num_samples) * self._entropy(right_Y))
        
        # Calculate information gain
        return entropy_before_split - entropy_after_split

    def _entropy(self, Y):
        # Calculate the entropy of a set of samples
        num_samples = len(Y)
        
        # If number of samples is zero, return zero
        if num_samples == 0:
            return 0
        
        _, counts = np.unique(Y, return_counts=True)
        class_probs = counts / num_samples
        
        # Return calculated entropy
        return -np.sum(class_probs * np.log2(class_probs))
    
    def print_tree(self, tree=None, indent=" "):
        # Added print_tree function for debugging purposes
        ''' function to print the tree '''
        
        if not tree:
            tree = self.tree
            
        if(isinstance(tree, Leaf)):
            print(tree.predicted_class)

        else:
            print("X_"+str(tree.split_feature), "<=", tree.split_val, "?")
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left_tree, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right_tree, indent + indent)


# Node class definition
class Node:
    def __init__(self, split_feature, split_val, left_tree, right_tree):
        self.split_feature = split_feature
        self.split_val = split_val
        self.left_tree = left_tree
        self.right_tree = right_tree

# Leaf class definition
class Leaf:
    def __init__(self, y):
        self.predicted_class = Counter(y).most_common(1)[0][0]

In [187]:
# Split data into Pandas Series for X and Y, reduce dataset for model testing
rows_to_drop = train.sample(38898).index
reduced_data = train.drop(rows_to_drop)

# Split data into Pandas Series for X and Y, reduce dataset for model testing
X_train_t, X_val_t, Y_train_t, Y_val_t = train_test_split(reduced_data["text"], reduced_data["label"], test_size=0.2, random_state=42)

In [188]:
# Convert the text into bag-of-words features
vectorizer = CountVectorizer()
X_train_reduced_t = vectorizer.fit_transform(X_train_t)
X_val_reduced_t = vectorizer.transform(X_val_t)

In [189]:
# Create model with a max_depth of 50 trees, train on reduced datasets at first for testing purposes
dt_t = DecisionTree(max_depth=50)
dt_t.fit(X_train_reduced_t, Y_train_t)

In [191]:
# Use the decision tree to predict the classes of the test data
y_pred_t = dt_t.predict(X_val_reduced_t)

In [192]:
# Accuracy Score
accuracy_score(Y_val_t, y_pred_t)

0.985

In [186]:
# NOTE ABOUT SCORES:
# Model was working perfectly before but when running last time... I had made some changes before pushing to git
# and must have changed one small thing that broke it, unforunately, the model took so long to run, I ran out of
# time to see that errors had occured while waiting on accuracy, recall, and f1 score
# Will attempt to resolve ahead of presentation and resubmit if allowed

# UPDATE - 5/8/2023
# Found bug, was a scoping error and the left and right X/Y were being passed incorrectly due to a 
# scoping issue when I refactored code, I deleted the initialization of varaibles, works properly now

In [194]:
# Running on full dataset

In [195]:
# Convert the text into bag-of-words features - FINAL
X_train_final = vectorizer.fit_transform(X_train)
X_val_final = vectorizer.transform(X_val)

In [199]:
# Get Y values - FINAL
Y_train_final = pd.DataFrame(train, columns = ['label']).squeeze()
Y_val_final = pd.DataFrame(test, columns = ['label']).squeeze()

In [None]:
# Create model with a max_depth of 50 trees, train on reduced datasets at first for testing purposes
dt_final = DecisionTree(max_depth=50)
dt_final.fit(X_train_final, Y_train_final)

In [None]:
# Use the decision tree to predict the classes on training set - Final
Y_pred_train_final = dt_final.predict(X_train_final)

In [None]:
# Use the decision tree to predict the classes on validation set - Final
Y_pred_final = dt_final.predict(X_val_final)

In [None]:
# Calculate the accuracy of the model on training set - Final
accuracy_train = accuracy_score(Y_train_final, Y_pred_train_final)
print('Training Accuracy:', accuracy_train)

In [None]:
# Calculate the accuracy of the model on validation set - Final
accuracy_val = accuracy_score(Y_val_final, Y_pred_final)
print('Validation Accuracy:', accuracy_val)

In [None]:
# Calculate the F1 score of the model on training set - Final
f1_train = f1_score(Y_train_final, Y_pred_train_final, average='binary', pos_label='real'))
print('Training F1:', f1_train)

In [None]:
# Calculate the F1 score of the model on validation set - Final
f1_val = f1_score(Y_val_final, Y_pred_final, average='binary', pos_label='real'))
print('Val F1:', f1_val)

In [None]:
# Calculate the precision score of the model on training set - Final
precision_train = f1_score(Y_train_final, Y_pred_train_final, average='binary', pos_label='real'))
print('Precision Train:', precision_train)

In [None]:
# Calculate the precision score of the model on validation set - Final
precision_val = f1_score(Y_val_final, Y_pred_final, average='binary', pos_label='real'))
print('Precision Val:', precision_val)

In [147]:
# TESTS ON IRIS DATASET

In [148]:
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
data = pd.read_csv(csv_url, header = None, names=col_names)

In [149]:
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

X_train_iris, X_test_iris, Y_train_iris, Y_test_iris = train_test_split(X, Y, test_size=.2, random_state=41)

In [150]:
dt = DecisionTree()
dt.fit(X_train_iris, Y_train_iris)

Feature Idx: 2 - Gain: 0.9264046681474138
Feature Idx: 3 - Gain: 0.7694993941591152
Feature Idx: 2 - Gain: 0.17556502585750278
Feature Idx: 2 - Gain: 0.1228956258058704
Feature Idx: 1 - Gain: 0.46691718668869925
Feature Idx: 0 - Gain: 0.2516291673878229
Feature Idx: 0 - Gain: 1.0


In [151]:
dt.print_tree()

X_2 <= 1.9 ?
 left:Iris-setosa
 right:X_3 <= 1.5 ?
  left:X_2 <= 4.9 ?
    left:Iris-versicolor
    right:Iris-virginica
  right:X_2 <= 5.0 ?
    left:X_1 <= 2.8 ?
        left:Iris-virginica
        right:X_0 <= 5.9 ?
                left:Iris-versicolor
                right:X_0 <= 6.0 ?
                                left:Iris-virginica
                                right:Iris-versicolor
    right:Iris-virginica


In [152]:
Y_pred_iris = dt.predict(X_test_iris)

In [153]:
accuracy_score(Y_test_iris, Y_pred_iris)

0.9333333333333333