#DATASET 2

In [183]:
# packages
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [184]:
# # import csv files to Google Colab
# from google.colab import files
# uploaded = files.upload()

In [185]:
# Read csv file and rawdata in dataframe type
def read_file(path):
    rawdata = pd.read_csv(path, header=0, delimiter='\t')
    return rawdata

In [186]:
# Preprocess data before training
from keras.preprocessing.text import text_to_word_sequence
def preprocess_data(df):
    reviews = []
    for raw in tqdm(df['Phrase']):
        # print(raw)
        text = raw.replace("'","")
        tokenized_train_data = text_to_word_sequence(text,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',split=" ")
        sw = set(stopwords.words('english'))
        removesw = [i for i in tokenized_train_data if not i in sw]
        rswtext = ' '.join(removesw)
        numberRemove = ''.join(num for num in rswtext if not num.isdigit())
        stemmer = PorterStemmer()
        stem_input = nltk.word_tokenize(numberRemove)
        stem_text = ' '.join([stemmer.stem(word) for word in stem_input])
        reviews.append(stem_text)
    return reviews

In [187]:
# Tokenizer preprocess before training
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)
    len(list(unique_words)), len_max

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = sequence.pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = sequence.pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val

In [188]:
# Show the accuracy and other matrices of accuracy
def report(predictions, y_test):
    print('Accuracy: %s' % accuracy_score(y_test, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

In [189]:
# functions used to construct the decision tree classifier
def gini(sequence, weights=None):
    if weights is None: # count 1 - sum of square of probabilities
        _, counts = np.unique(sequence, return_counts=True)
        p = (counts / len(sequence)) ** 2  
        return 1.0 - np.sum(p)
    else:
        tot = 0
        weights = weights / weights.sum()
        for c in np.unique(sequence):
            # change prob become weighted prob
            tot = np.sum(weights[sequence == c]) ** 2
        return 1 - tot


def entropy(sequence, weights=None):
    if weights is None:
        _, counts = np.unique(sequence, return_counts=True)
        p = counts / len(sequence)  # probability
        return -np.sum(p * np.log2(p))
    else:
        entropy = 0
        weights = weights / weights.sum()
        for c in np.unique(sequence):
            # the weighted probability
            tmp = np.sum(weights[sequence == c])
            entropy -= tmp * np.log2(tmp)
        return entropy

In [190]:
# Node
class Node():
    def __init__(self, c_value, prediction):
        self.c_value = c_value
        self.prediction = prediction
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None

In [191]:
# Decision tree built from scratch using gini and entropy
class DecisionTree():
    def __init__(self, criterion='gini', max_depth=None, max_features=None):
        self.criterion = globals()[criterion]

        self.max_depth = max_depth if max_depth is not None else 2 ** 100
        self.max_features = max_features
        self.n_features = None

    def fit(self, x_data, y_data, sample_weight=None):
        self.n_features = x_data.shape[1]
        self.root = self.get_node(x_data, y_data, depth=0, sample_weight=sample_weight)

    def get_node(self, x, y, depth, sample_weight=None):
        weighted_counts = np.bincount(y, weights=sample_weight)
        prediction = np.argmax(weighted_counts)
            
        node = Node(c_value=self.criterion(y, sample_weight), prediction=prediction)
        if depth >= self.max_depth:
            return node

        node.feature_idx, node.threshold = self.best_split(x, y, sample_weight)
        if node.feature_idx is None:
            return node

        left_idx = x[:, node.feature_idx] < node.threshold
        x_left, y_left = x[left_idx], y[left_idx]
        x_right, y_right = x[~left_idx], y[~left_idx]

        # get child nodes recursively
        if sample_weight is not None:
            node.left = self.get_node(x_left, y_left, depth=depth + 1, sample_weight=sample_weight[left_idx])
            node.right = self.get_node(x_right, y_right, depth=depth + 1, sample_weight=sample_weight[~left_idx])
        else:
            node.left = self.get_node(x_left, y_left, depth=depth + 1)
            node.right = self.get_node(x_right, y_right, depth=depth + 1)

        return node

    def best_split(self, x, y, sample_weight):
        if len(y) <= 1:  
            return None, None

        parent_c = self.criterion(y, sample_weight)
        best_infog = -2 ** 64  
        best_idx, best_th = None, None

        if self.max_features is not None:
            available_features = np.random.choice(np.arange(self.n_features), size=self.max_features, replace=False)
        else:
            available_features = np.arange(self.n_features)

        for idx in available_features:
            sort_idx = np.argsort(x[:, idx])
            thresholds = x[sort_idx, idx]
            labels = y[sort_idx]

            for pos in range(1, len(y)):
                if thresholds[pos] == thresholds[pos - 1]:
                    continue

                if sample_weight is not None:
                    sorted_sample_weight = sample_weight[sort_idx]
                    left_c = self.criterion(
                        labels[:pos], sorted_sample_weight[:pos])
                    right_c = self.criterion(
                        labels[pos:], sorted_sample_weight[pos:])
                else:
                    left_c = self.criterion(labels[:pos])
                    right_c = self.criterion(labels[pos:])

                child_c = (pos * left_c + (len(y) - pos) * right_c) / len(y)
                infog = parent_c - child_c

                if infog > best_infog:
                    best_infog = infog
                    best_idx = idx
                    best_th = (thresholds[pos] + thresholds[pos - 1]) / 2

        return best_idx, best_th

    def predict(self, x_data):
        def util(self, x):
            cur_node = self.root
            while cur_node.left and cur_node.right:
                if x[cur_node.feature_idx] < cur_node.threshold:
                    cur_node = cur_node.left
                else:
                    cur_node = cur_node.right
                    
            return cur_node.prediction
        return np.stack([util(self, single_x) for single_x in x_data])

MAIN CODE

In [192]:
# Read the csv files inserted
df = read_file('./X_train.csv')
df2 = read_file('./y_train.csv')
df3 = read_file('./X_test.csv')

---> TRAIN

In [193]:
# preprocess training and testing data
train_text = preprocess_data(df)
test_text = preprocess_data(df3)
target = df2.Sentiment.values

100%|██████████| 124848/124848 [01:00<00:00, 2070.20it/s]
100%|██████████| 31211/31211 [00:14<00:00, 2113.23it/s]


In [194]:
# split validation from training dataset
X_train, X_val, y_train, y_val = train_test_split(train_text, target, test_size=0.2, stratify=target)

In [195]:
# save preprocessed training data to a new csv file
final_traindf = pd.DataFrame(train_text) 
final_traindf.to_csv('X_train_final.csv') 

In [196]:
# save preprocessed testing data to a new csv file
final_traindf = pd.DataFrame(test_text) 
final_traindf.to_csv('X_test_final.csv') 

In [197]:
# tokenize the training and validation data
X_train_, X_val_ = tokenizer_preprocess(X_train, X_val) 
# X_train_, X_val_

100%|██████████| 99878/99878 [00:00<00:00, 811100.28it/s]


In [198]:
# init decision tree classifier
clf = DecisionTree(criterion='entropy')
clf.fit(X_train_, y_train)

In [199]:
# predict validation set
y_pred = clf.predict(X_val_)
report(y_pred, y_val)

Accuracy: 0.5243091710052062
Confusion Matrix:
[[   36    79   969    36    12]
 [   42   223  3935   128    36]
 [   11   129 12284   239    70]
 [    6    88  4627   444   103]
 [    7    32  1148   181   105]]
Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.03      0.06      1132
           1       0.40      0.05      0.09      4364
           2       0.53      0.96      0.69     12733
           3       0.43      0.08      0.14      5268
           4       0.32      0.07      0.12      1473

    accuracy                           0.52     24970
   macro avg       0.41      0.24      0.22     24970
weighted avg       0.47      0.52      0.41     24970



---> TEST

In [200]:
# tokenize before testing
X_train_, X_test_ = tokenizer_preprocess(X_train, test_text)

100%|██████████| 99878/99878 [00:00<00:00, 506128.11it/s]


In [201]:
# predict testing dataset
ytest_pred = clf.predict(X_test_)

In [202]:
# write numpy array to csv
np.savetxt("y_test_final.csv", ytest_pred, delimiter=",")

In [203]:
# Reference of sklearn package DecisionTreeClassifier
# model = DecisionTreeClassifier()
# model.fit(X_train_, y_train)
# predictions = model.predict(X_val_)
# report(predictions, y_val)

In [204]:
# finpredictions = model.predict(X_test_)