In [1]:
import re

import json
from typing import List, Dict, Tuple, Set, Any, Callable
from functools import partial

import numpy as np
import numpy.typing as npt


# Define Constants


In [2]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"
OUTPUT_PATH = "./output"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"
OUTPUT_FILENAME = "output.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

TYPE_VANILLA_PERPCETRON = "vanilla_perceptron"
TYPE_AVERAGED_PERCEPTRON = "averaged_perceptron"

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

OUTPUT_FILE_PATH = f"{OUTPUT_PATH}/{OUTPUT_FILENAME}"

DEV_DATA_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_KEY_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_ID_COL = 0
TRAIN_DATA_COL = 3
DEV_DATA_COL = 1
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1
TEST_SIZE = 0.2


In [3]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [4]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [5]:
input_regex = re.compile("(\w*) (\w*) (\w*) (.*)\n?")


In [6]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [7]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [8]:
np.array(data)[0, 1]


'Fake'

In [9]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = "(\w*) (\w*) (\w*) (.*)\n?"
    if type == "DEV":
        regex = "(\w*) (.*)\n?"
    elif type == "KEY":
        regex = "(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [10]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [11]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [12]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)

    return (model_data["tf_idf_model"], model_data["sentiment_classifier"], model_data["truthfulness_classifier"])


In [13]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [14]:
def calculate_accuracy_score(y_true: npt.NDArray, y_pred: npt.NDArray):
    return (y_true == y_pred).sum() / y_true.shape[0]


In [15]:
def calculate_f1_score(y_true: npt.NDArray, y_pred: npt.NDArray, average: str = "macro"):
    def calculate_f1(y_true, y_pred, label):
        tp = np.sum((y_true == label) & (y_pred == label))
        fp = np.sum((y_true != label) & (y_pred == label))
        fn = np.sum((y_pred != label) & (y_true == label))

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    def macro_f1(y_true, y_pred):
        return np.mean([calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)])

    def micro_f1(y_true, y_pred):
        return {label: calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)}

    if average == "macro":
        return macro_f1(y_true, y_pred)
    elif average == "micro":
        return micro_f1(y_true, y_pred)
    else:
        return {"micro": micro_f1(y_true, y_pred), "macro": macro_f1(y_true, y_pred)}


In [16]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("---------------------------------------------------------")


# Load Data


In [17]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")

dev_raw_data = load_data(DEV_DATA_FILE_PATH, type="DEV")
dev_key_data = load_data(DEV_KEY_FILE_PATH, type="KEY")


# Data Cleaning


In [18]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [19]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [20]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [21]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [22]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [23]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [24]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [25]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [26]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    from contractions import fix

    def contraction_fixer(txt: str):
        return " ".join([fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [27]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    TRAIN_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [28]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


In [29]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DEV_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

dev_cleaned_data = dev_raw_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = dev_cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    dev_cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


# Data Preprocessing


Not Applicable since everything has to be implemented from scratch.


# Feature Extraction


In [30]:
class TfIdf:
    # Implement low frequency terms and other techniques
    def __init__(self) -> None:
        self.n_docs: int = None
        self.vocab: List = list()
        self.vocab_size: int = None
        self.vocab_index: Dict[str, int] = dict()
        self.word_document_count: Dict[str, int] = dict()

    def __create_vocab__(self, documents: npt.NDArray) -> Set:
        vocab = set()

        for document in documents:
            for word in document:
                vocab.add(word)

        return list(vocab)

    def __get_word_document_count__(self, documents: npt.NDArray):
        word_document_count = dict()

        for document in documents:
            for word in document:
                if word in self.vocab:
                    if word not in word_document_count:
                        word_document_count[word] = 1
                    else:
                        word_document_count[word] += 1

        return word_document_count

    def __term_frequency__(self, word: str, document: npt.NDArray):
        word_occurences = (document == word).sum()
        return word_occurences / self.n_docs

    def __inverse_document_frequency__(self, word: str):
        word_occurrences = 1

        if word in self.word_document_count:
            word_occurrences += self.word_document_count[word]

        return np.log(self.n_docs / word_occurrences)

    def __tf_idf__(self, document: npt.NDArray):
        tf_idf_vector = np.zeros(shape=(self.vocab_size,))
        for word in document:
            # ignore word not in vocab
            if word in self.vocab:
                tf = self.__term_frequency__(word, document)
                idf = self.__inverse_document_frequency__(word)

                tf_idf_vector[self.vocab_index[word]] = tf * idf
        return tf_idf_vector

    def fit(self, documents: npt.NDArray):
        self.n_docs = documents.shape[0]
        self.vocab = self.__create_vocab__(documents)
        self.vocab_size = len(self.vocab)
        self.vocab_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.word_document_count = self.__get_word_document_count__(documents)

    def transform(self, documents: npt.NDArray):
        tf_idf_vectors = list()
        for document in documents:
            tf_idf_vectors.append(self.__tf_idf__(document))
        return np.array(tf_idf_vectors)

    def export(self):
        return {
            "n_docs": self.n_docs,
            "vocab_size": self.vocab_size,
            "vocab": self.vocab,
            "vocab_size": self.vocab_size,
            "vocab_index": self.vocab_index,
            "word_document_count": self.word_document_count,
        }

    def load(self, tf_idf_model_data):
        self.n_docs = tf_idf_model_data["n_docs"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab = tf_idf_model_data["vocab"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab_index = tf_idf_model_data["vocab_index"]
        self.word_document_count = tf_idf_model_data["word_document_count"]


In [31]:
def tokenize(data: npt.NDArray):
    tokenized_documents = list()
    for document in data:
        tokenized_documents.append(np.array(document.split()))
    return np.array(tokenized_documents, dtype=object)


In [32]:
final_data = load_data(CLEANED_DATA_FILE_PATH)


In [33]:
train_tokenized = tokenize(final_data[:, TRAIN_DATA_COL])
dev_tokenized = tokenize(dev_cleaned_data[:, DEV_DATA_COL])


In [34]:
tf_idf_model = TfIdf()
tf_idf_model.fit(train_tokenized)


In [35]:
X_train_tf_idf_vectors = tf_idf_model.transform(train_tokenized)
X_dev_tf_idf_vectors = tf_idf_model.transform(dev_tokenized)

X_train_tf_idf_vectors.shape, X_dev_tf_idf_vectors.shape

tf_idf_model_data = tf_idf_model.export()


In [36]:
y_train_sentiment = np.where(final_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_train_truthfulness = np.where(final_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)

y_dev_sentiment = np.where(dev_key_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_dev_truthfulness = np.where(dev_key_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)


# Split Data


In [37]:
def train_test_split(X: npt.NDArray, y: npt.NDArray, test_size: float = 0.2):
    if 0 == test_size:
        return X, None, y, None

    n_max = X.shape[0]
    sample = int((1 - test_size) * n_max)

    # Shuffle the data
    all_idx = np.random.permutation(n_max)
    train_idx, test_idx = all_idx[:sample], all_idx[sample:]

    X_train, X_test, y_train, y_test = (
        X[train_idx],
        X[test_idx],
        y[train_idx],
        y[test_idx],
    )

    return X_train, X_test, y_train, y_test


# Perceptron Models


## Vanilla Perceptron


In [38]:
class VanillaPerceptron:
    def __init__(
        self,
        max_iterations: int,
        learning_rate: float = 1e-2,
        tolerance: float = 1e-2,
        val_ratio: float = 0.2,
        shuffle: bool = True,
        class_weights: dict = None,
        debug: bool = False,
        debug_at: int = 50,
        score_func: Callable[[npt.NDArray, npt.NDArray], float] = calculate_f1_score,
    ) -> None:
        self.type= TYPE_VANILLA_PERPCETRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.val_ratio = val_ratio
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.debug = debug
        self.debug_at = debug_at
        self.calculate_score = score_func
        self.best_epoch = 0

    def fit(
        self,
        X: npt.NDArray,
        y: npt.NDArray,
    ):
        n_epoch = 0

        self.weights: npt.NDArray = np.random.rand(X.shape[-1])
        self.bias: float = 0.0

        X_train, X_val, y_train, y_val = train_test_split(X, y, self.val_ratio)
        best_val_score = -1

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.random.permutation(X.shape[0])
                X = X[idxs]
                y = y[idxs]

            for x, y_true in zip(X_train, y_train):

                a = np.dot(self.weights, x) + self.bias
                if y_true * a <= 0:
                    if self.class_weights is None:
                        self.weights = self.weights + y_true * x * self.learning_rate
                    else:
                        self.weights = self.weights + y_true * x * self.class_weights[y_true] * self.learning_rate
                    self.bias = self.bias + y_true

            if self.val_ratio != 0:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score
                    self.best_epoch = n_epoch

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)

        return self.best_epoch, best_val_score

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(np.dot(self.weights, x) + self.bias)
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"type": self.type, "max_iterations": self.best_epoch, "weights": self.weights.tolist(), "bias": float(self.bias)}

    def load(self, model_data: Dict[str, Any]):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]


## Sentiment Classification


In [40]:
from sklearn.metrics import f1_score

vanilla_perceptron_sentiment = VanillaPerceptron(
    max_iterations=1000,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.2,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)

best_epoch, val_score = vanilla_perceptron_sentiment.fit(X_train_tf_idf_vectors, y_train_sentiment)

del vanilla_perceptron_sentiment


  precision = tp / (tp + fp)


Epoch # 50  Train:  0.5520319275490234  Val:  0.5096385963684708
Epoch # 100  Train:  0.4334052225991996  Val:  0.3700463201235203
Epoch # 150  Train:  0.6389837169696457  Val:  0.5104364326375711
Epoch # 200  Train:  0.6957097041871314  Val:  0.5475433015199718
Epoch # 250  Train:  0.7517271439468235  Val:  0.6000000000000001
Epoch # 300  Train:  0.835522943510164  Val:  0.6713712505962568
Epoch # 350  Train:  0.9330213087589756  Val:  0.8333333333333334
Epoch # 400  Train:  0.9555252759231503  Val:  0.885104994015885
Epoch # 450  Train:  0.9673548779803551  Val:  0.8901212699277831
Epoch # 500  Train:  0.9739130434782608  Val:  0.8952765353987129
Epoch # 550  Train:  0.9765266825600587  Val:  0.9055944055944056
Epoch # 600  Train:  0.981758462498982  Val:  0.9207375808449154
Epoch # 650  Train:  0.9895731508634735  Val:  0.9205276083777145
Epoch # 700  Train:  0.993483739044112  Val:  0.9093912222746579
Epoch # 750  Train:  0.9921810839243899  Val:  0.9090883770158482
Epoch # 800  Tr

In [41]:
best_epoch, val_score


(602, 0.9259259259259259)

In [42]:
vanilla_perceptron_sentiment = VanillaPerceptron(
    max_iterations=602,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.0,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)
vanilla_perceptron_sentiment.fit(X_train_tf_idf_vectors, y_train_sentiment)

vanilla_perceptron_sentiment_data = vanilla_perceptron_sentiment.export()


In [43]:
y_dev_sentiment_pred = vanilla_perceptron_sentiment.predict(X_dev_tf_idf_vectors)

calculate_scores(y_train_sentiment, vanilla_perceptron_sentiment.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       480
           1       0.99      0.99      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.88      0.97      0.92       160
           1       0.97      0.86      0.91       160

    accuracy                           0.92       320
   macro avg       0.92      0.92      0.92       320
weighted avg       0.92      0.92      0.92       320

---------------------------------------------------------


In [100]:
neg_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
pos_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")


## Truthfulness Classification


In [77]:
vanilla_perceptron_truthfulness = VanillaPerceptron(
    max_iterations=1000,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.2,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)

best_epoch, best_val_score = vanilla_perceptron_truthfulness.fit(X_train_tf_idf_vectors, y_train_truthfulness)

del vanilla_perceptron_truthfulness


Epoch # 50  Train:  0.4444846292947558  Val:  0.41711426188490414
Epoch # 100  Train:  0.5774456627775394  Val:  0.5194593918157928
Epoch # 150  Train:  0.750784803409211  Val:  0.56640625
Epoch # 200  Train:  0.7965052604803324  Val:  0.5552123552123552
Epoch # 250  Train:  0.8241533903711538  Val:  0.578125
Epoch # 300  Train:  0.8647065325522272  Val:  0.6002775850104094
Epoch # 350  Train:  0.8960909067554954  Val:  0.6031513045562111
Epoch # 400  Train:  0.9556991232118135  Val:  0.7677844969763605
Epoch # 450  Train:  0.9713510580575149  Val:  0.8176638176638176
Epoch # 500  Train:  0.9817703388221253  Val:  0.8226086956521741
Epoch # 550  Train:  0.99088527758704  Val:  0.827558990828185
Epoch # 600  Train:  0.9947916666666666  Val:  0.8326797385620914
Epoch # 650  Train:  0.9934895722954173  Val:  0.8219701101778117
Epoch # 700  Train:  0.9947916666666666  Val:  0.8219701101778117
Epoch # 750  Train:  0.9960937433772504  Val:  0.8270695160894129
Epoch # 800  Train:  0.996093743

In [78]:
best_epoch, best_val_score


(533, 0.8381861185873909)

In [101]:
vanilla_perceptron_truthfulness = VanillaPerceptron(
    max_iterations=533,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.0,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)
vanilla_perceptron_truthfulness.fit(
    X_train_tf_idf_vectors,
    y_train_truthfulness,
)

vanilla_perceptron_truthfulness_data = vanilla_perceptron_truthfulness.export()


In [102]:
y_dev_truthfulness_pred = vanilla_perceptron_truthfulness.predict(X_dev_tf_idf_vectors)

calculate_scores(y_train_truthfulness, vanilla_perceptron_truthfulness.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.98      0.97      0.98       480
           1       0.97      0.98      0.98       480

    accuracy                           0.98       960
   macro avg       0.98      0.98      0.98       960
weighted avg       0.98      0.98      0.98       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.88      0.77      0.82       160
           1       0.79      0.89      0.84       160

    accuracy                           0.83       320
   macro avg       0.84      0.83      0.83       320
weighted avg       0.84      0.83      0.83       320

---------------------------------------------------------


#### Averaging the Scores


In [103]:
fake_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")
truth_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [104]:
neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score


(0.9198813056379822,
 0.9108910891089109,
 0.8200000000000001,
 0.8411764705882352)

In [105]:
np.mean([neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score])


0.872987216333782

#### Write Vanilla Models


In [107]:
vanilla_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": vanilla_perceptron_sentiment_data,
    "truthfulness_classifier": vanilla_perceptron_truthfulness_data,
}

store_model(VANILLA_MODEL_FILE_PATH, vanilla_model_file_data)


#### Load Vanilla Models


In [108]:
tf_idf_model_data, vanilla_perceptron_sentiment_data, vanilla_perceptron_truthfulness_data = load_model(
    VANILLA_MODEL_FILE_PATH
)


In [109]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

vanilla_perceptron_sentiment_saved = VanillaPerceptron(vanilla_perceptron_sentiment_data["max_iterations"])
vanilla_perceptron_sentiment_saved.load(vanilla_perceptron_sentiment_data)

vanilla_perceptron_truthfulness_saved = VanillaPerceptron(vanilla_perceptron_truthfulness_data["max_iterations"])
vanilla_perceptron_truthfulness_saved.load(vanilla_perceptron_truthfulness_data)


##### Test Loaded Model


In [110]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


(320, 7675)

In [111]:
y_pred_sentiment = vanilla_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_train_sentiment, vanilla_perceptron_sentiment_saved.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       480
           1       0.99      0.99      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.88      0.97      0.92       160
           1       0.97      0.86      0.91       160

    accuracy                           0.92       320
   macro avg       0.92      0.92      0.92       320
weighted avg       0.92      0.92      0.92       320

---------------------------------------------------------


In [112]:
y_pred_truthfulness = vanilla_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(
    y_train_truthfulness, vanilla_perceptron_truthfulness_saved.predict(X_train_tf_idf_vectors), title="Train"
)
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.98      0.97      0.98       480
           1       0.97      0.98      0.98       480

    accuracy                           0.98       960
   macro avg       0.98      0.98      0.98       960
weighted avg       0.98      0.98      0.98       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.88      0.77      0.82       160
           1       0.79      0.89      0.84       160

    accuracy                           0.83       320
   macro avg       0.84      0.83      0.83       320
weighted avg       0.84      0.83      0.83       320

---------------------------------------------------------


#### Write Predictions


In [None]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [None]:
store_predictions(OUTPUT_FILE_PATH, output)


--------------------------------------------------------- **\*\*\*\***\*\***\*\*\*\***\*\*\***\*\*\*\***\*\***\*\*\*\*** -------------------------------------------------


## Averaged Perceptron


In [38]:
class AveragedPerceptron:
    def __init__(
        self,
        max_iterations: int,
        learning_rate: float = 1e-2,
        tolerance: float = 1e-2,
        val_ratio: float = 0.2,
        shuffle: bool = True,
        class_weights: dict = None,
        debug: bool = False,
        debug_at: int = 50,
        score_func: Callable[[npt.NDArray, npt.NDArray], float] = calculate_f1_score,
    ) -> None:
        self.type = TYPE_AVERAGED_PERCEPTRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.val_ratio = val_ratio
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.debug = debug
        self.debug_at = debug_at
        self.calculate_score = score_func
        self.best_epoch = 0

    def fit(
        self,
        X: npt.NDArray,
        y: npt.NDArray,
    ):
        n_epoch = 0

        self.weights: npt.NDArray = np.random.rand(X.shape[-1])
        self.bias: float = 0.0

        c = 1
        self.cache = {"weights": np.zeros(shape=(X.shape[-1],)), "bias": 0.0}

        X_train, X_val, y_train, y_val = train_test_split(X, y, self.val_ratio)
        best_val_score = -1

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.random.permutation(X.shape[0])
                X = X[idxs]
                y = y[idxs]

            for x, y_true in zip(X_train, y_train):

                a = np.dot(self.weights, x) + self.bias
                if y_true * a <= 0:
                    if self.class_weights is None:
                        self.weights = self.weights + y_true * x * self.learning_rate
                    else:
                        self.weights = self.weights + y_true * x * self.class_weights[y_true] * self.learning_rate
                    self.bias = self.bias + y_true

                self.cache["weights"] = self.cache["weights"] + y_true * c * x * self.learning_rate
                self.cache["bias"] = self.cache["bias"] + y_true * c

            if self.val_ratio != 0:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score
                    self.best_epoch = n_epoch

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)

        return self.best_epoch, best_val_score

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(np.dot(self.weights, x) + self.bias)
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"type": self.type, "max_iterations": self.best_epoch, "weights": self.weights.tolist(), "bias": float(self.bias)}

    def load(self, model_data: Dict[str, Any]):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]


### Sentiment Classification


In [44]:
averaged_perceptron_sentiment = AveragedPerceptron(
    max_iterations=2000,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.2,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)

best_epoch, val_score = averaged_perceptron_sentiment.fit(X_train_tf_idf_vectors, y_train_sentiment)

del averaged_perceptron_sentiment


Epoch # 50  Train:  0.3583959899749373  Val:  0.3685844141447733
Epoch # 100  Train:  0.4176368259065971  Val:  0.413772046409713
Epoch # 150  Train:  0.5495051453856874  Val:  0.5141536489085328
Epoch # 200  Train:  0.6675324675324676  Val:  0.6042492468685587
Epoch # 250  Train:  0.7673729016213113  Val:  0.6556315549475604
Epoch # 300  Train:  0.8530265179139448  Val:  0.7444444444444445
Epoch # 350  Train:  0.8936729890627163  Val:  0.7871699400278543
Epoch # 400  Train:  0.9294765840220386  Val:  0.7998591350894493
Epoch # 450  Train:  0.9504724666014988  Val:  0.8171838243530861
Epoch # 500  Train:  0.97265397821203  Val:  0.823524496560176
Epoch # 550  Train:  0.9804687168862523  Val:  0.8291624958291626
Epoch # 600  Train:  0.9856764762232557  Val:  0.8403547671840355
Epoch # 650  Train:  0.9895826975116482  Val:  0.8459111664591117
Epoch # 700  Train:  0.9895822029300054  Val:  0.868215138785932
Epoch # 750  Train:  0.9921861752277337  Val:  0.868215138785932
Epoch # 800  Trai

In [45]:
best_epoch, val_score


(1648, 0.911398246423627)

In [46]:
averaged_perceptron_sentiment = AveragedPerceptron(
    max_iterations=1648,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.0,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)
averaged_perceptron_sentiment.fit(X_train_tf_idf_vectors, y_train_sentiment)

averaged_perceptron_sentiment_data = averaged_perceptron_sentiment.export()


In [47]:
y_dev_sentiment_pred = averaged_perceptron_sentiment.predict(X_dev_tf_idf_vectors)

calculate_scores(y_train_sentiment, averaged_perceptron_sentiment.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       480

    accuracy                           1.00       960
   macro avg       1.00      1.00      1.00       960
weighted avg       1.00      1.00      1.00       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.86      0.91       160
           1       0.87      0.97      0.92       160

    accuracy                           0.91       320
   macro avg       0.92      0.91      0.91       320
weighted avg       0.92      0.91      0.91       320

---------------------------------------------------------


In [48]:
neg_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
pos_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")


### Truthful Classification


In [49]:
averaged_perceptron_truthfulness = AveragedPerceptron(
    max_iterations=2000,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.2,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)

best_epoch, best_val_score = averaged_perceptron_truthfulness.fit(X_train_tf_idf_vectors, y_train_truthfulness)

del averaged_perceptron_truthfulness


Epoch # 50  Train:  0.33506493506493507  Val:  0.3263157894736842
Epoch # 100  Train:  0.5764059287266491  Val:  0.5390377412849323
Epoch # 150  Train:  0.6624326720355208  Val:  0.5636363636363636
Epoch # 200  Train:  0.8035530326921057  Val:  0.5795269678388186
Epoch # 250  Train:  0.7695679576745879  Val:  0.6026563706563707
Epoch # 300  Train:  0.9082562544796751  Val:  0.6176353442201047
Epoch # 350  Train:  0.9373772652836574  Val:  0.6322845417236662
Epoch # 400  Train:  0.9320478877265581  Val:  0.63671875
Epoch # 450  Train:  0.9595925994443304  Val:  0.63671875
Epoch # 500  Train:  0.9596238478432304  Val:  0.7948717948717949
Epoch # 550  Train:  0.9212937342433745  Val:  0.8488558321344228
Epoch # 600  Train:  0.9292557111274871  Val:  0.827069516089413
Epoch # 650  Train:  0.9358690113650501  Val:  0.8213464696223316
Epoch # 700  Train:  0.9542968291724971  Val:  0.8209741114523914
Epoch # 750  Train:  0.9647787204769548  Val:  0.8369449086376461
Epoch # 800  Train:  0.9700

In [51]:
best_epoch, best_val_score


(941, 0.8853046594982079)

In [52]:
averaged_perceptron_truthfulness = AveragedPerceptron(
    max_iterations=941,
    learning_rate=0.815,
    tolerance=1e-8,
    shuffle=True,
    val_ratio=0.0,
    debug=True,
    debug_at=50,
    score_func=partial(calculate_f1_score, average="macro"),
)
averaged_perceptron_truthfulness.fit(
    X_train_tf_idf_vectors,
    y_train_truthfulness,
)

averaged_perceptron_truthfulness_data = averaged_perceptron_truthfulness.export()


In [53]:
y_dev_truthfulness_pred = averaged_perceptron_truthfulness.predict(X_dev_tf_idf_vectors)

calculate_scores(y_train_truthfulness, averaged_perceptron_truthfulness.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.92      1.00      0.96       480
           1       1.00      0.92      0.96       480

    accuracy                           0.96       960
   macro avg       0.96      0.96      0.96       960
weighted avg       0.96      0.96      0.96       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.78      0.92      0.84       160
           1       0.90      0.74      0.82       160

    accuracy                           0.83       320
   macro avg       0.84      0.83      0.83       320
weighted avg       0.84      0.83      0.83       320

---------------------------------------------------------


#### Averaging the Scores


In [54]:
fake_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")
truth_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [55]:
neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score


(0.9072847682119205, 0.9171597633136095, 0.8448275862068966, 0.815068493150685)

In [56]:
np.mean([neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score])


0.8710851527207779

#### Write Averaged Models


In [57]:
averaged_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": averaged_perceptron_sentiment_data,
    "truthfulness_classifier": averaged_perceptron_truthfulness_data,
}

store_model(AVERAGED_MODEL_FILE_PATH, averaged_model_file_data)


#### Test Loaded Model


In [58]:
tf_idf_model_data, averaged_perceptron_sentiment_data, averaged_perceptron_truthfulness_data = load_model(
    AVERAGED_MODEL_FILE_PATH
)


In [59]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

averaged_perceptron_sentiment_saved = AveragedPerceptron(averaged_perceptron_sentiment_data["max_iterations"])
averaged_perceptron_sentiment_saved.load(averaged_perceptron_sentiment_data)

averaged_perceptron_truthfulness_saved = AveragedPerceptron(averaged_perceptron_truthfulness_data["max_iterations"])
averaged_perceptron_truthfulness_saved.load(averaged_perceptron_truthfulness_data)


In [60]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


(320, 7675)

In [61]:
y_pred_sentiment = averaged_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_train_sentiment, averaged_perceptron_sentiment_saved.predict(X_train_tf_idf_vectors), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       480

    accuracy                           1.00       960
   macro avg       1.00      1.00      1.00       960
weighted avg       1.00      1.00      1.00       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.86      0.91       160
           1       0.87      0.97      0.92       160

    accuracy                           0.91       320
   macro avg       0.92      0.91      0.91       320
weighted avg       0.92      0.91      0.91       320

---------------------------------------------------------


In [62]:
y_pred_truthfulness = averaged_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(
    y_train_truthfulness, averaged_perceptron_truthfulness_saved.predict(X_train_tf_idf_vectors), title="Train"
)
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.92      1.00      0.96       480
           1       1.00      0.92      0.96       480

    accuracy                           0.96       960
   macro avg       0.96      0.96      0.96       960
weighted avg       0.96      0.96      0.96       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.78      0.92      0.84       160
           1       0.90      0.74      0.82       160

    accuracy                           0.83       320
   macro avg       0.84      0.83      0.83       320
weighted avg       0.84      0.83      0.83       320

---------------------------------------------------------


#### Write Predictions


In [None]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [None]:
store_predictions(OUTPUT_FILE_PATH, output)
