In [244]:
import re

import json
from typing import List, Dict, Tuple, Set, Any, Callable, Optional
from functools import partial

import numpy as np
import numpy.typing as npt


# Define Constants


In [245]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"
OUTPUT_PATH = "./output"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"
OUTPUT_FILENAME = "percepoutput.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

TYPE_VANILLA_PERPCETRON = "vanilla_perceptron"
TYPE_AVERAGED_PERCEPTRON = "averaged_perceptron"

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

OUTPUT_FILE_PATH = f"{OUTPUT_PATH}/{OUTPUT_FILENAME}"

DEV_DATA_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_KEY_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_ID_COL = 0
TRAIN_DATA_COL = 3
DEV_DATA_COL = 1
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1

VAL_SIZE = 0.2


In [246]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [247]:
dev_line = "07Zfn0z If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [248]:
dev_regex = re.compile(r"(\w*) (.*)\n?")


In [249]:
re.match(dev_regex, dev_line).groups()


('07Zfn0z',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [250]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [251]:
input_regex = re.compile(r"(\w*) (\w*) (\w*) (.*)\n?")


In [252]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [253]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [254]:
np.array(data)[0, 1]


'Fake'

In [255]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = r"(\w*) (\w*) (\w*) (.*)\n?"
    if type == "DEV":
        regex = r"(\w*) (.*)\n?"
    elif type == "KEY":
        regex = r"(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [256]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [257]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [258]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)

    return (model_data["tf_idf_model"], model_data["sentiment_classifier"], model_data["truthfulness_classifier"])


In [259]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [260]:
def calculate_accuracy_score(y_true: npt.NDArray, y_pred: npt.NDArray):
    return (y_true == y_pred).sum() / y_true.shape[0]


In [261]:
def calculate_f1_score(y_true: npt.NDArray, y_pred: npt.NDArray, average: str = "macro"):
    def calculate_f1(y_true, y_pred, label):
        tp = np.sum((y_true == label) & (y_pred == label))
        fp = np.sum((y_true != label) & (y_pred == label))
        fn = np.sum((y_pred != label) & (y_true == label))

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    def macro_f1(y_true, y_pred):
        return np.mean([calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)])

    def micro_f1(y_true, y_pred):
        return {label: calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)}

    if average == "macro":
        return macro_f1(y_true, y_pred)
    elif average == "micro":
        return micro_f1(y_true, y_pred)
    else:
        return {"micro": micro_f1(y_true, y_pred), "macro": macro_f1(y_true, y_pred)}


In [262]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("---------------------------------------------------------")


In [263]:
# Learning Rate Scheduler
def learning_rate_scheduler(learning_rate: float, epoch: int, decay: float = 1e-2):
    return learning_rate * 1 / (1 + decay * epoch)


# Load Data


In [264]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")

dev_raw_data = load_data(DEV_DATA_FILE_PATH, type="DEV")
dev_key_data = load_data(DEV_KEY_FILE_PATH, type="KEY")


# Data Cleaning


In [265]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [266]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [267]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [268]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [269]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [270]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [271]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [272]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [273]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    from contractions import fix

    def contraction_fixer(txt: str):
        return " ".join([fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [274]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    TRAIN_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [275]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


In [276]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DEV_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

dev_cleaned_data = dev_raw_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = dev_cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    dev_cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


# Data Preprocessing


Not Applicable since everything has to be implemented from scratch.


# Feature Extraction


In [277]:
class TfIdf:
    # Implement low frequency terms and other techniques
    def __init__(self) -> None:
        self.n_docs: int = None
        self.vocab: List = list()
        self.vocab_size: int = None
        self.vocab_index: Dict[str, int] = dict()
        self.word_document_count: Dict[str, int] = dict()

    def __create_vocab__(self, documents: npt.NDArray) -> Set:
        vocab = set()

        for document in documents:
            for word in document:
                vocab.add(word)

        return list(vocab)

    def __get_word_document_count__(self, documents: npt.NDArray):
        word_document_count = dict()

        for document in documents:
            for word in document:
                if word in self.vocab:
                    if word not in word_document_count:
                        word_document_count[word] = 1
                    else:
                        word_document_count[word] += 1

        return word_document_count

    def __term_frequency__(self, word: str, document: npt.NDArray):
        word_occurences = (document == word).sum()
        return word_occurences / self.n_docs

    def __inverse_document_frequency__(self, word: str):
        word_occurrences = 1

        if word in self.word_document_count:
            word_occurrences += self.word_document_count[word]

        return np.log(self.n_docs / word_occurrences)

    def __tf_idf__(self, document: npt.NDArray):
        tf_idf_vector = np.zeros(shape=(self.vocab_size,))
        for word in document:
            # ignore word not in vocab
            if word in self.vocab:
                tf = self.__term_frequency__(word, document)
                idf = self.__inverse_document_frequency__(word)

                tf_idf_vector[self.vocab_index[word]] = tf * idf
        return tf_idf_vector

    def fit(self, documents: npt.NDArray):
        self.n_docs = documents.shape[0]
        self.vocab = self.__create_vocab__(documents)
        self.vocab_size = len(self.vocab)
        self.vocab_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.word_document_count = self.__get_word_document_count__(documents)

    def transform(self, documents: npt.NDArray):
        tf_idf_vectors = list()
        for document in documents:
            tf_idf_vectors.append(self.__tf_idf__(document))
        return np.array(tf_idf_vectors)

    def export(self):
        return {
            "n_docs": self.n_docs,
            "vocab_size": self.vocab_size,
            "vocab": self.vocab,
            "vocab_size": self.vocab_size,
            "vocab_index": self.vocab_index,
            "word_document_count": self.word_document_count,
        }

    def load(self, tf_idf_model_data):
        self.n_docs = tf_idf_model_data["n_docs"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab = tf_idf_model_data["vocab"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab_index = tf_idf_model_data["vocab_index"]
        self.word_document_count = tf_idf_model_data["word_document_count"]


In [278]:
def tokenize(data: npt.NDArray):
    tokenized_documents = list()
    for document in data:
        tokenized_documents.append(np.array(document.split()))
    return np.array(tokenized_documents, dtype=object)


In [279]:
final_data = load_data(CLEANED_DATA_FILE_PATH)


In [280]:
train_tokenized = tokenize(final_data[:, TRAIN_DATA_COL])
dev_tokenized = tokenize(dev_cleaned_data[:, DEV_DATA_COL])


In [281]:
tf_idf_model = TfIdf()
tf_idf_model.fit(train_tokenized)


In [282]:
X_all = tf_idf_model.transform(train_tokenized)
X_dev = tf_idf_model.transform(dev_tokenized)

tf_idf_model_data = tf_idf_model.export()


In [283]:
y_all_sentiment = np.where(final_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_all_truthfulness = np.where(final_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)

y_dev_sentiment = np.where(dev_key_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_dev_truthfulness = np.where(dev_key_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)


# Split Data


In [284]:
def train_test_split(
    X: npt.NDArray,
    y_sentiment: npt.NDArray,
    y_truthfulness: npt.NDArray,
    test_size: float = 0.2,
    rng=np.random.default_rng(seed=RANDOM_SEED),
):
    n_max = X.shape[0]
    sample = int((1 - test_size) * n_max)

    # Shuffle the data
    all_idx = np.arange(n_max)
    rng.shuffle(all_idx)

    train_idx, test_idx = all_idx[:sample], all_idx[sample:]

    X_train, X_test, y_train_sentiment, y_test_sentiment, y_train_truthfulness, y_test_truthfulness = (
        X[train_idx],
        X[test_idx],
        y_sentiment[train_idx],
        y_sentiment[test_idx],
        y_truthfulness[train_idx],
        y_truthfulness[test_idx],
    )

    return X_train, X_test, y_train_sentiment, y_test_sentiment, y_train_truthfulness, y_test_truthfulness


In [285]:
X_train, X_val, y_train_sentiment, y_val_sentiment, y_train_truthfulness, y_val_truthfulness = train_test_split(
    X_all, y_all_sentiment, y_all_truthfulness, VAL_SIZE
)


# Perceptron Models


## Vanilla Perceptron


In [286]:
class VanillaPerceptron:
    def __init__(
        self,
        max_iterations: int = 1000,
        learning_rate: float = 1e-2,
        shuffle: bool = True,
        class_weights = None,
        lr_scheduler_func = None,
        score_func = calculate_f1_score,
        rng=np.random.default_rng(seed=RANDOM_SEED),
        debug: bool = False,
        debug_at: int = 50,
    ) -> None:
        self.type = TYPE_VANILLA_PERPCETRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.learning_rate_scheduler = lr_scheduler_func
        self.calculate_score = score_func
        self.rng = rng
        self.debug = debug
        self.debug_at = debug_at

        self.best_epoch = 0

    def fit(
        self,
        X_train,
        y_train,
        X_val = None,
        y_val = None,
    ):
        n_epoch = 0

        self.weights = self.rng.random(size=(X_train.shape[-1],))
        self.bias: float = 0.0

        learning_rate = self.learning_rate

        best_epoch = 0
        best_val_score = -1
        best_weights = self.weights.copy()
        best_bias = self.bias

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.arange(X_train.shape[0])
                self.rng.shuffle(idxs)

                X_train = X_train[idxs]
                y_train = y_train[idxs]

            for x, y_true in zip(X_train, y_train):

                if y_true * self._activation(x) <= 0:
                    if self.class_weights is None:
                        self.weights = self.weights + y_true * x * self.learning_rate
                    else:
                        self.weights = self.weights + y_true * x * self.learning_rate * self.class_weights[y_true]
                    self.bias = self.bias + y_true

            if X_val is not None and y_val is not None:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score

                    # Record the current best wegiths and bias
                    best_epoch = n_epoch
                    best_weights = self.weights
                    best_bias = self.bias

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)
            else:
                best_epoch = n_epoch
                best_weights = self.weights
                best_bias = self.bias

            # Update learning rate
            if self.learning_rate_scheduler:
                learning_rate = self.learning_rate_scheduler(learning_rate, n_epoch)

        # Set the best weights and bias found if Val provided
        self.best_epoch = best_epoch
        self.weights = best_weights
        self.bias = best_bias

    def _activation(self, x):
        return np.dot(self.weights, x) + self.bias

    def predict(self, X):
        predictions = list()
        for x in X:
            pred = np.sign(self._activation(x))
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {
            "type": self.type,
            "max_iterations": self.max_iterations,
            "weights": self.weights.tolist(),
            "bias": float(self.bias),
            "best_epoch": self.best_epoch,
        }

    def load(self, model_data):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]
        self.best_epoch = model_data["best_epoch"]


## Sentiment Classification


In [287]:
vanilla_perceptron_sentiment = VanillaPerceptron(
    max_iterations=2000,
    learning_rate=0.815,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

vanilla_perceptron_sentiment.fit(X_all, y_all_sentiment, X_dev, y_dev_sentiment)

# Export the model
vanilla_perceptron_sentiment_data = vanilla_perceptron_sentiment.export()


  precision = tp / (tp + fp)


Epoch # 50  Train:  0.5617643363836851  Val:  0.5095361567271679
Epoch # 100  Train:  0.5708063957080639  Val:  0.5186511036742499
Epoch # 150  Train:  0.6962993540010654  Val:  0.5826587515740241
Epoch # 200  Train:  0.5134502923976608  Val:  0.49220785372554593
Epoch # 250  Train:  0.885136077085284  Val:  0.7780588307290605
Epoch # 300  Train:  0.9193505273528491  Val:  0.8026668284151679
Epoch # 350  Train:  0.9383537714235337  Val:  0.8130238555770471
Epoch # 400  Train:  0.9614181019922032  Val:  0.8266730861819933
Epoch # 450  Train:  0.9645487129358097  Val:  0.8232772037633922
Epoch # 500  Train:  0.9729024357409926  Val:  0.8164528301886793
Epoch # 550  Train:  0.9718559135698579  Val:  0.81259340037966
Epoch # 600  Train:  0.9812447902195054  Val:  0.802186888801062
Epoch # 650  Train:  0.960360924216346  Val:  0.7951864758053866
Epoch # 700  Train:  0.9540787705634483  Val:  0.7951864758053866
Epoch # 750  Train:  0.9895826099034655  Val:  0.7986931287795771
Epoch # 800  Tr

In [288]:
vanilla_perceptron_sentiment_data["best_epoch"]


1327

In [289]:
# Classify and predict with the best model
y_dev_sentiment_pred = vanilla_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")

del vanilla_perceptron_sentiment


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.97      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.97      0.93      0.95       160
           1       0.93      0.97      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


In [290]:
vanilla_perceptron_sentiment = VanillaPerceptron()
vanilla_perceptron_sentiment.load(vanilla_perceptron_sentiment_data)

y_dev_sentiment_pred = vanilla_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.97      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.97      0.93      0.95       160
           1       0.93      0.97      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


## Truthfulness Classification


In [345]:
vanilla_perceptron_truthfulness = VanillaPerceptron(
    max_iterations=900,
    learning_rate=3.2,
    shuffle=True,
    class_weights={-1: 1.04275, 1: 1.0},
    score_func=partial(calculate_f1_score, average="macro"),
    rng=np.random.default_rng(seed=RANDOM_SEED),
    debug=True,
    debug_at=50,
)

vanilla_perceptron_truthfulness.fit(X_all, y_all_truthfulness, X_dev, y_dev_truthfulness)

# Export the model
vanilla_perceptron_truthfulness_data = vanilla_perceptron_truthfulness.export()

del vanilla_perceptron_truthfulness


Epoch # 50  Train:  0.7544590885578256  Val:  0.6302335878357217
Epoch # 100  Train:  0.9054555641151341  Val:  0.6794297223481773
Epoch # 150  Train:  0.3627391753647064  Val:  0.35383467736749363
Epoch # 200  Train:  0.9572210647129213  Val:  0.6620836044415527
Epoch # 250  Train:  0.9979166576243821  Val:  0.6620836044415527
Epoch # 300  Train:  0.9916660879227723  Val:  0.8184947582537945
Epoch # 350  Train:  0.9011354523184525  Val:  0.68101802757158
Epoch # 400  Train:  0.9739406607128315  Val:  0.6729688298415942
Epoch # 450  Train:  1.0  Val:  0.6702047408068509
Epoch # 500  Train:  1.0  Val:  0.6702047408068509
Epoch # 550  Train:  1.0  Val:  0.6702047408068509
Epoch # 600  Train:  1.0  Val:  0.6702047408068509
Epoch # 650  Train:  1.0  Val:  0.6702047408068509
Epoch # 700  Train:  1.0  Val:  0.6702047408068509
Epoch # 750  Train:  1.0  Val:  0.6702047408068509
Epoch # 800  Train:  1.0  Val:  0.6702047408068509
Epoch # 850  Train:  1.0  Val:  0.6702047408068509
Epoch # 900  Tr

In [346]:
vanilla_perceptron_truthfulness_data["best_epoch"]


293

In [347]:
vanilla_perceptron_truthfulness = VanillaPerceptron()
vanilla_perceptron_truthfulness.load(vanilla_perceptron_truthfulness_data)

y_dev_truthfulness_pred = vanilla_perceptron_truthfulness.predict(X_dev)

calculate_scores(y_all_truthfulness, vanilla_perceptron_truthfulness.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.97      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.80      0.88      0.84       160
           1       0.87      0.78      0.82       160

    accuracy                           0.83       320
   macro avg       0.83      0.83      0.83       320
weighted avg       0.83      0.83      0.83       320

---------------------------------------------------------


#### Averaging the Scores


In [185]:
sentiment_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
truthfulness_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [186]:
np.mean([sentiment_f1_score, truthfulness_f1_score])


0.8937126333476613

#### Write Vanilla Models


In [None]:
vanilla_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": vanilla_perceptron_sentiment_data,
    "truthfulness_classifier": vanilla_perceptron_truthfulness_data,
}

store_model(VANILLA_MODEL_FILE_PATH, vanilla_model_file_data)


#### Load Vanilla Models


In [None]:
tf_idf_model_data, vanilla_perceptron_sentiment_data, vanilla_perceptron_truthfulness_data = load_model(
    VANILLA_MODEL_FILE_PATH
)


In [None]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

vanilla_perceptron_sentiment_saved = VanillaPerceptron()
vanilla_perceptron_sentiment_saved.load(vanilla_perceptron_sentiment_data)

vanilla_perceptron_truthfulness_saved = VanillaPerceptron()
vanilla_perceptron_truthfulness_saved.load(vanilla_perceptron_truthfulness_data)


##### Test Loaded Model


In [None]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


In [None]:
y_pred_sentiment = vanilla_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment_saved.predict(X_all), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


In [None]:
y_pred_truthfulness = vanilla_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_truthfulness, vanilla_perceptron_truthfulness_saved.predict(X_all), title="Train")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


#### Write Predictions


In [None]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [None]:
store_predictions(OUTPUT_FILE_PATH, output)


--------------------------------------------------------- **\*\*\*\***\*\***\*\*\*\***\*\*\***\*\*\*\***\*\***\*\*\*\*** -------------------------------------------------


## Averaged Perceptron


In [187]:
class AveragedPerceptron:
    def __init__(
        self,
        max_iterations: int = 1000,
        learning_rate: float = 1e-2,
        shuffle: bool = True,
        class_weights = None,
        lr_scheduler_func = None,
        score_func = calculate_f1_score,
        rng=np.random.default_rng(seed=RANDOM_SEED),
        debug: bool = False,
        debug_at: int = 50,
    ) -> None:
        self.type = TYPE_AVERAGED_PERCEPTRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.learning_rate_scheduler = lr_scheduler_func
        self.calculate_score = score_func
        self.rng = rng
        self.debug = debug
        self.debug_at = debug_at

        self.best_epoch = 0

    def fit(
        self,
        X_train,
        y_train,
        X_val = None,
        y_val = None,
    ):
        n_epoch = 0

        self.weights = None
        self.bias = None
        current_weights = self.rng.random(size=(X_train.shape[-1],))
        current_bias: float = 0.0
        self.cache = {"weights": current_weights, "bias": current_bias}

        c = 1

        best_val_score = -1
        best_epoch = 0
        best_weights = current_weights.copy()
        best_bias = current_bias
        best_cache = self.cache

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.arange(X_train.shape[0])
                self.rng.shuffle(idxs)

                X_train = X_train[idxs]
                y_train = y_train[idxs]

            for x, y_true in zip(X_train, y_train):

                a = np.dot(current_weights, x) + current_bias
                if y_true * a <= 0:
                    if self.class_weights is None:
                        current_weights = current_weights + y_true * x * self.learning_rate
                        self.cache["weights"] = self.cache["weights"] + y_true * c * x * self.learning_rate
                    else:
                        current_weights = current_weights + y_true * x * self.learning_rate * self.class_weights[y_true]
                        self.cache["weights"] = (
                            self.cache["weights"] + y_true * c * x * self.learning_rate * self.class_weights[y_true]
                        )

                    current_bias = current_bias + y_true
                    self.cache["bias"] = self.cache["bias"] + y_true * c

                c += 1

            self.weights = current_weights - (1 / c) * self.cache["weights"]
            self.bias = current_bias - (1 / c) * self.cache["bias"]

            if X_val is not None and y_val is not None:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score

                    best_epoch = n_epoch
                    best_weights = self.weights
                    best_bias = self.bias
                    best_cache = self.cache

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)

            # Update learning rate
            if self.learning_rate_scheduler:
                learning_rate = self.learning_rate_scheduler(learning_rate, n_epoch)

        # Set best epochs, weight, bias and cache
        if  X_val is not None and y_val is not None:
            self.best_epoch = best_epoch
            self.weights = best_weights
            self.bias = best_bias
            self.cache = best_cache

    def _activation(self, x):
        return np.dot(self.weights, x) + self.bias

    def predict(self, X):
        predictions = list()
        for x in X:
            pred = np.sign(self._activation(x))
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {
            "type": self.type,
            "max_iterations": self.max_iterations,
            "weights": self.weights.tolist(),
            "bias": float(self.bias),
            "best_epoch": self.best_epoch,
        }

    def load(self, model_data):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]
        self.best_epoch = model_data["best_epoch"]


### Sentiment Classification


In [None]:
averaged_perceptron_sentiment = AveragedPerceptron(
    max_iterations=2000,
    learning_rate=3,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

averaged_perceptron_sentiment.fit(X_all, y_all_sentiment, X_dev, y_dev_sentiment)

averaged_perceptron_sentiment_data = averaged_perceptron_sentiment.export()

In [None]:
averaged_perceptron_sentiment_data["best_epoch"]

In [None]:
y_dev_sentiment_pred = averaged_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, averaged_perceptron_sentiment.predict(X_all), title="All")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


### Truthful Classification


In [214]:
averaged_perceptron_truthfulness = AveragedPerceptron(
    max_iterations=800,
    learning_rate=815e-2,
    shuffle=True,
    class_weights={-1:1.0095, 1:1.0},
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

averaged_perceptron_truthfulness.fit(X_all, y_all_truthfulness, X_dev, y_dev_truthfulness)
averaged_perceptron_truthfulness_data = averaged_perceptron_truthfulness.export()


  precision = tp / (tp + fp)


Epoch # 50  Train:  0.7826047573325603  Val:  0.656089900771085
Epoch # 100  Train:  0.9404294695588276  Val:  0.7811730686569497
Epoch # 150  Train:  0.9718559135698579  Val:  0.815406877267083
Epoch # 200  Train:  0.9822861118558228  Val:  0.8343345543345543
Epoch # 250  Train:  0.9822861118558228  Val:  0.8305882352941176
Epoch # 300  Train:  0.9822861118558228  Val:  0.8236498189261534
Epoch # 350  Train:  0.9843711843711844  Val:  0.8164266497863586
Epoch # 400  Train:  0.9854135645167592  Val:  0.8031746031746032
Epoch # 450  Train:  0.9854135645167592  Val:  0.7961458831024049
Epoch # 500  Train:  0.9854135645167592  Val:  0.7860087233384235
Epoch # 550  Train:  0.986455849651249  Val:  0.7791977919779198
Epoch # 600  Train:  0.9874980465697765  Val:  0.77137106918239
Epoch # 650  Train:  0.9874980465697765  Val:  0.7673573246092331
Epoch # 700  Train:  0.9874980465697765  Val:  0.7673573246092331
Epoch # 750  Train:  0.9874980465697765  Val:  0.7673573246092331
Epoch # 800  Tra

In [215]:
averaged_perceptron_truthfulness_data["best_epoch"]


197

In [216]:
y_dev_truthfulness_pred = averaged_perceptron_truthfulness.predict(X_dev)

calculate_scores(y_all_truthfulness, averaged_perceptron_truthfulness.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")

del averaged_perceptron_truthfulness


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98       480
           1       1.00      0.96      0.98       480

    accuracy                           0.98       960
   macro avg       0.98      0.98      0.98       960
weighted avg       0.98      0.98      0.98       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.83      0.85      0.84       160
           1       0.85      0.82      0.84       160

    accuracy                           0.84       320
   macro avg       0.84      0.84      0.84       320
weighted avg       0.84      0.84      0.84       320

---------------------------------------------------------


#### Averaging the Scores


In [None]:
sentiment_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
truthfulness_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [None]:
np.mean([sentiment_f1_score, truthfulness_f1_score])


#### Write Averaged Models


In [None]:
averaged_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": averaged_perceptron_sentiment_data,
    "truthfulness_classifier": averaged_perceptron_truthfulness_data,
}

store_model(AVERAGED_MODEL_FILE_PATH, averaged_model_file_data)


#### Test Loaded Model


In [None]:
tf_idf_model_data, averaged_perceptron_sentiment_data, averaged_perceptron_truthfulness_data = load_model(
    AVERAGED_MODEL_FILE_PATH
)


In [None]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

averaged_perceptron_sentiment_saved = AveragedPerceptron()
averaged_perceptron_sentiment_saved.load(averaged_perceptron_sentiment_data)

averaged_perceptron_truthfulness_saved = AveragedPerceptron()
averaged_perceptron_truthfulness_saved.load(averaged_perceptron_truthfulness_data)


In [None]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


In [None]:
y_pred_sentiment = averaged_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_sentiment, averaged_perceptron_sentiment_saved.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


In [None]:
y_pred_truthfulness = averaged_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_truthfulness, averaged_perceptron_truthfulness_saved.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


#### Write Predictions


In [None]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [None]:
store_predictions(OUTPUT_FILE_PATH, output)
