In [1]:
import re

import json
from typing import List, Dict, Tuple, Set, Any, Callable, Optional
from functools import partial

import numpy as np
import numpy.typing as npt


# Define Constants


In [2]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"
OUTPUT_PATH = "./output"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"
OUTPUT_FILENAME = "output.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

TYPE_VANILLA_PERPCETRON = "vanilla_perceptron"
TYPE_AVERAGED_PERCEPTRON = "averaged_perceptron"

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

OUTPUT_FILE_PATH = f"{OUTPUT_PATH}/{OUTPUT_FILENAME}"

DEV_DATA_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_KEY_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_ID_COL = 0
TRAIN_DATA_COL = 3
DEV_DATA_COL = 1
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1

VAL_SIZE = 0.2


In [3]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [4]:
dev_line = "07Zfn0z If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [5]:
dev_regex = re.compile(r"(\w*) (.*)\n?")


In [6]:
re.match(dev_regex, dev_line).groups()


('07Zfn0z',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [7]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [8]:
input_regex = re.compile(r"(\w*) (\w*) (\w*) (.*)\n?")


In [9]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [10]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [11]:
np.array(data)[0, 1]


'Fake'

In [12]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = r"(\w*) (\w*) (\w*) (.*)\n?"
    if type == "DEV":
        regex = r"(\w*) (.*)\n?"
    elif type == "KEY":
        regex = r"(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [13]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [14]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [15]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)

    return (model_data["tf_idf_model"], model_data["sentiment_classifier"], model_data["truthfulness_classifier"])


In [16]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [17]:
def calculate_accuracy_score(y_true: npt.NDArray, y_pred: npt.NDArray):
    return (y_true == y_pred).sum() / y_true.shape[0]


In [18]:
def calculate_f1_score(y_true: npt.NDArray, y_pred: npt.NDArray, average: str = "macro"):
    def calculate_f1(y_true, y_pred, label):
        tp = np.sum((y_true == label) & (y_pred == label))
        fp = np.sum((y_true != label) & (y_pred == label))
        fn = np.sum((y_pred != label) & (y_true == label))

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    def macro_f1(y_true, y_pred):
        return np.mean([calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)])

    def micro_f1(y_true, y_pred):
        return {label: calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)}

    if average == "macro":
        return macro_f1(y_true, y_pred)
    elif average == "micro":
        return micro_f1(y_true, y_pred)
    else:
        return {"micro": micro_f1(y_true, y_pred), "macro": macro_f1(y_true, y_pred)}


In [19]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("---------------------------------------------------------")


In [20]:
# Learning Rate Scheduler
def learning_rate_scheduler(learning_rate: float, epoch: int, decay: float = 1e-2):
    return learning_rate * 1 / (1 + decay * epoch)


# Load Data


In [21]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")

dev_raw_data = load_data(DEV_DATA_FILE_PATH, type="DEV")
dev_key_data = load_data(DEV_KEY_FILE_PATH, type="KEY")


# Data Cleaning


In [22]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [23]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [24]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [25]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [26]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [27]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [28]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [29]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [30]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    from contractions import fix

    def contraction_fixer(txt: str):
        return " ".join([fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [31]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    TRAIN_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [32]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


In [33]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DEV_DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

dev_cleaned_data = dev_raw_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = dev_cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    dev_cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


# Data Preprocessing


Not Applicable since everything has to be implemented from scratch.


# Feature Extraction


In [34]:
class TfIdf:
    # Implement low frequency terms and other techniques
    def __init__(self) -> None:
        self.n_docs: int = None
        self.vocab: List = list()
        self.vocab_size: int = None
        self.vocab_index: Dict[str, int] = dict()
        self.word_document_count: Dict[str, int] = dict()

    def __create_vocab__(self, documents: npt.NDArray) -> Set:
        vocab = set()

        for document in documents:
            for word in document:
                vocab.add(word)

        return list(vocab)

    def __get_word_document_count__(self, documents: npt.NDArray):
        word_document_count = dict()

        for document in documents:
            for word in document:
                if word in self.vocab:
                    if word not in word_document_count:
                        word_document_count[word] = 1
                    else:
                        word_document_count[word] += 1

        return word_document_count

    def __term_frequency__(self, word: str, document: npt.NDArray):
        word_occurences = (document == word).sum()
        return word_occurences / self.n_docs

    def __inverse_document_frequency__(self, word: str):
        word_occurrences = 1

        if word in self.word_document_count:
            word_occurrences += self.word_document_count[word]

        return np.log(self.n_docs / word_occurrences)

    def __tf_idf__(self, document: npt.NDArray):
        tf_idf_vector = np.zeros(shape=(self.vocab_size,))
        for word in document:
            # ignore word not in vocab
            if word in self.vocab:
                tf = self.__term_frequency__(word, document)
                idf = self.__inverse_document_frequency__(word)

                tf_idf_vector[self.vocab_index[word]] = tf * idf
        return tf_idf_vector

    def fit(self, documents: npt.NDArray):
        self.n_docs = documents.shape[0]
        self.vocab = self.__create_vocab__(documents)
        self.vocab_size = len(self.vocab)
        self.vocab_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.word_document_count = self.__get_word_document_count__(documents)

    def transform(self, documents: npt.NDArray):
        tf_idf_vectors = list()
        for document in documents:
            tf_idf_vectors.append(self.__tf_idf__(document))
        return np.array(tf_idf_vectors)

    def export(self):
        return {
            "n_docs": self.n_docs,
            "vocab_size": self.vocab_size,
            "vocab": self.vocab,
            "vocab_size": self.vocab_size,
            "vocab_index": self.vocab_index,
            "word_document_count": self.word_document_count,
        }

    def load(self, tf_idf_model_data):
        self.n_docs = tf_idf_model_data["n_docs"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab = tf_idf_model_data["vocab"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab_index = tf_idf_model_data["vocab_index"]
        self.word_document_count = tf_idf_model_data["word_document_count"]


In [35]:
def tokenize(data: npt.NDArray):
    tokenized_documents = list()
    for document in data:
        tokenized_documents.append(np.array(document.split()))
    return np.array(tokenized_documents, dtype=object)


In [36]:
final_data = load_data(CLEANED_DATA_FILE_PATH)


In [37]:
train_tokenized = tokenize(final_data[:, TRAIN_DATA_COL])
dev_tokenized = tokenize(dev_cleaned_data[:, DEV_DATA_COL])


In [38]:
tf_idf_model = TfIdf()
tf_idf_model.fit(train_tokenized)


In [39]:
X_all = tf_idf_model.transform(train_tokenized)
X_dev = tf_idf_model.transform(dev_tokenized)

tf_idf_model_data = tf_idf_model.export()


In [40]:
y_all_sentiment = np.where(final_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_all_truthfulness = np.where(final_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)

y_dev_sentiment = np.where(dev_key_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
y_dev_truthfulness = np.where(dev_key_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)


# Split Data


In [41]:
def train_test_split(
    X: npt.NDArray,
    y_sentiment: npt.NDArray,
    y_truthfulness: npt.NDArray,
    test_size: float = 0.2,
    rng=np.random.default_rng(seed=RANDOM_SEED),
):
    n_max = X.shape[0]
    sample = int((1 - test_size) * n_max)

    # Shuffle the data
    all_idx = np.arange(n_max)
    rng.shuffle(all_idx)

    train_idx, test_idx = all_idx[:sample], all_idx[sample:]

    X_train, X_test, y_train_sentiment, y_test_sentiment, y_train_truthfulness, y_test_truthfulness = (
        X[train_idx],
        X[test_idx],
        y_sentiment[train_idx],
        y_sentiment[test_idx],
        y_truthfulness[train_idx],
        y_truthfulness[test_idx],
    )

    return X_train, X_test, y_train_sentiment, y_test_sentiment, y_train_truthfulness, y_test_truthfulness


In [42]:
X_train, X_val, y_train_sentiment, y_val_sentiment, y_train_truthfulness, y_val_truthfulness = train_test_split(
    X_all, y_all_sentiment, y_all_truthfulness, VAL_SIZE
)


# Perceptron Models


## Vanilla Perceptron


In [43]:
class VanillaPerceptron:
    def __init__(
        self,
        max_iterations: int = 1000,
        learning_rate: float = 1e-2,
        shuffle: bool = True,
        class_weights: dict = None,
        lr_scheduler_func: Optional[Callable[[float, int], float]] = None,
        score_func: Callable[[npt.NDArray, npt.NDArray], float] = calculate_f1_score,
        rng=np.random.default_rng(seed=RANDOM_SEED),
        debug: bool = False,
        debug_at: int = 50,
    ) -> None:
        self.type = TYPE_VANILLA_PERPCETRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.learning_rate_scheduler = lr_scheduler_func
        self.calculate_score = score_func
        self.rng = rng
        self.debug = debug
        self.debug_at = debug_at

        self.best_epoch = 0

    def fit(
        self,
        X_train: npt.NDArray,
        y_train: npt.NDArray,
        X_val: npt.NDArray = None,
        y_val: npt.NDArray = None,
    ):
        n_epoch = 0

        self.weights: npt.NDArray = self.rng.random(size=(X_train.shape[-1],))
        self.bias: float = 0.0

        learning_rate = self.learning_rate

        best_epoch = 0
        best_val_score = -1
        best_weights = self.weights.copy()
        best_bias = self.bias

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.arange(X_train.shape[0])
                self.rng.shuffle(idxs)

                X_train = X_train[idxs]
                y_train = y_train[idxs]

            for x, y_true in zip(X_train, y_train):

                if y_true * self._activation(x) <= 0:
                    if self.class_weights is None:
                        self.weights = self.weights + y_true * x * self.learning_rate
                    else:
                        self.weights = self.weights + y_true * x * self.class_weights[y_true] * self.learning_rate
                    self.bias = self.bias + y_true

            if X_val is not None and y_val is not None:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score

                    # Record the current best wegiths and bias
                    best_epoch = n_epoch
                    best_weights = self.weights
                    best_bias = self.bias

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)

            # Update learning rate
            if self.learning_rate_scheduler:
                learning_rate = self.learning_rate_scheduler(learning_rate, n_epoch)

        # Set the best weights and bias found
        self.best_epoch = best_epoch
        self.weights = best_weights
        self.bias = best_bias

    def _activation(self, x: npt.NDArray):
        return np.dot(self.weights, x) + self.bias

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(self._activation(x))
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {
            "type": self.type,
            "max_iterations": self.best_epoch,
            "weights": self.weights.tolist(),
            "bias": float(self.bias),
            "best_epoch": self.best_epoch,
        }

    def load(self, model_data: Dict[str, Any]):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]
        self.best_epoch = model_data["best_epoch"]


## Sentiment Classification


In [44]:
vanilla_perceptron_sentiment = VanillaPerceptron(
    max_iterations=2000,
    learning_rate=0.815,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

vanilla_perceptron_sentiment.fit(X_all, y_all_sentiment, X_dev, y_dev_sentiment)

# Export the model
vanilla_perceptron_sentiment_data = vanilla_perceptron_sentiment.export()


  precision = tp / (tp + fp)


Epoch # 50  Train:  0.5617643363836851  Val:  0.5095361567271679
Epoch # 100  Train:  0.5708063957080639  Val:  0.5186511036742499
Epoch # 150  Train:  0.6962993540010654  Val:  0.5826587515740241
Epoch # 200  Train:  0.5134502923976608  Val:  0.49220785372554593
Epoch # 250  Train:  0.885136077085284  Val:  0.7780588307290605
Epoch # 300  Train:  0.9193505273528491  Val:  0.8026668284151679
Epoch # 350  Train:  0.9383537714235337  Val:  0.8130238555770471
Epoch # 400  Train:  0.9614181019922032  Val:  0.8266730861819933
Epoch # 450  Train:  0.9645487129358097  Val:  0.8232772037633922
Epoch # 500  Train:  0.9729024357409926  Val:  0.8164528301886793
Epoch # 550  Train:  0.9718559135698579  Val:  0.81259340037966
Epoch # 600  Train:  0.9812447902195054  Val:  0.802186888801062
Epoch # 650  Train:  0.960360924216346  Val:  0.7951864758053866
Epoch # 700  Train:  0.9540787705634483  Val:  0.7951864758053866
Epoch # 750  Train:  0.9895826099034655  Val:  0.7986931287795771
Epoch # 800  Tr

In [45]:
vanilla_perceptron_sentiment_data["best_epoch"]


1327

In [46]:
# Classify and predict with the best model
y_dev_sentiment_pred = vanilla_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")

del vanilla_perceptron_sentiment


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.97      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.97      0.93      0.95       160
           1       0.93      0.97      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


In [47]:
vanilla_perceptron_sentiment = VanillaPerceptron()
vanilla_perceptron_sentiment.load(vanilla_perceptron_sentiment_data)

y_dev_sentiment_pred = vanilla_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.97      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.97      0.93      0.95       160
           1       0.93      0.97      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


## Truthfulness Classification


In [48]:
vanilla_perceptron_truthfulness = VanillaPerceptron(
    max_iterations=1000,
    learning_rate=20e-2,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

vanilla_perceptron_truthfulness.fit(X_all, y_all_truthfulness, X_dev, y_dev_truthfulness)

# Export the model
vanilla_perceptron_truthfulness_data = vanilla_perceptron_truthfulness.export()

del vanilla_perceptron_truthfulness


  precision = tp / (tp + fp)


Epoch # 50  Train:  nan  Val:  nan
Epoch # 100  Train:  0.3379463177880759  Val:  nan
Epoch # 150  Train:  0.4679020201044777  Val:  0.4791802791802792
Epoch # 200  Train:  0.4642054621453258  Val:  0.4791802791802792
Epoch # 250  Train:  0.3583024888867352  Val:  nan
Epoch # 300  Train:  0.3515889733492078  Val:  nan
Epoch # 350  Train:  0.6064952751290577  Val:  0.534324116831479
Epoch # 400  Train:  0.6226415094339623  Val:  0.5392842451665981
Epoch # 450  Train:  0.6643771761258777  Val:  0.558750395587504
Epoch # 500  Train:  0.6948022772517659  Val:  0.568267674042094
Epoch # 550  Train:  0.3627391753647064  Val:  nan
Epoch # 600  Train:  0.3560724989296418  Val:  nan
Epoch # 650  Train:  0.3605246976839517  Val:  nan
Epoch # 700  Train:  0.7835954376517205  Val:  0.5822873782508284
Epoch # 750  Train:  0.8079818769411944  Val:  0.5776475325488791
Epoch # 800  Train:  0.37152076195754996  Val:  nan
Epoch # 850  Train:  0.8387181116133863  Val:  0.5914697099212913
Epoch # 900  Tra

In [49]:
vanilla_perceptron_truthfulness_data["best_epoch"]


1000

In [50]:
vanilla_perceptron_truthfulness = VanillaPerceptron()
vanilla_perceptron_truthfulness.load(vanilla_perceptron_truthfulness_data)

y_dev_truthfulness_pred = vanilla_perceptron_truthfulness.predict(X_dev)

calculate_scores(y_all_truthfulness, vanilla_perceptron_truthfulness.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.79      0.88       480
           1       0.82      1.00      0.90       480

    accuracy                           0.89       960
   macro avg       0.91      0.89      0.89       960
weighted avg       0.91      0.89      0.89       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.32      0.48       160
           1       0.59      0.99      0.74       160

    accuracy                           0.65       320
   macro avg       0.78      0.65      0.61       320
weighted avg       0.78      0.65      0.61       320

---------------------------------------------------------


#### Averaging the Scores


In [51]:
sentiment_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
truthfulness_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [52]:
np.mean([sentiment_f1_score, truthfulness_f1_score])


0.779721227307537

#### Write Vanilla Models


In [53]:
vanilla_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": vanilla_perceptron_sentiment_data,
    "truthfulness_classifier": vanilla_perceptron_truthfulness_data,
}

store_model(VANILLA_MODEL_FILE_PATH, vanilla_model_file_data)


#### Load Vanilla Models


In [60]:
tf_idf_model_data, vanilla_perceptron_sentiment_data, vanilla_perceptron_truthfulness_data = load_model(
    # VANILLA_MODEL_FILE_PATH
    f"{MODEL_PATH}/py-vanillamodel.txt"
)


In [64]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

vanilla_perceptron_sentiment_saved = VanillaPerceptron()
vanilla_perceptron_sentiment_saved.load(vanilla_perceptron_sentiment_data)

vanilla_perceptron_truthfulness_saved = VanillaPerceptron()
vanilla_perceptron_truthfulness_saved.load(vanilla_perceptron_truthfulness_data)


##### Test Loaded Model


In [65]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


(320, 7675)

In [66]:
y_pred_sentiment = vanilla_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_sentiment, vanilla_perceptron_sentiment_saved.predict(X_all), title="Train")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       480
           1       0.50      0.99      0.66       480

    accuracy                           0.49       960
   macro avg       0.25      0.49      0.33       960
weighted avg       0.25      0.49      0.33       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.97      0.93      0.95       160
           1       0.93      0.97      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


In [67]:
y_pred_truthfulness = vanilla_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_truthfulness, vanilla_perceptron_truthfulness_saved.predict(X_all), title="Train")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       480
           1       0.50      1.00      0.67       480

    accuracy                           0.50       960
   macro avg       0.25      0.50      0.33       960
weighted avg       0.25      0.50      0.33       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.32      0.48       160
           1       0.59      0.99      0.74       160

    accuracy                           0.65       320
   macro avg       0.78      0.65      0.61       320
weighted avg       0.78      0.65      0.61       320

---------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Write Predictions


In [58]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [59]:
store_predictions(OUTPUT_FILE_PATH, output)


--------------------------------------------------------- **\*\*\*\***\*\***\*\*\*\***\*\*\***\*\*\*\***\*\***\*\*\*\*** -------------------------------------------------


## Averaged Perceptron


In [74]:
class AveragedPerceptron:
    def __init__(
        self,
        max_iterations: int = 1000,
        learning_rate: float = 1e-2,
        shuffle: bool = True,
        class_weights: dict = None,
        lr_scheduler_func: Optional[Callable[[float, int], float]] = None,
        score_func: Callable[[npt.NDArray, npt.NDArray], float] = calculate_f1_score,
        rng=np.random.default_rng(seed=RANDOM_SEED),
        debug: bool = False,
        debug_at: int = 50,
    ) -> None:
        self.type = TYPE_AVERAGED_PERCEPTRON
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate
        self.shuffle = shuffle
        self.class_weights = class_weights
        self.learning_rate_scheduler = lr_scheduler_func
        self.calculate_score = score_func
        self.rng = rng
        self.debug = debug
        self.debug_at = debug_at

        self.best_epoch = 0

    def fit(
        self,
        X_train: npt.NDArray,
        y_train: npt.NDArray,
        X_val: npt.NDArray = None,
        y_val: npt.NDArray = None,
    ):
        n_epoch = 0

        self.weights = None
        self.bias = None
        current_weights: npt.NDArray = rng.random(size=(X_train.shape[-1],))
        current_bias: float = 0.0
        self.cache = {"weights": rng.random(size=(X_train.shape[-1],)), "bias": 0.0}

        c = 1

        best_val_score = -1
        best_epoch = 0
        best_weights = current_weights
        best_bias = current_bias
        best_cache = self.cache

        for n_epoch in range(1, self.max_iterations + 1):

            if self.shuffle:
                idxs = np.arange(X_train.shape[0])
                self.rng.shuffle(idxs)

                X_train = X_train[idxs]
                y_train = y_train[idxs]

            for x, y_true in zip(X_train, y_train):

                a = np.dot(current_weights, x) + current_bias
                if y_true * a <= 0:
                    if self.class_weights is None:
                        current_weights = current_weights + y_true * x * self.learning_rate
                        self.cache["weights"] = self.cache["weights"] + y_true * c * x * self.learning_rate
                    else:
                        current_weights = current_weights + y_true * x * self.learning_rate * self.class_weights[y_true]
                        self.cache["weights"] = (
                            self.cache["weights"] + y_true * c * x * self.learning_rate * self.class_weights[y_true]
                        )

                    current_bias = current_bias + y_true
                    self.cache["bias"] = self.cache["bias"] + y_true * c

                c += 1

            self.weights = current_weights - (1 / c) * self.cache["weights"]
            self.bias = current_bias - (1 / c) * self.cache["bias"]

            if X_val is not None and y_val is not None:
                train_score = self.calculate_score(y_train, self.predict(X_train))
                val_score = self.calculate_score(y_val, self.predict(X_val))

                if val_score > best_val_score:
                    best_val_score = val_score

                    best_epoch = n_epoch
                    best_weights = self.weights
                    best_bias = self.bias
                    best_cache = self.cache

                if self.debug and (n_epoch == self.max_iterations or n_epoch % self.debug_at == 0):
                    print("Epoch #", n_epoch, " Train: ", train_score, " Val: ", val_score)

            # Update learning rate
            if self.learning_rate_scheduler:
                learning_rate = self.learning_rate_scheduler(learning_rate, n_epoch)

        # Set best epochs, weight, bias and cache
        self.best_epoch = best_epoch
        self.weights = best_weights
        self.bias = best_bias
        self.cache = best_cache

    def _activation(self, x: npt.NDArray):
        return np.dot(self.weights, x) + self.bias

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(self._activation(x))
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {
            "type": self.type,
            "max_iterations": self.best_epoch,
            "weights": self.weights.tolist(),
            "bias": float(self.bias),
            "best_epoch": self.best_epoch,
        }

    def load(self, model_data: Dict[str, Any]):
        self.type = model_data["type"]
        self.max_iterations = (model_data["max_iterations"],)
        self.weights = np.array(model_data["weights"])
        self.bias = model_data["bias"]
        self.best_epoch = model_data["best_epoch"]


### Sentiment Classification


In [76]:
averaged_perceptron_sentiment = AveragedPerceptron(
    max_iterations=2000,
    learning_rate=3,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

averaged_perceptron_sentiment.fit(X_all, y_all_sentiment, X_dev, y_dev_sentiment)

averaged_perceptron_sentiment_data = averaged_perceptron_sentiment.export()

Epoch # 50  Train:  0.7846532273837821  Val:  0.7901347430343019
Epoch # 100  Train:  0.8828758169934641  Val:  0.8430079673456574
Epoch # 150  Train:  0.9312070043777361  Val:  0.8718136608337974
Epoch # 200  Train:  0.9520700194498473  Val:  0.903101343101343
Epoch # 250  Train:  0.9687494574558586  Val:  0.906158357771261
Epoch # 300  Train:  0.97812497626408  Val:  0.912222265762765
Epoch # 350  Train:  0.9854160969829551  Val:  0.9184313725490196
Epoch # 400  Train:  0.9874995116996758  Val:  0.9311531841652323
Epoch # 450  Train:  0.9906241759529646  Val:  0.9249266862170088
Epoch # 500  Train:  0.9906241759529646  Val:  0.9249882794186592
Epoch # 550  Train:  0.9906241759529646  Val:  0.9312473143482167
Epoch # 600  Train:  0.9906241759529646  Val:  0.9375
Epoch # 650  Train:  0.9906241759529646  Val:  0.943747802648541
Epoch # 700  Train:  0.9906241759529646  Val:  0.9468703304001328
Epoch # 750  Train:  0.9906241759529646  Val:  0.9437412095639944
Epoch # 800  Train:  0.991666

In [77]:
averaged_perceptron_sentiment_data["best_epoch"]

699

In [78]:
y_dev_sentiment_pred = averaged_perceptron_sentiment.predict(X_dev)

calculate_scores(y_all_sentiment, averaged_perceptron_sentiment.predict(X_all), title="All")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ All ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95       160
           1       0.94      0.96      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


### Truthful Classification


In [81]:
averaged_perceptron_truthfulness = AveragedPerceptron(
    max_iterations=800,
    learning_rate=815e-2,
    shuffle=True,
    score_func=partial(calculate_f1_score, average="macro"),
    rng=rng,
    debug=True,
    debug_at=50,
)

averaged_perceptron_truthfulness.fit(X_all, y_all_truthfulness, X_dev, y_dev_truthfulness)
averaged_perceptron_truthfulness_data = averaged_perceptron_truthfulness.export()


Epoch # 50  Train:  0.803666862939873  Val:  0.6976562261889039
Epoch # 100  Train:  0.9393915644552424  Val:  0.7935483870967742
Epoch # 150  Train:  0.9697677524865046  Val:  0.8059771553747457
Epoch # 200  Train:  0.9760301967412792  Val:  0.817545515315953
Epoch # 250  Train:  0.9770712917808397  Val:  0.8037664187371418
Epoch # 300  Train:  0.9791576204950065  Val:  0.7727977279772797
Epoch # 350  Train:  0.9791576204950065  Val:  0.7684049696433597
Epoch # 400  Train:  0.9812434058848815  Val:  0.7649308176100629
Epoch # 450  Train:  0.9812434058848815  Val:  0.7608950280706006
Epoch # 500  Train:  0.9822861118558228  Val:  0.757377248051436
Epoch # 550  Train:  0.9822861118558228  Val:  0.7467429684646293
Epoch # 600  Train:  0.9822861118558228  Val:  0.7425006366182838
Epoch # 650  Train:  0.9822861118558228  Val:  0.7352697858259032
Epoch # 700  Train:  0.983328702417338  Val:  0.7316314842088039
Epoch # 750  Train:  0.983328702417338  Val:  0.7316314842088039
Epoch # 800  Tra

In [82]:
averaged_perceptron_truthfulness_data["best_epoch"]


160

In [83]:
y_dev_truthfulness_pred = averaged_perceptron_truthfulness.predict(X_dev)

calculate_scores(y_all_truthfulness, averaged_perceptron_truthfulness.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")

del averaged_perceptron_truthfulness


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       0.95      1.00      0.97       480
           1       1.00      0.95      0.97       480

    accuracy                           0.97       960
   macro avg       0.98      0.97      0.97       960
weighted avg       0.98      0.97      0.97       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.80      0.86      0.83       160
           1       0.85      0.79      0.82       160

    accuracy                           0.82       320
   macro avg       0.82      0.82      0.82       320
weighted avg       0.82      0.82      0.82       320

---------------------------------------------------------


#### Averaging the Scores


In [84]:
sentiment_f1_score = calculate_f1_score(y_dev_sentiment, y_dev_sentiment_pred, average="macro")
truthfulness_f1_score = calculate_f1_score(y_dev_truthfulness, y_dev_truthfulness_pred, average="macro")


In [85]:
np.mean([sentiment_f1_score, truthfulness_f1_score])


0.8842673008290811

#### Write Averaged Models


In [86]:
averaged_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": averaged_perceptron_sentiment_data,
    "truthfulness_classifier": averaged_perceptron_truthfulness_data,
}

store_model(AVERAGED_MODEL_FILE_PATH, averaged_model_file_data)


#### Test Loaded Model


In [87]:
tf_idf_model_data, averaged_perceptron_sentiment_data, averaged_perceptron_truthfulness_data = load_model(
    AVERAGED_MODEL_FILE_PATH
)


In [88]:
tf_idf_saved_model = TfIdf()
tf_idf_saved_model.load(tf_idf_model_data)

averaged_perceptron_sentiment_saved = AveragedPerceptron()
averaged_perceptron_sentiment_saved.load(averaged_perceptron_sentiment_data)

averaged_perceptron_truthfulness_saved = AveragedPerceptron()
averaged_perceptron_truthfulness_saved.load(averaged_perceptron_truthfulness_data)


In [89]:
X_dev_tf_idf_vectors_saved = tf_idf_saved_model.transform(dev_tokenized)
X_dev_tf_idf_vectors_saved.shape


(320, 7675)

In [90]:
y_pred_sentiment = averaged_perceptron_sentiment_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_sentiment, averaged_perceptron_sentiment_saved.predict(X_all), title="Whole")
calculate_scores(y_dev_sentiment, y_dev_sentiment_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       480
           1       0.98      1.00      0.99       480

    accuracy                           0.99       960
   macro avg       0.99      0.99      0.99       960
weighted avg       0.99      0.99      0.99       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.96      0.94      0.95       160
           1       0.94      0.96      0.95       160

    accuracy                           0.95       320
   macro avg       0.95      0.95      0.95       320
weighted avg       0.95      0.95      0.95       320

---------------------------------------------------------


In [91]:
y_pred_truthfulness = averaged_perceptron_truthfulness_saved.predict(X_dev_tf_idf_vectors_saved)

calculate_scores(y_all_truthfulness, averaged_perceptron_truthfulness_saved.predict(X_all), title="Whole")
calculate_scores(y_dev_truthfulness, y_dev_truthfulness_pred, title="Dev")


------------------------ Whole ------------------------
              precision    recall  f1-score   support

          -1       0.95      1.00      0.97       480
           1       1.00      0.95      0.97       480

    accuracy                           0.97       960
   macro avg       0.98      0.97      0.97       960
weighted avg       0.98      0.97      0.97       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

          -1       0.80      0.86      0.83       160
           1       0.85      0.79      0.82       160

    accuracy                           0.82       320
   macro avg       0.82      0.82      0.82       320
weighted avg       0.82      0.82      0.82       320

---------------------------------------------------------


#### Write Predictions


In [None]:
output = list()
for (id, truthfulness, sentiment) in zip(
    dev_raw_data[:, 0],
    np.where(y_pred_truthfulness == -1, DECEPTIVE, TRUTHFUL),
    np.where(y_pred_sentiment == -1, NEGATIVE, POSITIVE),
):
    output.append((id, truthfulness, sentiment))


In [None]:
store_predictions(OUTPUT_FILE_PATH, output)
