In [1]:
import re

import json
from typing import List, Dict, Tuple, Set, Any

import numpy as np
import numpy.typing as npt


# Define Constants


In [472]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"
OUTPUT_PATH = "./output"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"
OUTPUT_FILENAME = "output.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

SENTIMENT_CLASS_DICT = {1: "Pos", -1: "Neg"}

TRUTHFULNESS_CLASS_DICT = {1: "True", -1: "Fake"}

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

OUTPUT_FILE_PATH = f"{OUTPUT_PATH}/{OUTPUT_FILENAME}"

DEV_DATA_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_KEY_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_COL = 3
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1
TEST_SIZE = 0.2


In [3]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [4]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [5]:
input_regex = re.compile("(\w*) (\w*) (\w*) (.*)\n?")


In [6]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [7]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [8]:
np.array(data)[0, 1]


'Fake'

In [186]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = "(\w*) (\w*) (\w*) (.*)\n?"
    if type == "DEV":
        regex = "(\w*) (.*)\n?"
    elif type == "KEY":
        regex = "(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [10]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [11]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [12]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)

    return (model_data["tf_idf_model"], model_data["sentiment_classifier"], model_data["truthfulness_classifier"])


In [13]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [87]:
def calculate_accuracy_score(y_true: npt.NDArray, y_pred: npt.NDArray):
    return (y_true == y_pred).sum() / y_true.shape[0]

In [52]:
def calculate_f1_score(y_true: npt.NDArray, y_pred: npt.NDArray, type: str = "macro"):
    def calculate_f1(y_true, y_pred, label):
        tp = np.sum((y_true == label) & (y_pred == label))
        fp = np.sum((y_true != label) & (y_pred == label))
        fn = np.sum((y_pred != label) & (y_true == label))

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    def macro_f1(y_true, y_pred):
        return np.mean([calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)])

    def micro_f1(y_true, y_pred):
        return {label: calculate_f1(y_true, y_pred, label) for label in np.unique(y_true)}

    if type == "macro":
        return macro_f1(y_true, y_pred)
    elif type == "micro":
        return micro_f1(y_true, y_pred)
    else:
        return {"micro": micro_f1(y_true, y_pred), "macro": macro_f1(y_true, y_pred)}


In [15]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("---------------------------------------------------------")


# Load Data


In [16]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")


# Data Cleaning


In [17]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [18]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [19]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [20]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [21]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [22]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [23]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [24]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [25]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    from contractions import fix

    def contraction_fixer(txt: str):
        return " ".join([fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [26]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [27]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


# Data Preprocessing


Not Applicable since everything has to be implemented from scratch.


# Feature Extraction


In [194]:
class TfIdf:
    def __init__(self) -> None:
        self.n_docs: int = None
        self.vocab: List = list()
        self.vocab_size: int = None
        self.vocab_index: Dict[str, int] = dict()
        self.word_document_count: Dict[str, int] = dict()

    def __create_vocab__(self, documents: npt.NDArray) -> Set:
        vocab = set()

        for document in documents:
            for word in document:
                vocab.add(word)

        return list(vocab)

    def __get_word_document_count__(self, documents: npt.NDArray):
        word_document_count = dict()

        for document in documents:
            for word in document:
                if word in self.vocab:
                    if word not in word_document_count:
                        word_document_count[word] = 1
                    else:
                        word_document_count[word] += 1

        return word_document_count

    def __term_frequency__(self, word: str, document: npt.NDArray):
        word_occurences = (document == word).sum()
        return word_occurences / self.n_docs

    def __inverse_document_frequency__(self, word: str):
        word_occurrences = 1

        if word in self.word_document_count:
            word_occurrences += self.word_document_count[word]

        return np.log(self.n_docs / word_occurrences)

    def __tf_idf__(self, document: npt.NDArray):
        tf_idf_vector = np.zeros(shape=(self.vocab_size,))
        for word in document:
            # ignore word not in vocab
            if word in self.vocab:
                tf = self.__term_frequency__(word, document)
                idf = self.__inverse_document_frequency__(word)

                tf_idf_vector[self.vocab_index[word]] = tf * idf
        return tf_idf_vector

    def fit(self, documents: npt.NDArray):
        self.n_docs = documents.shape[0]
        self.vocab = self.__create_vocab__(documents)
        self.vocab_size = len(self.vocab)
        self.vocab_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.word_document_count = self.__get_word_document_count__(documents)

    def transform(self, documents: npt.NDArray):
        tf_idf_vectors = list()
        for document in documents:
            tf_idf_vectors.append(self.__tf_idf__(document))
        return np.array(tf_idf_vectors)

    def export(self):
        return {
            "n_docs": self.n_docs,
            "vocab_size": self.vocab_size,
            "vocab": self.vocab,
            "vocab_size": self.vocab_size,
            "vocab_index": self.vocab_index,
            "word_document_count": self.word_document_count,
        }

    def load(self, tf_idf_model_data):
        self.n_docs = tf_idf_model_data["n_docs"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab = tf_idf_model_data["vocab"]
        self.vocab_size = tf_idf_model_data["vocab_size"]
        self.vocab_index = tf_idf_model_data["vocab_index"]
        self.word_document_count = tf_idf_model_data["word_document_count"]


In [29]:
final_data = load_data(CLEANED_DATA_FILE_PATH)


In [30]:
def tokenize(data: npt.NDArray):
    tokenized_documents = list()
    for document in data:
        tokenized_documents.append(np.array(document.split()))
    return np.array(tokenized_documents, dtype=object)


In [31]:
tokenized_documents = tokenize(final_data[:, DATA_COL])
tokenized_documents[0]


array(['if', 'you', 'are', 'looking', 'for', 'an', 'elegant', 'hotel',
       'in', 'downtown', 'chicago', 'you', 'have', 'to', 'stay', 'here',
       'the', 'ambassador', 'east', 'hotel', 'has', 'very', 'comfortable',
       'and', 'beautiful', 'large', 'rooms', 'and', 'is', 'like', 'a',
       'home', 'away', 'from', 'home', 'the', 'perfect', 'place', 'for',
       'a', 'business', 'person', 'and', 'if', 'you', 'have', 'a',
       'small', 'pet', 'you', 'can', 'bring', 'them', 'too', 'i', 'would',
       'give', 'this', 'place', 'four', 'stars', 'and', 'would',
       'definitely', 'stay', 'here', 'again'], dtype='<U11')

In [196]:
# TODO: Invoke TF-IDF
tf_idf_model = TfIdf()
tf_idf_model.fit(tokenized_documents)


In [197]:
tf_idf_vectors = tf_idf_model.transform(tokenized_documents)
tf_idf_vectors.shape


(960, 7675)

In [198]:
tf_idf_model_data = tf_idf_model.export()


In [117]:
sentiment_labels = np.where(final_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
truthfulness_labels = np.where(final_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)


# Split Data


In [118]:
def train_test_split(
    data: npt.NDArray, sentiment_labels: npt.NDArray, truthfulness_labels: npt.NDArray, test_size: float = 0.2
):
    n_max = data.shape[0]
    sample = int((1 - test_size) * n_max)

    # Shuffle the data
    all_idx = np.random.permutation(n_max)
    train_idx, test_idx = all_idx[:sample], all_idx[sample:]

    X_train, X_test, y_train_sentiment, y_train_truthfulness, y_test_sentiment, y_test_truthfulness = (
        data[train_idx],
        data[test_idx],
        sentiment_labels[train_idx],
        truthfulness_labels[train_idx],
        sentiment_labels[test_idx],
        truthfulness_labels[test_idx],
    )

    return X_train, X_test, y_train_sentiment, y_train_truthfulness, y_test_sentiment, y_test_truthfulness


In [119]:
X_train, X_test, y_train_sentiment, y_train_truthfulness, y_test_sentiment, y_test_truthfulness = train_test_split(
    tf_idf_vectors, sentiment_labels, truthfulness_labels, TEST_SIZE
)


# Perceptron Models


## Vanilla Perceptron


In [120]:
from sklearn.metrics import accuracy_score

In [261]:
class VanillaPerceptron:
    def __init__(self) -> None:
        pass

    def fit(
        self,
        X: npt.NDArray,
        y: npt.NDArray,
        max_iterations: int,
        learning_rate: float = 1e-2,
        tolerance: float = 1e-2,
        shuffle: bool = True,
        class_weights: dict = None,
    ):
        n_epoch = 0

        self.weights: npt.NDArray = np.zeros(shape=(X.shape[-1],))
        self.bias: float = 0

        for epoch in range(max_iterations):
            n_epoch = epoch

            if shuffle:
                idxs = np.random.permutation(X.shape[0])
                X = X[idxs]
                y = y[idxs]

            for x, y_true in zip(X, y):

                a = np.dot(self.weights, x) + self.bias
                if y_true * a <= 0:
                    if class_weights is None:
                        self.weights = self.weights + y_true * x
                    else:
                        self.weights = self.weights + y_true * x * class_weights[y_true]
                    self.bias = self.bias + y_true

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(np.dot(self.weights, x) + self.bias)
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"weights": self.weights.tolist(), "bias": self.bias}

    def load(self, perceptron_data: Dict[str, Any]):
        self.weights = np.array(perceptron_data["weights"])
        self.bias = perceptron_data["bias"]


## Sentiment Classification


#### Learn


In [179]:
from sklearn.metrics import f1_score


In [180]:
epochs = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

for epoch in epochs:
    vanilla_perceptron_sentiment = VanillaPerceptron()
    vanilla_perceptron_sentiment.fit(
        X_train, y_train_sentiment, max_iterations=epoch, learning_rate=1e-2, tolerance=1e-8, shuffle=True
    )

    y_pred_train_sentiment = vanilla_perceptron_sentiment.predict(X_train)
    y_pred_sentiment = vanilla_perceptron_sentiment.predict(X_test)
    
    print("Epoch #", epoch, " Train: ", f1_score(y_train_sentiment, y_pred_train_sentiment), " Test: ", f1_score(y_test_sentiment, y_pred_sentiment))

    del vanilla_perceptron_sentiment

Epoch # 100  Train:  0.7195467422096318  Test:  0.7279411764705882
Epoch # 150  Train:  0.6455026455026455  Test:  0.557142857142857
Epoch # 200  Train:  0.7094972067039106  Test:  0.7148014440433214
Epoch # 250  Train:  0.7175141242937852  Test:  0.7252747252747253
Epoch # 300  Train:  0.8751835535976504  Test:  0.7261146496815287
Epoch # 350  Train:  0.936111111111111  Test:  0.7341772151898734
Epoch # 400  Train:  0.9578231292517007  Test:  0.7804878048780488
Epoch # 450  Train:  0.962059620596206  Test:  0.7654320987654321
Epoch # 500  Train:  0.971736204576043  Test:  0.7577639751552795
Epoch # 550  Train:  0.9758713136729223  Test:  0.7654320987654321
Epoch # 600  Train:  0.9359331476323121  Test:  0.7577639751552795
Epoch # 650  Train:  0.9853917662682602  Test:  0.7499999999999999
Epoch # 700  Train:  0.9894179894179893  Test:  0.7499999999999999
Epoch # 750  Train:  0.9894179894179893  Test:  0.7499999999999999
Epoch # 800  Train:  0.9907529722589168  Test:  0.7499999999999999

In [187]:
dev_raw_data = load_data(DEV_DATA_FILE_PATH, type="DEV")
dev_key_data = load_data(DEV_KEY_FILE_PATH, type="KEY")

In [188]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    1: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

dev_cleaned_data = dev_raw_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = dev_cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    dev_cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [199]:
tokenized_dev_documents = tokenize(dev_cleaned_data[:,1])

dev_tf_idf_vectors = tf_idf_model.transform(tokenized_dev_documents)

In [357]:
vanilla_perceptron_sentiment = VanillaPerceptron()
vanilla_perceptron_sentiment.fit(
    tf_idf_vectors, sentiment_labels, max_iterations=900, learning_rate=1e-2, tolerance=1e-8, shuffle=True
)

vanilla_perceptron_sentiment_data = vanilla_perceptron_sentiment.export()

In [358]:
dev_sentiment_pred = vanilla_perceptron_sentiment.predict(dev_tf_idf_vectors)

In [359]:
calculate_scores(final_data[:, SENTIMENT_TARGET_COL], np.where(vanilla_perceptron_sentiment.predict(tf_idf_vectors) == -1, "Neg", "Pos"), title="Train")
calculate_scores(dev_key_data[:, 2], np.where(dev_sentiment_pred == -1, "Neg", "Pos"), title="Dev")

------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       1.00      0.99      1.00       480
         Pos       0.99      1.00      1.00       480

    accuracy                           1.00       960
   macro avg       1.00      1.00      1.00       960
weighted avg       1.00      1.00      1.00       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

         Neg       0.78      0.99      0.87       160
         Pos       0.99      0.72      0.83       160

    accuracy                           0.86       320
   macro avg       0.89      0.86      0.85       320
weighted avg       0.89      0.86      0.85       320

---------------------------------------------------------


In [361]:
neg_f1_score = f1_score(np.where(dev_key_data[:, 2] == "Neg", -1, 1), dev_sentiment_pred, pos_label=-1)
pos_f1_score = f1_score(np.where(dev_key_data[:, 2] == "Neg", -1, 1), dev_sentiment_pred, pos_label=1)

In [225]:
epochs = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

for epoch in epochs:
    vanilla_perceptron_truthfulness = VanillaPerceptron()
    vanilla_perceptron_truthfulness.fit(
        X_train, y_train_sentiment, max_iterations=epoch, learning_rate=1e-2, tolerance=1e-8, shuffle=True
    )

    y_pred_train_truthfulness = vanilla_perceptron_truthfulness.predict(X_train)
    y_pred_truthfulness = vanilla_perceptron_truthfulness.predict(X_test)
    
    print("Epoch #", epoch, " Train: ", f1_score(y_train_sentiment, y_pred_train_truthfulness), " Test: ", f1_score(y_test_sentiment, y_pred_truthfulness))

    del vanilla_perceptron_truthfulness

Epoch # 100  Train:  0.7277936962750716  Test:  0.7279411764705882
Epoch # 150  Train:  0.6234234234234234  Test:  0.5255474452554744
Epoch # 200  Train:  0.7068645640074211  Test:  0.7148014440433214
Epoch # 250  Train:  0.8651851851851852  Test:  0.7051282051282051
Epoch # 300  Train:  0.9211267605633803  Test:  0.7577639751552795
Epoch # 350  Train:  0.7270992366412213  Test:  0.7252747252747253
Epoch # 400  Train:  0.9119318181818181  Test:  0.7499999999999999
Epoch # 450  Train:  0.9432918395573997  Test:  0.7654320987654321
Epoch # 500  Train:  0.7426900584795322  Test:  0.7279411764705882
Epoch # 550  Train:  0.9813333333333333  Test:  0.7577639751552795
Epoch # 600  Train:  0.9853917662682602  Test:  0.7577639751552795
Epoch # 650  Train:  0.9867374005305041  Test:  0.7499999999999999
Epoch # 700  Train:  0.9813333333333333  Test:  0.7499999999999999
Epoch # 750  Train:  0.9534246575342467  Test:  0.7499999999999999
Epoch # 800  Train:  0.992084432717678  Test:  0.7577639751552

In [381]:
vanilla_perceptron_truthfulness = VanillaPerceptron()
vanilla_perceptron_truthfulness.fit(
    tf_idf_vectors, truthfulness_labels, max_iterations=800, learning_rate=1e-2, tolerance=1e-8, shuffle=True, class_weights={
        -1: 1.0275, 1: 1.0
    }
)


In [382]:
dev_truthfulness_pred = vanilla_perceptron_truthfulness.predict(dev_tf_idf_vectors)

In [383]:
calculate_scores(final_data[:, TRUTHFULNESS_TARGET_COL], np.where(vanilla_perceptron_truthfulness.predict(tf_idf_vectors) == -1, "Fake", "True"), title="Train")
calculate_scores(dev_key_data[:, 1], np.where(dev_truthfulness_pred == -1, "Fake", "True"), title="Dev")

------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00       480
        True       1.00      1.00      1.00       480

    accuracy                           1.00       960
   macro avg       1.00      1.00      1.00       960
weighted avg       1.00      1.00      1.00       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

        Fake       0.98      0.29      0.45       160
        True       0.58      0.99      0.74       160

    accuracy                           0.64       320
   macro avg       0.78      0.64      0.59       320
weighted avg       0.78      0.64      0.59       320

---------------------------------------------------------


In [384]:
fake_f1_score = f1_score(np.where(dev_key_data[:, 1] == "Fake", -1, 1), dev_truthfulness_pred, pos_label=-1)
truth_f1_score = f1_score(np.where(dev_key_data[:, 1] == "Fake", -1, 1), dev_truthfulness_pred, pos_label=1)

In [385]:
neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score

(0.8736263736263736,
 0.8333333333333334,
 0.4519230769230769,
 0.7361111111111112)

In [386]:
np.mean([neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score])

0.7237484737484738

#### Classify


In [170]:
y_pred_sentiment = vanilla_perceptron_truthfulness.predict(X_test)


In [171]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(vanilla_perceptron_truthfulness.predict(X_train) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       1.00      0.99      1.00       387
         Pos       0.99      1.00      1.00       381

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.76      0.99      0.86        93
         Pos       0.99      0.71      0.82        99

    accuracy                           0.84       192
   macro avg       0.87      0.85      0.84       192
weighted avg       0.88      0.84      0.84       192

---------------------------------------------------------


## Truthfulness Classification


#### Learn


In [303]:
vanilla_perceptron_truthfulness = VanillaPerceptron()
vanilla_perceptron_truthfulness.fit(
    X_train, y_train_truthfulness, max_iterations=1000, learning_rate=1e-12, shuffle=True
)

vanilla_perceptron_truthfulness_data = vanilla_perceptron_truthfulness.export()


Epoch: 16


#### Classify


In [304]:
y_pred_truthfulness = vanilla_perceptron_truthfulness.predict(X_test)


In [305]:
calculate_scores(
    np.where(y_train_truthfulness == 0, "Fake", "True"),
    np.where(vanilla_perceptron_truthfulness.predict(X_train) == 0, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == 0, "Fake", "True"),
    np.where(y_pred_truthfulness == 0, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00       383
        True       1.00      1.00      1.00       385

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.98      0.43      0.60        97
        True       0.63      0.99      0.77        95

    accuracy                           0.71       192
   macro avg       0.80      0.71      0.69       192
weighted avg       0.81      0.71      0.68       192

---------------------------------------------------------


#### Write Vanilla Models


In [306]:
vanilla_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": vanilla_perceptron_sentiment_data,
    "truthfulness_classifier": vanilla_perceptron_truthfulness_data,
}

store_model(VANILLA_MODEL_FILE_PATH, vanilla_model_file_data)


#### Load Vanilla Models


In [65]:
tf_idf_model_data, vanilla_perceptron_sentiment_data, vanilla_perceptron_truthfulness_data = load_model(
    VANILLA_MODEL_FILE_PATH
)


In [66]:
tf_idf_model = TfIdf()
tf_idf_model.load(tf_idf_model_data)

vanilla_perceptron_sentiment = VanillaPerceptron()
vanilla_perceptron_sentiment.load(vanilla_perceptron_sentiment_data)

vanilla_perceptron_truthfulness = VanillaPerceptron()
vanilla_perceptron_truthfulness.load(vanilla_perceptron_truthfulness_data)


##### Test Loaded Model


In [67]:
tf_idf_vectors = tf_idf_model.transform(tokenized_documents)
tf_idf_vectors.shape


(960, 7655)

In [68]:
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    tf_idf_vectors, sentiment_labels, TEST_SIZE
)
X_train_truthfulness, X_test_truthfulness, y_train_truthfulness, y_test_truthfulness = train_test_split(
    tf_idf_vectors, truthfulness_labels, TEST_SIZE
)


In [69]:
y_pred_sentiment = vanilla_perceptron_sentiment.predict(X_test_sentiment)


In [70]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(vanilla_perceptron_sentiment.predict(X_train_sentiment) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       0.72      1.00      0.84       381
         Pos       1.00      0.62      0.76       387

    accuracy                           0.81       768
   macro avg       0.86      0.81      0.80       768
weighted avg       0.86      0.81      0.80       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.74      0.99      0.85        99
         Pos       0.98      0.63      0.77        93

    accuracy                           0.82       192
   macro avg       0.86      0.81      0.81       192
weighted avg       0.86      0.82      0.81       192

---------------------------------------------------------


In [73]:
y_pred_truthfulness = vanilla_perceptron_truthfulness.predict(X_test_truthfulness)


In [74]:
calculate_scores(
    np.where(y_train_truthfulness == -1, "Fake", "True"),
    np.where(vanilla_perceptron_truthfulness.predict(X_train_truthfulness) == -1, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == -1, "Fake", "True"),
    np.where(y_pred_truthfulness == -1, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       0.99      0.95      0.97       385
        True       0.95      0.99      0.97       383

    accuracy                           0.97       768
   macro avg       0.97      0.97      0.97       768
weighted avg       0.97      0.97      0.97       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.99      0.89      0.94        95
        True       0.91      0.99      0.95        97

    accuracy                           0.94       192
   macro avg       0.95      0.94      0.94       192
weighted avg       0.95      0.94      0.94       192

---------------------------------------------------------


--------------------------------------------------------- ********\*\*********\*\*\*********\*\********* -------------------------------------------------


## Averaged Perceptron


In [448]:
class AveragedPerceptron:
    def __init__(self) -> None:
        pass

    # TODO: Can implement this ... tolerance: float, early_stopping: bool = True
    def fit(
        self,
        X: npt.NDArray,
        y: npt.NDArray,
        max_iterations: int,
        learning_rate: float = 1e-2,
        tolerance: float = 1e-2,
        shuffle: bool = True,
        class_weights: dict = None,
    ):
        misclassified = 0
        n_epoch = 0

        self.weights = np.zeros(shape=(X.shape[-1],))
        self.bias = 0.0
        self.cache = {"weights": np.zeros(shape=(X.shape[-1],)), "bias": 0.0}

        c = 1
        for epoch in range(max_iterations):
            n_epoch = epoch

            if shuffle:
                idxs = np.random.permutation(X.shape[0])
                X = X[idxs]
                y = y[idxs]

            for x, y_true in zip(X, y):

                a = np.dot(self.weights, x) + self.bias
                if y_true * a <= 0:
                    if class_weights is None:
                        self.weights = self.weights + y_true * x
                    else:
                        self.weights = self.weights + y_true * x * class_weights[y_true]

                    self.bias = self.bias + y_true

                    self.cache["weights"] = self.cache["weights"] + y_true * c * x
                    self.cache["bias"] = self.cache["bias"] + y_true * c

                c += 1

        self.weights = self.weights - ((1 / c) * self.cache["weights"])
        self.bias = self.bias - ((1 / c) * self.cache["bias"])

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            pred = np.sign(np.dot(self.weights, x) + self.bias)
            predictions.append(pred)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"weights": self.weights.tolist(), "biases": self.bias}

    def load(self, perceptron_data: Dict[str, Any]):
        self.weights = perceptron_data["weights"]
        self.bias = perceptron_data["bias"]


### Sentiment Classification


In [456]:
epochs = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

for epoch in epochs:
    averaged_perceptron_sentiment = AveragedPerceptron()
    averaged_perceptron_sentiment.fit(
        X_train, y_train_sentiment, max_iterations=epoch, learning_rate=1e-2, tolerance=1e-8, shuffle=True
    )

    y_pred_train_sentiment = averaged_perceptron_sentiment.predict(X_train)
    y_pred_sentiment = averaged_perceptron_sentiment.predict(X_test)
    
    print("Epoch #", epoch, " Train: ", f1_score(y_train_sentiment, y_pred_train_sentiment), " Test: ", f1_score(y_test_sentiment, y_pred_sentiment))

    del averaged_perceptron_sentiment

Epoch # 100  Train:  0.7685290763968073  Test:  0.7381974248927038
Epoch # 150  Train:  0.8041237113402061  Test:  0.7631578947368421
Epoch # 200  Train:  0.8370457209847597  Test:  0.7892376681614349
Epoch # 250  Train:  0.8619447779111644  Test:  0.8054298642533937
Epoch # 300  Train:  0.8812729498164015  Test:  0.8240740740740741
Epoch # 350  Train:  0.9007444168734491  Test:  0.827906976744186
Epoch # 400  Train:  0.9166666666666666  Test:  0.8380952380952381
Epoch # 450  Train:  0.9273885350318471  Test:  0.8421052631578948
Epoch # 500  Train:  0.9346991037131882  Test:  0.8557692307692307
Epoch # 550  Train:  0.9435897435897436  Test:  0.859903381642512
Epoch # 600  Train:  0.9472329472329473  Test:  0.8627450980392156
Epoch # 650  Train:  0.9521345407503234  Test:  0.8613861386138614
Epoch # 700  Train:  0.9547218628719275  Test:  0.8542713567839195
Epoch # 750  Train:  0.9571984435797665  Test:  0.8615384615384615
Epoch # 800  Train:  0.9622886866059819  Test:  0.86597938144329

In [457]:
averaged_perceptron_sentiment = AveragedPerceptron()
averaged_perceptron_sentiment.fit(
    tf_idf_vectors, sentiment_labels, max_iterations=800, learning_rate=1e-2, tolerance=1e-8, shuffle=True
)

# averaged_perceptron_sentiment_data = averaged_perceptron_sentiment.export()


In [458]:
dev_sentiment_pred = averaged_perceptron_sentiment.predict(dev_tf_idf_vectors)

In [459]:
calculate_scores(final_data[:, SENTIMENT_TARGET_COL], np.where(averaged_perceptron_sentiment.predict(tf_idf_vectors) == -1, "Neg", "Pos"), title="Train")
calculate_scores(dev_key_data[:, 2], np.where(dev_sentiment_pred == -1, "Neg", "Pos"), title="Dev")

------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       0.97      0.97      0.97       480
         Pos       0.97      0.97      0.97       480

    accuracy                           0.97       960
   macro avg       0.97      0.97      0.97       960
weighted avg       0.97      0.97      0.97       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

         Neg       0.88      0.95      0.92       160
         Pos       0.95      0.88      0.91       160

    accuracy                           0.91       320
   macro avg       0.91      0.91      0.91       320
weighted avg       0.91      0.91      0.91       320

---------------------------------------------------------


In [460]:
neg_f1_score = f1_score(np.where(dev_key_data[:, 2] == "Neg", -1, 1), dev_sentiment_pred, pos_label=-1)
pos_f1_score = f1_score(np.where(dev_key_data[:, 2] == "Neg", -1, 1), dev_sentiment_pred, pos_label=1)

In [462]:
epochs = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

for epoch in epochs:
    averaged_perceptron_truthfulness = AveragedPerceptron()
    averaged_perceptron_truthfulness.fit(
        X_train, y_train_truthfulness, max_iterations=epoch, learning_rate=1e-2, tolerance=1e-8, shuffle=True
    )

    y_pred_train_truthfulness = averaged_perceptron_truthfulness.predict(X_train)
    y_pred_truthfulness = averaged_perceptron_truthfulness.predict(X_test)
    
    print("Epoch #", epoch, " Train: ", f1_score(y_train_truthfulness, y_pred_train_truthfulness), " Test: ", f1_score(y_test_truthfulness, y_pred_truthfulness))

    del averaged_perceptron_truthfulness


Epoch # 100  Train:  0.35887096774193544  Test:  0.17391304347826084
Epoch # 150  Train:  0.4362934362934363  Test:  0.21052631578947367
Epoch # 200  Train:  0.5046382189239331  Test:  0.22916666666666663
Epoch # 250  Train:  0.5898778359511343  Test:  0.28
Epoch # 300  Train:  0.6338983050847457  Test:  0.3364485981308411
Epoch # 350  Train:  0.7266982622432858  Test:  0.4347826086956521
Epoch # 400  Train:  0.7687595712098009  Test:  0.46153846153846156
Epoch # 450  Train:  0.7909774436090227  Test:  0.5396825396825395
Epoch # 500  Train:  0.8228404099560761  Test:  0.5625
Epoch # 550  Train:  0.8579545454545454  Test:  0.5925925925925926
Epoch # 600  Train:  0.8739495798319328  Test:  0.5985401459854014
Epoch # 650  Train:  0.897119341563786  Test:  0.647887323943662
Epoch # 700  Train:  0.9105691056910569  Test:  0.6887417218543046
Epoch # 750  Train:  0.9249329758713137  Test:  0.6883116883116883
Epoch # 800  Train:  0.929427430093209  Test:  0.6962025316455697
Epoch # 850  Train:

In [463]:
averaged_perceptron_truthfulness = AveragedPerceptron()
averaged_perceptron_truthfulness.fit(
    tf_idf_vectors, truthfulness_labels, max_iterations=1000, learning_rate=1e-2, tolerance=1e-8, shuffle=True, class_weights={
        -1: 1.0275, 1: 1.0
    }
)


In [464]:
dev_truthfulness_pred = averaged_perceptron_truthfulness.predict(dev_tf_idf_vectors)

In [465]:
calculate_scores(final_data[:, TRUTHFULNESS_TARGET_COL], np.where(averaged_perceptron_truthfulness.predict(tf_idf_vectors) == -1, "Fake", "True"), title="Train")
calculate_scores(dev_key_data[:, 1], np.where(dev_truthfulness_pred == -1, "Fake", "True"), title="Dev")


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       0.61      1.00      0.76       480
        True       1.00      0.37      0.54       480

    accuracy                           0.68       960
   macro avg       0.81      0.68      0.65       960
weighted avg       0.81      0.68      0.65       960

---------------------------------------------------------
------------------------ Dev ------------------------
              precision    recall  f1-score   support

        Fake       0.54      1.00      0.70       160
        True       1.00      0.14      0.24       160

    accuracy                           0.57       320
   macro avg       0.77      0.57      0.47       320
weighted avg       0.77      0.57      0.47       320

---------------------------------------------------------


In [None]:
fake_f1_score = f1_score(np.where(dev_key_data[:, 1] == "Fake", -1, 1), dev_truthfulness_pred, pos_label=-1)
truth_f1_score = f1_score(np.where(dev_key_data[:, 1] == "Fake", -1, 1), dev_truthfulness_pred, pos_label=1)

In [None]:
np.mean([neg_f1_score, pos_f1_score, fake_f1_score, truth_f1_score])

In [471]:
output = list()
for (id, truthfulness, sentiment) in zip(dev_raw_data[:,0], np.where(dev_truthfulness_pred == -1, "Fake", "True"), np.where(dev_sentiment_pred == -1, "Neg", "Pos")):
    output.append((id, truthfulness, sentiment))


In [473]:
store_predictions(OUTPUT_FILE_PATH, output)

#### Learn


In [396]:
averaged_perceptron_sentiment = AveragedPerceptron()
averaged_perceptron_sentiment.fit(X_train, y_train_sentiment, max_iterations=epoch, learning_rate=1e-2, tolerance=1e-8, shuffle=True)

averaged_perceptron_sentiment_data = averaged_perceptron_sentiment.export()


#### Classify


In [397]:
y_pred_sentiment = averaged_perceptron_sentiment.predict(X_test)


In [398]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(averaged_perceptron_sentiment.predict(X_train) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       0.85      0.55      0.66       387
         Pos       0.66      0.90      0.76       381

    accuracy                           0.72       768
   macro avg       0.76      0.72      0.71       768
weighted avg       0.76      0.72      0.71       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.78      0.45      0.57        93
         Pos       0.63      0.88      0.73        99

    accuracy                           0.67       192
   macro avg       0.70      0.67      0.65       192
weighted avg       0.70      0.67      0.66       192

---------------------------------------------------------


### Truthfulness Classification


#### Learn


In [98]:
averaged_perceptron_truthfulness = AveragedPerceptron()
averaged_perceptron_truthfulness.fit(X_train_truthfulness, y_train_truthfulness, max_iterations=100)

averaged_perceptron_truthfulness_data = averaged_perceptron_truthfulness.export()


Epoch #: 1
Epoch #: 2
Epoch #: 3
Epoch #: 4
Epoch #: 5
Epoch #: 6
Epoch #: 7
Epoch #: 8
Epoch #: 9
Epoch #: 10
Epoch #: 11
Epoch #: 12
Epoch #: 13
Epoch #: 14
Epoch #: 15
Epoch #: 16
Epoch #: 17
Epoch #: 18
Epoch #: 19


#### Classify


In [99]:
y_pred_truthfulness = averaged_perceptron_truthfulness.predict(X_test_truthfulness)


In [100]:
calculate_scores(
    np.where(y_train_truthfulness == -1, "Fake", "True"),
    np.where(averaged_perceptron_truthfulness.predict(X_train_truthfulness) == -1, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == -1, "Fake", "True"),
    np.where(y_pred_truthfulness == -1, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       0.50      1.00      0.67       385
        True       0.00      0.00      0.00       383

    accuracy                           0.50       768
   macro avg       0.25      0.50      0.33       768
weighted avg       0.25      0.50      0.33       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.49      1.00      0.66        95
        True       0.00      0.00      0.00        97

    accuracy                           0.49       192
   macro avg       0.25      0.50      0.33       192
weighted avg       0.24      0.49      0.33       192

---------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Write Averaged Models


In [101]:
averaged_model_file_data = {
    "tf_idf_model": tf_idf_model_data,
    "sentiment_classifier": averaged_perceptron_sentiment_data,
    "truthfulness_classifier": averaged_perceptron_truthfulness_data,
}

store_model(AVERAGED_MODEL_FILE_PATH, averaged_model_file_data)


#### Load Averaged Models


tf_idf_model, averaged_perceptron_sentiment, averaged_perceptron_truthfulness = load_model(AVERAGED_MODEL_FILE_PATH)


In [102]:
tf_idf_vectors = tf_idf_model.transform(tokenized_documents)
tf_idf_vectors.shape


(960, 7655)

##### Test Loaded Models


In [103]:
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    tf_idf_vectors, sentiment_labels, TEST_SIZE
)
X_train_truthfulness, X_test_truthfulness, y_train_truthfulness, y_test_truthfulness = train_test_split(
    tf_idf_vectors, truthfulness_labels, TEST_SIZE
)


In [104]:
y_pred_sentiment = averaged_perceptron_sentiment.predict(X_test_sentiment)


In [105]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(averaged_perceptron_sentiment.predict(X_train_sentiment) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       0.95      0.99      0.97       380
         Pos       0.99      0.95      0.97       388

    accuracy                           0.97       768
   macro avg       0.97      0.97      0.97       768
weighted avg       0.97      0.97      0.97       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.98      0.99      0.99       100
         Pos       0.99      0.98      0.98        92

    accuracy                           0.98       192
   macro avg       0.98      0.98      0.98       192
weighted avg       0.98      0.98      0.98       192

---------------------------------------------------------


In [106]:
y_pred_truthfulness = averaged_perceptron_truthfulness.predict(X_test_truthfulness)


In [107]:
calculate_scores(
    np.where(y_train_truthfulness == -1, "Fake", "True"),
    np.where(averaged_perceptron_truthfulness.predict(X_train_truthfulness) == -1, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == -1, "Fake", "True"),
    np.where(y_pred_truthfulness == -1, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       0.51      1.00      0.68       392
        True       0.00      0.00      0.00       376

    accuracy                           0.51       768
   macro avg       0.26      0.50      0.34       768
weighted avg       0.26      0.51      0.34       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.46      1.00      0.63        88
        True       0.00      0.00      0.00       104

    accuracy                           0.46       192
   macro avg       0.23      0.50      0.31       192
weighted avg       0.21      0.46      0.29       192

---------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
