In [44]:
import re

import json
from typing import List, Dict, Tuple, Set, Any

import numpy as np
import numpy.typing as npt


# Define Constants


In [140]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

SENTIMENT_CLASS_DICT = {1: "Pos", -1: "Neg"}

TRUTHFULNESS_CLASS_DICT = {1: "True", -1: "Fake"}

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

DEV_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_TAGGED_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_COL = 3
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1
TEST_SIZE = 0.2


In [3]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [4]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [5]:
input_regex = re.compile("(\w*) (\w*) (\w*) (.*)\n?")


In [6]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [7]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [8]:
np.array(data)[0, 1]


'Fake'

In [9]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = "(\w*) (\w*) (\w*) (.*)\n?"
    if type == "TEST":
        regex = "(\w*) (.*)\n?"
    elif type == "LABELS":
        regex = "(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [10]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [11]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    # TODO: Need to check model_data
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [12]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    # TODO: Extract data from JSON and return that only
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)
    return model_data


In [13]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [185]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("---------------------------------------------------------")


# Load Data


In [15]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")


# Data Cleaning


In [16]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [17]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [18]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [19]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [20]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [21]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [22]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [23]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [24]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    # TODO: Replace with custom implementation
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [25]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [26]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


# Data Preprocessing


In [27]:
# def tokenize(data: pd.Series):
#     from nltk.tokenize import word_tokenize

#     nltk.download("punkt")

#     return data.apply(word_tokenize)


In [28]:
# def remove_stopwords(data: pd.Series):
#     """Remove stop words using the NLTK stopwords dictionary

#     Args:
#         string (str): a document

#     Returns:
#         str: a document with stopwords removed
#     """
#     from nltk.corpus import stopwords

#     nltk.download("stopwords")

#     stopwords = set(stopwords.words())

#     def remover(word_list: List[str], stopwords: Set[str]):
#         return [word for word in word_list if not word in stopwords]

#     return data.apply(lambda word_list: remover(word_list, stopwords))


In [29]:
# def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
#     from nltk.corpus import wordnet
#     from nltk.stem import WordNetLemmatizer

#     nltk.download("omw-1.4")

#     # POS tagging
#     def perform_nltk_pos_tag(data: pd.Series):
#         from nltk import pos_tag

#         nltk.download("averaged_perceptron_tagger")

#         return data.apply(pos_tag)

#     # Convert POS tag to wordnet pos tags
#     def wordnet_pos_tagger(tag: str):
#         if tag.startswith("J"):
#             return wordnet.ADJ
#         elif tag.startswith("V"):
#             return wordnet.VERB
#         elif tag.startswith("N"):
#             return wordnet.NOUN
#         elif tag.startswith("R"):
#             return wordnet.ADV
#         else:
#             return None

#     lemmatizer = WordNetLemmatizer()
#     lemmatized = list()

#     if consider_pos_tag:
#         pos_tagged_data = data.copy()
#         pos_tagged_data = perform_nltk_pos_tag(data)

#         for row in pos_tagged_data:

#             lemmatized_row = list()

#             if consider_pos_tag:
#                 for word, tag in row:
#                     wordnet_pos_tag = wordnet_pos_tagger(tag)

#                     if wordnet_pos_tag is None:
#                         lemmatized_row.append(word)
#                     else:
#                         result = lemmatizer.lemmatize(word, wordnet_pos_tag)
#                         lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

#             lemmatized.append(lemmatized_row)
#     else:
#         for row in data:
#             lemmatized_row = list()

#             for word in row:
#                 lemmatized_row.append(lemmatizer.lemmatize(word))

#             lemmatized.append(lemmatized_row)

#     return pd.Series(lemmatized)


In [30]:
# Concatenate lemmatized sentences back into one sentence
# def concatenate(data: pd.Series):
#     return data.apply(lambda words: " ".join(words))


In [31]:
# preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# # Run the pipeline
# preprocessed_data = cleaned_data.copy()

# # Process all the cleaning instructions
# for col, pipeline in preprocessing_pipeline.items():
#     # Get the column to perform cleaning on
#     temp_data = preprocessed_data[col].copy()

#     # Perform all the cleaning functions sequencially
#     for func in pipeline:
#         print(f"Starting: {func.__name__}")

#         if func.__name__ == "lemmatize":
#             temp_data = func(temp_data, consider_pos_tag=True)
#         else:
#             temp_data = func(temp_data)

#         print(f"Ended: {func.__name__}")

#     # Replace the old column with cleaned one.
#     preprocessed_data[col] = temp_data.copy()


# Feature Extraction


In [93]:
class TfIdf:
    def __init__(self) -> None:
        self.n_docs = None
        self.vocab = set()
        self.vocab_size = None
        self.vocab_index = dict()
        self.word_document_count = dict()

    def __create_vocab__(self, documents: npt.NDArray) -> Set:
        vocab = set()

        for document in documents:
            for word in document:
                vocab.add(word)

        return vocab

    def __get_word_document_count__(self, documents: npt.NDArray):
        word_document_count = dict()

        for word in self.vocab:
            word_document_count[word] = 0
            for document in documents:
                if word in document:
                    word_document_count[word] += 1

        return word_document_count

    def __term_frequency__(self, word: str, document: npt.NDArray):
        word_occurences = (document == word).sum()
        return word_occurences / self.n_docs

    def __inverse_document_frequency__(self, word: str):
        word_occurrences = 1

        if word in self.word_document_count:
            word_occurrences += self.word_document_count[word]

        return np.log(self.n_docs / word_occurrences)

    def __tf_idf__(self, document: npt.NDArray):
        tf_idf_vector = np.zeros(shape=(self.vocab_size,))
        for word in document:
            tf = self.__term_frequency__(word, document)
            idf = self.__inverse_document_frequency__(word)

            tf_idf_vector[self.vocab_index[word]] = tf * idf
        return tf_idf_vector

    def fit(self, documents: npt.NDArray):
        self.n_docs = documents.shape[0]
        self.vocab = self.__create_vocab__(documents)
        self.vocab_size = len(self.vocab)
        self.vocab_index = {word: idx for idx, word in enumerate(self.vocab)}
        self.word_document_count = self.__get_word_document_count__(documents)

    def transform(self, documents: npt.NDArray):
        tf_idf_vectors = list()
        for document in documents:
            tf_idf_vectors.append(self.__tf_idf__(document))
        return np.array(tf_idf_vectors)

    def export(self):
        return {
            "n_docs": self.n_docs,
            "vocab_size": self.vocab_size,
            "vocab": self.vocab,
            "vocab_size": self.vocab_size,
            "vocab_index": self.vocab_index,
            "self.word_document_count": self.word_document_count,
        }


In [94]:
final_data = load_data(CLEANED_DATA_FILE_PATH)


In [95]:
def tokenize(data: npt.NDArray):
    tokenized_documents = list()
    for document in data:
        tokenized_documents.append(np.array(document.split()))
    return np.array(tokenized_documents, dtype=object)


In [96]:
tokenized_documents = tokenize(final_data[:, DATA_COL])
tokenized_documents[0]


array(['if', 'you', 'are', 'looking', 'for', 'an', 'elegant', 'hotel',
       'in', 'downtown', 'chicago', 'you', 'have', 'to', 'stay', 'here',
       'the', 'ambassador', 'east', 'hotel', 'has', 'very', 'comfortable',
       'and', 'beautiful', 'large', 'rooms', 'and', 'is', 'like', 'a',
       'home', 'away', 'from', 'home', 'the', 'perfect', 'place', 'for',
       'a', 'business', 'person', 'and', 'if', 'you', 'have', 'a',
       'small', 'pet', 'you', 'can', 'bring', 'them', 'too', 'i', 'would',
       'give', 'this', 'place', 'four', 'stars', 'and', 'would',
       'definitely', 'stay', 'here', 'again'], dtype='<U11')

In [97]:
# TODO: Invoke TF-IDF
tf_idf = TfIdf()
tf_idf.fit(tokenized_documents)


In [103]:
tf_idf_vectors = tf_idf.transform(tokenized_documents)
tf_idf_vectors.shape


(960, 7655)

In [133]:
sentiment_labels = np.where(final_data[:, SENTIMENT_TARGET_COL] == POSITIVE, 1, -1)
truthfulness_labels = np.where(final_data[:, TRUTHFULNESS_TARGET_COL] == TRUTHFUL, 1, -1)


# Split Data


In [134]:
def train_test_split(data: npt.NDArray, labels: npt.NDArray, test_size: float = 0.2):
    n_max = data.shape[0]
    sample = int((1 - test_size) * n_max)

    all_idx = np.random.permutation(n_max)
    train_idx, test_idx = all_idx[:sample], all_idx[sample:]

    X_train, X_test, y_train, y_test = data[train_idx], data[test_idx], labels[train_idx], labels[test_idx]

    return X_train, X_test, y_train, y_test


In [142]:
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    tf_idf_vectors, sentiment_labels, TEST_SIZE
)
X_train_truthfulness, X_test_truthfulness, y_train_truthfulness, y_test_truthfulness = train_test_split(
    tf_idf_vectors, truthfulness_labels, TEST_SIZE
)


# Perceptron Models


## Simple Perceptron


In [176]:
class Perceptron:
    def __init__(self, n_features: int) -> None:
        self.weights = np.zeros(shape=(n_features,))
        self.bias = 0.0

    # TODO: Can implement this ... tolerance: float, early_stopping: bool = True
    def fit(self, X: npt.NDArray, y: npt.NDArray, max_iterations: int):
        misclassified = 0
        n_epoch = 0

        for epoch in range(max_iterations):
            n_epoch = epoch
            print(f"Epoch #: {n_epoch + 1}")

            misclassified = 0
            for idx, x in enumerate(X):
                target = y[idx]

                a = np.dot(x, self.weights) + self.bias

                if target * a <= 0:
                    self.weights += target * x
                    self.bias += target
                    misclassified += 1

            if misclassified == 0:
                break

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            a = np.sign(np.dot(x, self.weights) + self.bias)
            predictions.append(a)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"weights": self.weights, "biases": self.bias, "class_dict": self.class_dict}


### Sentiment Classification


#### Learn


In [177]:
n_features = X_train_sentiment.shape[-1]


In [178]:
clf = Perceptron(n_features)
clf.fit(X_train_sentiment, y_train_sentiment, max_iterations=100)


Epoch #: 1
Epoch #: 2
Epoch #: 3
Epoch #: 4
Epoch #: 5
Epoch #: 6
Epoch #: 7
Epoch #: 8
Epoch #: 9
Epoch #: 10
Epoch #: 11
Epoch #: 12


#### Classify


In [179]:
y_pred_sentiment = clf.predict(X_test_sentiment)


#### Score


In [186]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(clf.predict(X_train_sentiment) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       1.00      1.00      1.00       374
         Pos       1.00      1.00      1.00       394

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.96      0.94      0.95       106
         Pos       0.93      0.95      0.94        86

    accuracy                           0.95       192
   macro avg       0.95      0.95      0.95       192
weighted avg       0.95      0.95      0.95       192

---------------------------------------------------------


### Truthfulness Classification


#### Learn


In [187]:
n_features = X_train_truthfulness.shape[-1]


In [188]:
clf = Perceptron(n_features)
clf.fit(X_train_truthfulness, y_train_truthfulness, max_iterations=100)


Epoch #: 1
Epoch #: 2
Epoch #: 3
Epoch #: 4
Epoch #: 5
Epoch #: 6
Epoch #: 7
Epoch #: 8
Epoch #: 9


#### Classify


In [189]:
y_pred_truthfulness = clf.predict(X_test_truthfulness)


#### Score


In [191]:
calculate_scores(
    np.where(y_train_truthfulness == -1, "Fake", "True"),
    np.where(clf.predict(X_train_truthfulness) == -1, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == -1, "Fake", "True"),
    np.where(y_pred_truthfulness == -1, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00       379
        True       1.00      1.00      1.00       389

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.88      0.78      0.83       101
        True       0.78      0.88      0.83        91

    accuracy                           0.83       192
   macro avg       0.83      0.83      0.83       192
weighted avg       0.83      0.83      0.83       192

---------------------------------------------------------


## Averaged Perceptron


In [234]:
class AveragedPerceptron:
    def __init__(self, n_features: int) -> None:
        self.weights = np.zeros(shape=(n_features,))
        self.bias = 0.0
        self.cache = {"weights": np.zeros(shape=(n_features,)), "bias": 0.0}

    # TODO: Can implement this ... tolerance: float, early_stopping: bool = True
    def fit(self, X: npt.NDArray, y: npt.NDArray, max_iterations: int):
        misclassified = 0
        n_epoch = 0

        c = 1
        for epoch in range(max_iterations):
            n_epoch = epoch
            print(f"Epoch #: {n_epoch + 1}")

            misclassified = 0
            for idx, x in enumerate(X):
                target = y[idx]

                a = np.dot(x, self.weights) + self.bias

                if target * a <= 0:
                    self.weights += target * x
                    self.bias += target

                    self.cache["weights"] += target * c * x
                    self.cache["bias"] += target * c

                    misclassified += 1

            c += 1

            if misclassified == 0:
                self.weights -= ((1 / c) * self.cache["weights"])
                self.bias -= ((1 / c) * self.cache["bias"])
                break

    def predict(self, X: npt.NDArray):
        predictions = list()
        for x in X:
            a = np.sign(np.dot(x, self.weights) + self.bias)
            predictions.append(a)
        return np.array(predictions)

    def export(
        self,
    ):
        return {"weights": self.weights, "biases": self.bias, "class_dict": self.class_dict}


### Sentiment Classification


#### Learn


In [235]:
n_features = X_train_sentiment.shape[-1]


In [236]:
clf = AveragedPerceptron(n_features)
clf.fit(X_train_sentiment, y_train_sentiment, max_iterations=100)


Epoch #: 1
Epoch #: 2
Epoch #: 3
Epoch #: 4
Epoch #: 5
Epoch #: 6
Epoch #: 7
Epoch #: 8
Epoch #: 9
Epoch #: 10
Epoch #: 11
Epoch #: 12


#### Classify


In [237]:
y_pred_sentiment = clf.predict(X_test_sentiment)


#### Score


In [239]:
calculate_scores(
    np.where(y_train_sentiment == -1, "Neg", "Pos"),
    np.where(clf.predict(X_train_sentiment) == -1, "Neg", "Pos"),
    title="Train",
)
calculate_scores(
    np.where(y_test_sentiment == -1, "Neg", "Pos"), np.where(y_pred_sentiment == -1, "Neg", "Pos"), title="Test"
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

         Neg       1.00      1.00      1.00       374
         Pos       1.00      1.00      1.00       394

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

         Neg       0.95      0.97      0.96       106
         Pos       0.96      0.94      0.95        86

    accuracy                           0.96       192
   macro avg       0.96      0.96      0.96       192
weighted avg       0.96      0.96      0.96       192

---------------------------------------------------------


### Truthfulness Classification


#### Learn


In [240]:
n_features = X_train_truthfulness.shape[-1]


In [241]:
clf = AveragedPerceptron(n_features)
clf.fit(X_train_truthfulness, y_train_truthfulness, max_iterations=100)


Epoch #: 1
Epoch #: 2
Epoch #: 3
Epoch #: 4
Epoch #: 5
Epoch #: 6
Epoch #: 7
Epoch #: 8
Epoch #: 9


#### Classify


In [242]:
y_pred_truthfulness = clf.predict(X_test_truthfulness)


#### Score


In [243]:
calculate_scores(
    np.where(y_train_truthfulness == -1, "Fake", "True"),
    np.where(clf.predict(X_train_truthfulness) == -1, "Fake", "True"),
    title="Train",
)
calculate_scores(
    np.where(y_test_truthfulness == -1, "Fake", "True"),
    np.where(y_pred_truthfulness == -1, "Fake", "True"),
    title="Test",
)


------------------------ Train ------------------------
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00       379
        True       1.00      1.00      1.00       389

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

---------------------------------------------------------
------------------------ Test ------------------------
              precision    recall  f1-score   support

        Fake       0.88      0.74      0.81       101
        True       0.76      0.89      0.82        91

    accuracy                           0.81       192
   macro avg       0.82      0.82      0.81       192
weighted avg       0.82      0.81      0.81       192

---------------------------------------------------------
