In [1]:
import re

import json
from typing import List, Dict, Tuple, Set, Any

import numpy as np
import numpy.typing as npt


# Define Constants


In [2]:
######################################################
### Constants                                      ###
######################################################
# Base Paths
INPUT_PATH = "./data"
MODEL_PATH = "./model"

# Model File names
VANILLA_MODEL_FILENAME = "vanillamodel.txt"
AVERAGED_MODEL_FILENAME = "averagedmodel.txt"

# Class Identifiers
TRUTHFUL = "True"
DECEPTIVE = "Fake"
POSITIVE = "Pos"
NEGATIVE = "Neg"

SENTIMENT_CLASS_DICT = {1: "Pos", -1: "Neg"}

TRUTHFULNESS_CLASS_DICT = {1: "True", -1: "Fake"}

# File paths
TRAIN_FILE_PATH = f"{INPUT_PATH}/train-labeled.txt"
CLEANED_DATA_FILE_PATH = f"{INPUT_PATH}/cleaned-data.txt"
PREPROCESSED_DATA_FILE_PATH = f"{INPUT_PATH}/preprocessed-data.txt"

VANILLA_MODEL_FILE_PATH = f"{MODEL_PATH}/{VANILLA_MODEL_FILENAME}"
AVERAGED_MODEL_FILE_PATH = f"{MODEL_PATH}/{AVERAGED_MODEL_FILENAME}"

DEV_DATA_FILE_PATH = f"{INPUT_PATH}/dev-text.txt"
DEV_KEY_FILE_PATH = f"{INPUT_PATH}/dev-key.txt"

RANDOM_SEED = 42

DATA_COL = 3
SENTIMENT_TARGET_COL = 2
TRUTHFULNESS_TARGET_COL = 1
TEST_SIZE = 0.2


In [3]:
rng = np.random.default_rng(seed=RANDOM_SEED)
np.random.seed(RANDOM_SEED)


# Helper Functions


In [4]:
line = "07Zfn0z Fake Pos If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.\n"


In [5]:
input_regex = re.compile("(\w*) (\w*) (\w*) (.*)\n?")


In [6]:
re.match(input_regex, line).groups()


('07Zfn0z',
 'Fake',
 'Pos',
 "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")

In [7]:
data = []
data.append(re.match(input_regex, line).groups())
data


[('07Zfn0z',
  'Fake',
  'Pos',
  "If you're looking for an elegant hotel in downtown Chicago, you have to stay here. The Ambassador East Hotel has very comfortable and beautiful large rooms and is like a home away from home. The perfect place for a business person, and if you have a small pet you can bring them too! I would give this place four stars and would definitely stay here again.")]

In [8]:
np.array(data)[0, 1]


'Fake'

In [9]:
# Load Data
def load_data(input_file_path: str, type: str = "TRAIN") -> npt.NDArray:
    input_data = list()
    regex = "(\w*) (\w*) (\w*) (.*)\n?"
    if type == "DEV":
        regex = "(\w*) (.*)\n?"
    elif type == "KEY":
        regex = "(\w*) (\w*) (\w*)\n?"
    input_regex = re.compile(regex)
    with open(input_file_path, mode="r") as input_file:
        for line in input_file:
            input_data.append(re.match(input_regex, line).groups())
    return np.array(input_data)


In [10]:
# Store Data
def store_data(date_file_path: str, data: npt.NDArray) -> None:
    with open(date_file_path, mode="w") as data_file:
        for row in data:
            data_file.write(f"{row[0]} {row[1]} {row[2]} {row[3]}\n")


In [11]:
# Store Model
def store_model(model_file_path: str, model_data: Any) -> None:
    # TODO: Need to check model_data
    with open(model_file_path, mode="w") as model_file:
        json.dump(model_data, model_file, ensure_ascii=False)


In [12]:
# Load Model
def load_model(model_file_path: str) -> npt.NDArray:
    # TODO: Extract data from JSON and return that only
    with open(model_file_path, mode="r") as model_file:
        model_data = json.load(model_file)
    return model_data


In [13]:
# Store Predictions
def store_predictions(output_file_path: str, predictions: List[Tuple[str, str, str]]) -> None:
    with open(output_file_path, mode="w") as output_file:
        for prediction in predictions:
            output_file.write(f"{prediction[0]} {prediction[1]} {prediction[2]}\n")


In [14]:
# Calculate Scores
def calculate_scores(y_true, y_pred, title: str):
    from sklearn.metrics import classification_report

    print(f"------------------------ {title} ------------------------")
    print(classification_report(y_true, y_pred))
    print("--------------------------------------------------------------------------------\n\n")


# Load Data


In [15]:
data = load_data(TRAIN_FILE_PATH, type="TRAIN")


# Data Cleaning


In [16]:
data[:, 3].shape


(960,)

In [17]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: npt.NDArray):
    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = result[i].lower()
    return result


In [18]:
def remove_html_encodings(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"&#\d+;", " ", result[i])
    return result


In [19]:
def remove_html_tags(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])
    return result


In [20]:
def remove_url(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])
    return result


In [21]:
def remove_html_and_url(data):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (npt.NDArray): A Numpy Array of type string

    Returns:
        _type_: npt.NDArray
    """
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        # Remove HTML encodings
        result[i] = re.sub(r"&#\d+;", "", result[i])

        # Remove HTML tags (both open and closed)
        result[i] = re.sub(r"<[a-zA-Z]+\s?/?>", "", result[i])

        # Remove URLs
        result[i] = re.sub(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", "", result[i])

    return result


In [22]:
def replace_digits_with_tag(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"\d+", " NUM ", result[i])
    return result


In [23]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"_+|\\|[^a-zA-Z0-9\s]", " ", result[i])
    return result


In [24]:
# Remove extra spaces
def remove_extra_spaces(data: npt.NDArray):
    import re

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = re.sub(r"^\s*|\s\s*", " ", result[i])
    return result


In [25]:
# Install contractions package, if you don't have it
# ! pip install contractions


In [26]:
# Expanding contractions
def fix_contractions(data: npt.NDArray):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = contraction_fixer(result[i])
    return result


In [27]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [28]:
dev_raw_data = load_data(DEV_DATA_FILE_PATH, type="DEV")
dev_key_data = load_data(DEV_KEY_FILE_PATH, type="KEY")

In [29]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    1: [
        to_lower,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

dev_cleaned_data = dev_raw_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = dev_cleaned_data[:, col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    dev_cleaned_data[:, col] = temp_data.copy()


Starting: to_lower
Ended: to_lower
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [30]:
store_data(CLEANED_DATA_FILE_PATH, cleaned_data)


# Data Preprocessing


In [31]:
# Load Data
preprocessed_data = load_data(CLEANED_DATA_FILE_PATH)
preprocessed_data[0]


array(['07Zfn0z', 'Fake', 'Pos',
       ' if you are looking for an elegant hotel in downtown chicago you have to stay here the ambassador east hotel has very comfortable and beautiful large rooms and is like a home away from home the perfect place for a business person and if you have a small pet you can bring them too i would give this place four stars and would definitely stay here again '],
      dtype='<U4049')

In [32]:
def tokenize(data: npt.NDArray):
    import nltk
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    n_data = data.shape[0]
    result = list()
    for i in range(n_data):
        result.append(word_tokenize(data[i]))
    return np.array(result)


In [33]:
def remove_stopwords(data: npt.NDArray):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    import nltk
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    n_data = data.shape[0]
    result = data.copy()
    for i in range(n_data):
        result[i] = remover(result[i], stopwords)
    return result


In [34]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: npt.NDArray):
    n_data = data.shape[0]
    result = list()
    for i in range(n_data):
        result.append(" ".join(data[i]))
    return np.array(result)


In [35]:
# preprocessing_pipeline = {DATA_COL: [tokenize, remove_stopwords, concatenate]}

# # Run the pipeline
# preprocessed_data = cleaned_data.copy()

# # Process all the cleaning instructions
# for col, pipeline in preprocessing_pipeline.items():
#     # Get the column to perform cleaning on
#     temp_data = preprocessed_data[:, col].copy()

#     # Perform all the cleaning functions sequencially
#     for func in pipeline:
#         print(f"Starting: {func.__name__}")

#         temp_data = func(temp_data)

#         print(f"Ended: {func.__name__}")

#     # Replace the old column with cleaned one.
#     preprocessed_data[:, col] = temp_data.copy()


In [36]:
# store_data(PREPROCESSED_DATA_FILE_PATH, preprocessed_data)


In [37]:
# preprocessed_data[0]


# Split the Data


In [38]:
data = load_data(CLEANED_DATA_FILE_PATH)


In [39]:
# Split the data 80-20 split
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, stratify=data[:, SENTIMENT_TARGET_COL], random_state=RANDOM_SEED)


In [40]:
train.shape, test.shape


((768, 4), (192, 4))

# Feature Extraction


## TF-IDF


In [41]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download("punkt")

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
vectorizer.fit(data[:, DATA_COL])


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Perceptron Models


## 1. Perceptron


### A. Sentiment Classifier


In [42]:
from sklearn.linear_model import Perceptron

X_tfidf_train = vectorizer.transform(train[:, DATA_COL])
X_tfidf_test = vectorizer.transform(test[:, DATA_COL])
y_train = train[:, SENTIMENT_TARGET_COL]
y_test = test[:, SENTIMENT_TARGET_COL]

clf = Perceptron(max_iter=10, alpha=1, random_state=RANDOM_SEED, tol=1e-4, early_stopping=True)  # 0.45103975891511683

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calculate_scores(y_train, clf.predict(X_tfidf_train), title="Sentiment Analysis -- Train")
calculate_scores(y_test, y_pred, title="Sentiment Analysis -- Test")


------------------------ Sentiment Analysis -- Train ------------------------
              precision    recall  f1-score   support

         Neg       1.00      1.00      1.00       384
         Pos       1.00      1.00      1.00       384

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768

--------------------------------------------------------------------------------


------------------------ Sentiment Analysis -- Test ------------------------
              precision    recall  f1-score   support

         Neg       0.90      0.96      0.93        96
         Pos       0.96      0.90      0.92        96

    accuracy                           0.93       192
   macro avg       0.93      0.93      0.93       192
weighted avg       0.93      0.93      0.93       192

--------------------------------------------------------------------------------




In [43]:
dev_tfidf_vectors = vectorizer.transform(dev_cleaned_data[:,1])

calculate_scores(dev_key_data[:,SENTIMENT_TARGET_COL], clf.predict(dev_tfidf_vectors), title="Sentiment Analysis -- Dev")


------------------------ Sentiment Analysis -- Dev ------------------------
              precision    recall  f1-score   support

         Neg       0.91      0.94      0.93       160
         Pos       0.94      0.91      0.92       160

    accuracy                           0.93       320
   macro avg       0.93      0.93      0.92       320
weighted avg       0.93      0.93      0.92       320

--------------------------------------------------------------------------------




### B. Truthfulness Classifier


In [57]:
from sklearn.linear_model import Perceptron

X_tfidf_train = vectorizer.transform(train[:, DATA_COL])
X_tfidf_test = vectorizer.transform(test[:, DATA_COL])
y_train = train[:, TRUTHFULNESS_TARGET_COL]
y_test = test[:, TRUTHFULNESS_TARGET_COL]

clf = Perceptron(max_iter=10, alpha=8e-4, random_state=RANDOM_SEED, early_stopping=True)  # 0.45103975891511683

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

calculate_scores(y_train, clf.predict(X_tfidf_train), title="Truthfulness Analysis -- Train")
calculate_scores(y_test, y_pred, title="Truthfulness Analysis -- Test")


------------------------ Truthfulness Analysis -- Train ------------------------
              precision    recall  f1-score   support

        Fake       0.98      0.98      0.98       387
        True       0.98      0.98      0.98       381

    accuracy                           0.98       768
   macro avg       0.98      0.98      0.98       768
weighted avg       0.98      0.98      0.98       768

--------------------------------------------------------------------------------


------------------------ Truthfulness Analysis -- Test ------------------------
              precision    recall  f1-score   support

        Fake       0.76      0.94      0.84        93
        True       0.92      0.72      0.81        99

    accuracy                           0.82       192
   macro avg       0.84      0.83      0.82       192
weighted avg       0.84      0.82      0.82       192

--------------------------------------------------------------------------------




In [53]:
dev_tfidf_vectors = vectorizer.transform(dev_cleaned_data[:,1])

calculate_scores(dev_key_data[:,TRUTHFULNESS_TARGET_COL], clf.predict(dev_tfidf_vectors), title="Truthfulness Analysis -- Dev")

------------------------ Truthfulness Analysis -- Dev ------------------------
              precision    recall  f1-score   support

        Fake       0.81      0.84      0.83       160
        True       0.84      0.80      0.82       160

    accuracy                           0.82       320
   macro avg       0.82      0.82      0.82       320
weighted avg       0.82      0.82      0.82       320

--------------------------------------------------------------------------------




## 2. Averaged Perceptron


In [None]:
## No Libs Available
