# In [100]:
"""This program trains a sentiment analysis model (logistic regression or SVM)
using TFIDF features. It stores the model, along with some information from
its training and some metrics, in the "models" folder of this project's
directory. There are functions to preprocess text, extract features, train the
model, and to save them to a file.
"""

# Modules for saving objects.
import pickle
import joblib

# These are used for reading training data.
import csv

# These are some modules used for preprocessing.
from contractions import fix
import re
from unidecode import unidecode
from nltk.stem.porter import PorterStemmer

# These modules are for extracting features from the data.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

# These modules are for training the ML model.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from datetime import datetime
def now():      # Used by model timelog
    """Return the current local time as an 'HH:MM:SS.microseconds' string."""
    current_time = datetime.now()
    return current_time.strftime(r'%H:%M:%S.%f')

#These modules are used for measuring the model's performance.
import sklearn
from pandas import Series

# Lowercase words that preprocess() drops from every document: a token is
# skipped when its lowercased form appears in this list.
# NOTE(review): the list includes negations ('no', 'nor', 'not') and
# apostrophe forms ("don't", "isn't"). preprocess() expands contractions and
# strips punctuation before the lookup, so the apostrophe entries presumably
# never match, and dropping negations may matter for sentiment -- confirm.
STOPWORDS = [
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
    'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been',
    'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't",
    'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't",
    'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
    'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself',
    'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've",
    'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's",
    'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of',
    'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
    'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd",
    "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than',
    'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
    "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up',
    'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were',
    "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which',
    'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would',
    "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
    'yourself', 'yourselves'
    ]

# Placeholders to test my code.
# MODEL holds an estimator class (not an instance). It is not referenced by
# any other code shown in this file.
MODEL = LogisticRegression

# Hard-coded sample corpus: a list of (document, label) tuples, where each
# document is a string and each label is an integer.
# TODO: Replace with a get_corpus function that takes in a csv and columns
# for documents and labels and reads them to a list of tuples:
# [(document, label),...]. Access list of documents using
# (p[0] for p in corpus) and list of labels using (p[1] for p in corpus)
corpus = [
    ("Hey! I'm Marcell. Nice to meet you.", 0),
    ("I love my cat very much. He's a cool guy.", 1),
    ("I love my girlfriend very much. She has pretty brown eyes and skin. Plus, her bottom is gargantuan! I want to put her on https://www.lightskinned.com.", 1),
    ("Coding is crazy. I've been here for like 2 days straight. Shoutout to José Gonzalez.", 0)
    ]

# Metadata describing a training run. train_test() interpolates it into the
# time-log header, the confusion-matrix title and the metrics heading.
# The intended shape is:
#   {
#   'corpus': str(corpus name), 'size': int(training size),
#   'vectorizer': str(vectorizer), 'time': datetime(time trained)
#   }
# Until those values are collected from a real run, valid placeholders are
# used so that the module can actually be parsed and executed.
model_info = {
    'corpus': 'placeholder',
    'size': len(corpus),    # Whole sample corpus, before any train/test split.
    'vectorizer': TfidfVectorizer.__name__,     # Default of fit_vectorizer().
    'time': datetime.now(),     # When this dict was built, not when trained.
    }

# In [101]:
def save_object(object, path: str, serializer=joblib) -> None:
    """Takes an object (like a model or vectorizer) to save, a path to save it
    to, and a serialization module, and saves the object to disk to be used in
    the future. Both pickle and joblib seem to work, but apparently joblib
    works better for objects like large numpy arrays like models/vectorizers.

    The serializer argument must be the joblib or pickle module itself. Any
    other value raises a ValueError. The parameter name "object" shadows the
    builtin, but is kept so that existing keyword callers keep working.
    """
    # Modules are singletons, so identity is the right comparison here.
    if serializer is joblib:
        joblib.dump(object, path)
    elif serializer is pickle:
        with open(path, 'wb') as f:     # It's a binary file, not text.
            pickle.dump(object, f)
    else:
        # ValueError is still an Exception, so broad handlers keep working.
        raise ValueError("Serializer must be joblib or pickle.")

# In [102]:
def preprocess(texts: list) -> list:
    """Takes in a list (corpus) of strings and returns a list of the processed
    versions of the strings, in the same order.

    Each document is transliterated to ASCII, has its contractions expanded,
    loses @mentions, links and punctuation (except '#'), and is then split on
    whitespace. Words whose lowercase form is in STOPWORDS are dropped and
    the remaining words are stemmed and re-joined with single spaces.
    """
    # Loop-invariant helpers: build them once instead of once per document.
    stemmer = PorterStemmer()
    stopwords = frozenset(STOPWORDS)    # Constant-time membership tests.

    processed = []  # A list that will fill with processed documents.
    for s in texts:

        # Normalize diacritics, contractions, and slang.
        # The use of contractions and slang could be important, so maybe
        # don't use contractions.fix, or disable its slang expansion
        # (presumably fix(s, slang=False) -- confirm against the contractions
        # docs). However, if contractions are kept, you might need to add
        # various apostrophe forms to the last substitution filter below.
        s = fix(unidecode(s))

        # Remove the links and @mentions with regular expression substitutions.
        s = re.sub(r'@\w+', '', s)    # Removes @users.
        s = re.sub(r'http\S+', '', s)
        s = re.sub(r'www\.\S+', '', s)    # Removes links.
        s = re.sub(r'\s+', ' ', s)  # Removes extra spaces
        s = re.sub(r'[^\w\s#]', '', s)    # Removes all but abc123... and '#'.

        # Stem the meaningful words. PorterStemmer.stem() also makes
        # lowercase.
        s = ' '.join(
            stemmer.stem(w) for w in s.split() if w.lower() not in stopwords
            )

        processed.append(s)     # Add the processed document to the new corpus.
    return processed

# In [103]:
# Both of these functions are pretty sketchy. They both have pretty short
# equivalents and kind of just play with the syntax of the corresponding lines
# of code. I should probably use their equivalents (listed in the first line
# of each one's docstring) instead.

def fit_vectorizer(corpus: list, vectorizer_type=TfidfVectorizer): # I need to get this to the model trainer to save.
    """Consider using this equivalent instead:
    vectorizer_type().fit(corpus).
    \n
    Creates a new instance of the given vectorizer class, fits it to the
    preprocessed list of strings, and returns whatever that fit call returns
    (for sklearn vectorizers, the fitted vectorizer itself).
    """
    return vectorizer_type().fit(corpus)

def get_features(corpus: list, vectorizer) -> csr_matrix:
    """Consider using this equivalent instead: vectorizer.transform(corpus).
    \n
    Passes the preprocessed documents through the already-fitted vectorizer
    and returns the result of its transform call. For sklearn text
    vectorizers that is a scipy compressed sparse row matrix of features.
    """
    features = vectorizer.transform(corpus)
    return features

# In [104]:
def predict(model, x: csr_matrix | list) -> list:
    """Takes in model to be used and a matrix of the corpus' features. The
    features must have come from the same vectorizer used to train the model
    """
    return model.predict(x)

# In [105]:
def train_test(model, x: csr_matrix | list, y: list,
               test_model=None, return_extras: bool = False
               ):

    """Takes in the class of the model to be used, a matrix of the corpus'
    features, and a list of the associated given labels. The matrix x can be
    sparse or dense eg. DataFrame, numpy.array, etc. It splits the data
    (with sklearn's default train_test_split settings) into a set that will
    train the chosen model and a set that will be used to test it.

    With a test_model passed, the x and y passed in will not be used to
    retrain the model but instead solely to test it; the model argument is
    then ignored.

    By default, this function only returns the trained (or tested) model.
    With return_extras set to True, it returns the tuple
    (model, time_log, metrics_html, matrix_display), where time_log is the
    training time log as text (None when test_model was passed),
    metrics_html is an html string holding a heading and a table of scores,
    and matrix_display is the sklearn ConfusionMatrixDisplay.

    TODO: Saving the model, time log, table, plots and fitted vectorizer to
    the project's models folder is planned but not implemented yet; nothing
    is written to disk.
    """
    # A bare `import sklearn` does not by itself guarantee that the metrics
    # submodule is loaded, so import it explicitly.
    from sklearn import metrics as sk_metrics

    time_log = None     # Stays None when an existing model is only tested.
    if test_model is None:
        log_lines = [f'{model_info} Time Log:\n\n']     # Filled over time.
        log_lines.append(f'Began at                  {now()}')
        model = model()     # Instantiate the model
        log_lines.append(f'Model istantiated at      {now()}')
        x_train, x_test, y_train, y_test = train_test_split(x, y)
        log_lines.append(f'Train and test split at   {now()}')
        model.fit(x_train, y_train)
        log_lines.append(f'Model fit at              {now()}')
        time_log = '\n'.join(log_lines)     # Make it readably formatted text.

    else:
        model = test_model
        x_test, y_test = x, y

    preds = predict(model, x_test)

    # Build the display straight from the predictions already made. Using
    # ConfusionMatrixDisplay.from_estimator would run the model a second
    # time, and building the numpy matrix first with confusion_matrix is an
    # extra step that is not needed here.
    matrix_display = sk_metrics.ConfusionMatrixDisplay.from_predictions(
        y_test, preds
        )
    matrix_display.ax_.set_title(model_info)

    # Get some stats of the performance of the model.
    # NOTE(review): precision, recall and f1 are called with sklearn's
    # defaults, which presumably assume binary labels -- confirm before
    # using a multi-class corpus.
    scores = (
        sk_metrics.accuracy_score(y_test, preds),
        sk_metrics.precision_score(y_test, preds),
        sk_metrics.recall_score(y_test, preds),
        sk_metrics.f1_score(y_test, preds),
        )

    # Make a pandas Series with the scores, then turn it into an html table
    # (via DataFrame). The html data is not written to disk, but instead
    # becomes a string of html source code (metrics_html). A heading (<h3>)
    # with some information about the training process is put on top of the
    # table.
    metrics_ser = Series(scores,
                         index=['Accuracy', 'Precision', 'Recall', 'F1'],
                         name='Score'
                         )
    metrics_html = metrics_ser.to_frame().to_html()
    metrics_html = f'<h3>{model_info} Scores</h3>\n{metrics_html}'

    if return_extras:
        return model, time_log, metrics_html, matrix_display
    return model
    


# In [106]:
def start(
    mode: str, data: list[tuple[str, int]] | list[str], vectorizer, model
    ) -> None:
    """Starts the process of fitting, training, testing, whatever. The
    specified mode ([insert modes here]) determines which process(es) it does.
    \n
    The modes are:
    'fitv' to fit the vectorizer, 
    'train' to train the model, 
    'test' to test the model, 
    'predict' to get predictions from the data, 
    
    You can set use an argument for data that is a list of tuples
    (document, label) for a model training mode or testing, or a list of
    documents for a model prediction mode.
    You can select a vectorizer to use (string 'tfidf' or 'bow' if in a
    vectorizer fitting mode, or a fitted vectorizer object if not), a model to
    use (string 'nb', 'lr', or 'svm' if in a model fitting mode, or a trained
    model object if not)
    """
    mode = mode.lower()
    if mode = 'fitv'
# Quick end-to-end run on the sample corpus.
# Split the (document, label) tuples first: preprocess() expects strings, and
# train_test_split needs sized, indexable inputs rather than generators.
documents = preprocess([d[0] for d in corpus])
labels = [d[1] for d in corpus]

# Fit and transform on the same preprocessed text so that the features match
# the vocabulary the vectorizer learned.
vec = fit_vectorizer(documents)
features = get_features(documents, vec)
train_test(LogisticRegression, features, labels)

# Notebook output (sklearn metrics warning): _warn_prf(average, modifier, msg_start, len(result))
