# **Insert Title Here**
**DATA103 S11 Group 4**
- GOZON, Jean Pauline D.
- JAMIAS, Gillian Nicole A.
- MARCELO, Andrea Jean C.
- REYES, Anton Gabriel G.
- VICENTE, Francheska Josefa

## Requirements and Imports

### Imports

**Basic Libraries**

* `numpy` contains a large collection of mathematical functions
* `pandas` contains functions that are designed for data manipulation and data analysis

In [None]:
import numpy as np
import pandas as pd
import datasets

**Natural Language Processing Libraries**
* `TfidfVectorizer` converts the given text documents into a matrix of TF-IDF features
* `CountVectorizer` converts the given text documents into a matrix, which has the counts of the tokens

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

**Machine Learning Libraries**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    hamming_loss,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [None]:
import pickle

In [None]:
# Load the cleaned dataset. The 'text' column is used below as the model
# input and the 'class' column as the label.
df = pd.read_csv ('cleaned_data_with_char_seq_removal.csv')
df

## Feature Engineering

In [None]:
main_directory = './saved_models/trad_ml/vectorizers/'

In [None]:
def save_vectorizers(vectorizer, vectorizer_name):
    """Pickle a vectorizer to ``<main_directory>/<vectorizer_name>.pkl``.

    Parameters
    ----------
    vectorizer : object
        Any picklable object; in this notebook, a scikit-learn vectorizer.
    vectorizer_name : str
        File stem of the pickle (``'.pkl'`` is appended).

    Notes
    -----
    ``main_directory`` is a module-level variable read at call time, so the
    destination follows whatever value it holds when this function runs.
    The directory is created if it does not exist yet.
    """
    import os  # local import keeps this cell self-contained

    # open(..., 'wb') does not create missing parent directories.
    if main_directory:
        os.makedirs(main_directory, exist_ok = True)

    vectorizer_filename = os.path.join(main_directory, vectorizer_name + '.pkl')

    with open(vectorizer_filename, 'wb') as file:
        pickle.dump(vectorizer, file)

### Splitting the Dataset into Train, Validation, and Test Sets

In [None]:
X = df ['text']
X

In [None]:
y = df ['class']
y

In [None]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    shuffle = True,
    random_state = 42,
    stratify = y,
)

In [None]:
# Carve the validation set out of the remaining training rows: 10% of the
# 80% kept above, i.e. 8% of the full dataset, leaving 72% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size = 0.1,
    shuffle = True,
    random_state = 42,
    stratify = y_train,
)

In [None]:
print('Train input  shape: ', X_train.shape)
print('Train output shape: ', y_train.shape)

In [None]:
print('Val input  shape: ', X_val.shape)
print('Val output shape: ', y_val.shape)

In [None]:
print('Test input  shape: ', X_test.shape)
print('Test output shape: ', y_test.shape)

In [None]:
train_df = pd.concat([X_train, y_train], axis = 1).reset_index(drop = True)
train_df

In [None]:
val_df = pd.concat([X_val, y_val], axis = 1).reset_index(drop = True)
val_df

In [None]:
test_df = pd.concat([X_test, y_test], axis = 1).reset_index(drop = True)
test_df

### Tokenizing with TF-IDF Vectorizer

We create a `TfidfVectorizer` object.

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
save_vectorizers(tfidf_vectorizer, 'tfidf')

We make another vectorizer object that keeps only the 5,000 most frequent terms as features (`max_features = 5000`).

In [None]:
tfidf_vectorizer_5000 = TfidfVectorizer(max_features = 5000)

In [None]:
save_vectorizers(tfidf_vectorizer_5000, 'tfidf_5000')

**Train Data**

In [None]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# The vectorizer was pickled before it had been fitted, so that file holds an
# object with no learned vocabulary. Overwrite it with the fitted vectorizer.
save_vectorizers(tfidf_vectorizer, 'tfidf')

In [None]:
tfidf_train_5000 = tfidf_vectorizer_5000.fit_transform(X_train)

# The vectorizer was pickled before it had been fitted, so that file holds an
# object with no learned vocabulary. Overwrite it with the fitted vectorizer.
save_vectorizers(tfidf_vectorizer_5000, 'tfidf_5000')

**Validation Data**

In [None]:
tfidf_val = tfidf_vectorizer.transform(X_val)

In [None]:
tfidf_val_5000 = tfidf_vectorizer_5000.transform(X_val)

**Test Data**

In [None]:
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
tfidf_test_5000 = tfidf_vectorizer_5000.transform(X_test)

### Tokenizing with Count Vectorizer

We create a `CountVectorizer` object.

In [None]:
count_vectorizer = CountVectorizer()

In [None]:
save_vectorizers(count_vectorizer, 'count')

We make another vectorizer object that keeps only the 5,000 most frequent terms as features (`max_features = 5000`).

In [None]:
count_vectorizer_5000 = CountVectorizer(max_features = 5000)

In [None]:
save_vectorizers(count_vectorizer_5000, 'count_5000')

**Train Data**

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

# The vectorizer was pickled before it had been fitted, so that file holds an
# object with no learned vocabulary. Overwrite it with the fitted vectorizer.
save_vectorizers(count_vectorizer, 'count')

In [None]:
count_train_5000 = count_vectorizer_5000.fit_transform(X_train)

# The vectorizer was pickled before it had been fitted, so that file holds an
# object with no learned vocabulary. Overwrite it with the fitted vectorizer.
save_vectorizers(count_vectorizer_5000, 'count_5000')

**Validation Data**

In [None]:
count_val = count_vectorizer.transform(X_val)

In [None]:
count_val_5000 = count_vectorizer_5000.transform(X_val)

**Test Data**

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
# Use the 5000-feature vectorizer so the test matrix has the same columns as
# count_train_5000 and count_val_5000.
count_test_5000 = count_vectorizer_5000.transform(X_test)

## Modeling and Evaluation

### Defining the Functions

In [None]:
# Root folder for pickled models. This rebinds the same global that
# save_vectorizers reads, so any vectorizer saved after this cell would land
# here instead of in the vectorizers/ sub-folder.
main_directory = './saved_models/trad_ml/'

# One dict of test-set metrics per (model, vectorizer) pair. The Evaluation
# cells append to it and the summary table at the end is built from it.
scores_list = []

In [None]:
def save_models(model, model_name, vectorizer_name):
    """Pickle a model to ``<main_directory>/<model_name>/<vectorizer_name>/model.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object; in this notebook, a fitted scikit-learn estimator.
    model_name : str
        Sub-folder identifying the model family (e.g. ``'logreg'``).
    vectorizer_name : str
        Sub-folder identifying the features the model was trained on
        (e.g. ``'tfidf'``).

    Notes
    -----
    ``main_directory`` is a module-level variable read at call time. The
    nested folder is created if needed. The file name is always
    ``model.pkl``, so saving twice with the same names overwrites the first
    file.
    """
    import os  # local import keeps this cell self-contained

    curr_directory = os.path.join(main_directory, model_name, vectorizer_name)

    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(curr_directory, exist_ok = True)

    model_filename = os.path.join(curr_directory, 'model.pkl')

    with open(model_filename, 'wb') as file:
        pickle.dump(model, file)

In [None]:
def scores (y_true, y_pred):
    """Compute four classification metrics, each scaled to a percentage.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)``,
        each multiplied by 100.
    """
    labels = {'y_true' : y_true, 'y_pred' : y_pred}

    def f1_percent (average):
        # Same F1 call for both averaging modes; only `average` differs.
        return f1_score(average = average, **labels) * 100

    return (
        accuracy_score(**labels) * 100,
        f1_percent('micro'),
        f1_percent('macro'),
        hamming_loss(**labels) * 100,
    )

In [None]:
def print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score):
    """Print the four metrics, one per line, each followed by a percent sign.

    The parameters arrive in the order returned by ``scores`` (micro before
    macro), but the macro F1 line is printed before the micro F1 line.
    """
    report = (
        ('Accuracy: ', accuracy),
        ('F1 Macro Average: ', f1_macro_average),
        ('F1 Micro Average: ', f1_micro_average),
        ('Hamming Loss: ', hamming_loss_score),
    )

    for label, value in report:
        print(label, value, '%')

In [None]:
def train_model(base_model, X_train, y_train, X_test, y_test, model_name, vectorizer_name):
    """Fit a model, print its training-set scores, predict the test set, and save it.

    Parameters
    ----------
    base_model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place (no copy is made), so the returned model is this same object.
    X_train, y_train
        Training features and labels.
    X_test
        Features to predict after fitting.
    y_test
        Not used by this function; kept so existing callers keep working.
    model_name, vectorizer_name : str
        Forwarded to ``save_models`` to choose where the model is pickled.

    Returns
    -------
    tuple
        ``(model, test_predictions)``.
    """
    model = base_model
    model.fit(X_train, y_train)

    # The scores printed here are for the training data only; callers score
    # the returned test predictions themselves.
    train_predictions = model.predict(X_train)
    print_scores(*scores(y_train, train_predictions))

    test_predictions = model.predict(X_test)

    save_models(model, model_name, vectorizer_name)

    return model, test_predictions

In [None]:
def tune_and_train_model(model, hyperparameters,
                         X_train, y_train,
                         X_test, y_test,
                         model_name, vectorizer_name,
                         scoring = 'accuracy', cv = 5):
    """Grid-search a model, print training-set scores, predict the test set, and save the best estimator.

    Parameters
    ----------
    model
        Estimator handed to ``GridSearchCV``.
    hyperparameters
        Parameter grid handed to ``GridSearchCV`` (e.g. ``lr_hp_space``).
    X_train, y_train
        Training features and labels used for the search.
    X_test
        Features to predict after the search.
    y_test
        Not used by this function; kept so existing callers keep working.
    model_name, vectorizer_name : str
        Forwarded to ``save_models``. They come before the defaulted
        parameters because every caller passes them as the 7th and 8th
        positional arguments (and Python does not allow required parameters
        after defaulted ones).
    scoring : str, default 'accuracy'
        Forwarded to ``GridSearchCV``.
    cv : int, default 5
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, test_predictions)``.

    Notes
    -----
    ``save_models`` always writes ``model.pkl``, so calling this with the
    same names used for ``train_model`` overwrites the untuned model file.
    """
    print('Tuning', str(model) + '...')

    model_cv = GridSearchCV(model, hyperparameters, cv = cv, scoring = scoring, n_jobs = -1)
    model_cv.fit(X_train, y_train)

    # Report training-set scores the same way train_model does, instead of
    # computing an accuracy that is never shown.
    train_predictions = model_cv.predict(X_train)
    print_scores(*scores(y_train, train_predictions))

    test_predictions = model_cv.predict(X_test)

    best_model = model_cv.best_estimator_
    save_models(best_model, model_name, vectorizer_name)

    return best_model, test_predictions

### Declaration of Hyperparameter Space

In [None]:
# Logistic Regression grid: 4 values of C x 6 values of max_iter = 24 candidates.
lr_hp_space = [
    dict(
        C = [0.01, 0.1, 1, 10],
        max_iter = [50, 100, 300, 600, 900, 1100],
    ),
]

In [None]:
# Multinomial Naive Bayes grid: 7 values of alpha x 2 values of fit_prior = 14 candidates.
mnb_hp_space = [
    dict(
        alpha = [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
        fit_prior = [True, False],
    ),
]

In [None]:
# Random Forest grid: 2 x 3 x 4 x 4 x 4 = 384 candidates.
rf_hp_space = [
    dict(
        criterion = ['gini', 'entropy'],
        n_estimators = [50, 100, 150],
        min_samples_split = [2, 4, 10, 15],
        max_depth = [None, 50, 100, 150],
        max_leaf_nodes = [None, 50, 75, 100],
    ),
]

### Logistic Regression (TF-IDF Vectorizer)

#### Model Training 

In [None]:
log_reg = LogisticRegression(n_jobs = -1)

In [None]:
log_reg_tfidf, lr_test_predictions_tfidf = train_model (log_reg, 
                                                        tfidf_train, y_train, 
                                                        tfidf_test, y_test, 
                                                        'logreg', 'tfidf')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, lr_test_predictions_tfidf)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, lr_test_predictions_tfidf)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
log_reg = LogisticRegression(n_jobs = -1)

In [None]:
lr_tuned_model_tfidf, lr_tuned_test_predictions_tfidf = tune_and_train_model (log_reg, lr_hp_space, 
                                                                              tfidf_train, y_train, 
                                                                              tfidf_test, y_test,
                                                                              'logreg', 'tfidf')

In [None]:
lr_tuned_model_tfidf

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, lr_tuned_test_predictions_tfidf)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, lr_tuned_test_predictions_tfidf)

temp_scores = {
    'Model' : 'Logistic Regression',
    'Vectorizer' : 'TF-IDF Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

### Logistic Regression (Count Vectorizer)

#### Model Training 

In [None]:
log_reg = LogisticRegression(n_jobs = -1)

In [None]:
log_reg_count, lr_test_predictions_count = train_model (log_reg, 
                                                        count_train, y_train, 
                                                        count_test, y_test, 
                                                        'logreg', 'count')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, lr_test_predictions_count)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, lr_test_predictions_count)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
log_reg = LogisticRegression(n_jobs = -1)

In [None]:
lr_tuned_model_count, lr_tuned_test_predictions_count = tune_and_train_model (log_reg, lr_hp_space, 
                                                                              count_train, y_train, 
                                                                              count_test, y_test,
                                                                              'logreg', 'count')

In [None]:
lr_tuned_model_count

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, lr_tuned_test_predictions_count)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, lr_tuned_test_predictions_count)

temp_scores = {
    'Model' : 'Logistic Regression',
    'Vectorizer' : 'Count Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

### Multinomial Naive Bayes (TF-IDF Vectorizer)

#### Model Training 

In [None]:
multinomial_nb = MultinomialNB ()

In [None]:
mnb_tfidf, mnb_test_predictions_tfidf = train_model (multinomial_nb, 
                                                     tfidf_train, y_train, 
                                                     tfidf_test, y_test, 
                                                     'mnb', 'tfidf')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, mnb_test_predictions_tfidf)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, mnb_test_predictions_tfidf)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
multinomial_nb = MultinomialNB ()

In [None]:
mnb_tuned_model_tfidf, mnb_tuned_test_predictions_tfidf = tune_and_train_model (multinomial_nb, mnb_hp_space, 
                                                                                tfidf_train, y_train, 
                                                                                tfidf_test, y_test,
                                                                                'mnb', 'tfidf')

In [None]:
mnb_tuned_model_tfidf

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, mnb_tuned_test_predictions_tfidf)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, mnb_tuned_test_predictions_tfidf)

temp_scores = {
    'Model' : 'Multinomial Naive Bayes',
    'Vectorizer' : 'TF-IDF Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

### Multinomial Naive Bayes (Count Vectorizer)


#### Model Training 

In [None]:
multinomial_nb = MultinomialNB ()

In [None]:
mnb_count, mnb_test_predictions_count = train_model (multinomial_nb, 
                                                     count_train, y_train, 
                                                     count_test, y_test, 
                                                     'mnb', 'count')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, mnb_test_predictions_count)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, mnb_test_predictions_count)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
multinomial_nb = MultinomialNB ()

In [None]:
mnb_tuned_model_count, mnb_tuned_test_predictions_count = tune_and_train_model (multinomial_nb, mnb_hp_space, 
                                                                                count_train, y_train, 
                                                                                count_test, y_test,
                                                                                'mnb', 'count')

In [None]:
mnb_tuned_model_count

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, mnb_tuned_test_predictions_count)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, mnb_tuned_test_predictions_count)

temp_scores = {
    'Model' : 'Multinomial Naive Bayes',
    'Vectorizer' : 'Count Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

### Random Forest Classifier (TF-IDF Vectorizer)

#### Model Training 

In [None]:
rf_classifier = RandomForestClassifier(n_jobs = -1)

In [None]:
rf_tfidf, rf_test_predictions_tfidf = train_model (rf_classifier,
                                                   tfidf_train, y_train, 
                                                   tfidf_test, y_test, 
                                                   'rf', 'tfidf')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, rf_test_predictions_tfidf)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, rf_test_predictions_tfidf)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
rf_classifier = RandomForestClassifier(n_jobs = -1)

In [None]:
rf_tuned_model_tfidf, rf_tuned_test_predictions_tfidf = tune_and_train_model (rf_classifier, rf_hp_space, 
                                                                              tfidf_train, y_train,
                                                                              tfidf_test, y_test,
                                                                              'rf', 'tfidf')

In [None]:
rf_tuned_model_tfidf

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, rf_tuned_test_predictions_tfidf)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, rf_tuned_test_predictions_tfidf)

temp_scores = {
    'Model' : 'Random Forest Classifier',
    'Vectorizer' : 'TF-IDF Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

### Random Forest Classifier (Count Vectorizer)

#### Model Training 

In [None]:
rf_classifier = RandomForestClassifier(n_jobs = -1)

In [None]:
rf_count, rf_test_predictions_count = train_model (rf_classifier,
                                                   count_train, y_train, 
                                                   count_test, y_test, 
                                                   'rf', 'count')

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, rf_test_predictions_count)).plot()

In [None]:
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, rf_test_predictions_count)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
rf_classifier = RandomForestClassifier(n_jobs = -1)

In [None]:
rf_tuned_model_count, rf_tuned_test_predictions_count = tune_and_train_model (rf_classifier, rf_hp_space, 
                                                                              count_train, y_train,
                                                                              count_test, y_test,
                                                                              'rf', 'count')

In [None]:
rf_tuned_model_count

In [None]:
ConfusionMatrixDisplay(confusion_matrix(y_test, rf_tuned_test_predictions_count)).plot()

#### Evaluation

In [None]:
# Score the tuned model's test predictions. The untuned model's test scores
# are already printed in the Model Training cells of this section.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, rf_tuned_test_predictions_count)

temp_scores = {
    'Model' : 'Random Forest Classifier',
    'Vectorizer' : 'Count Vectorizer',
    'Accuracy' : accuracy,
    'F1 Micro Average' : f1_micro_average,
    'F1 Macro Average' : f1_macro_average,
    'Hamming Loss' : hamming_loss_score
}

scores_list.append(temp_scores)

print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Feature Importance

## Summary of Model Scores

In [None]:
# Rank the models best-first. Accuracy and the F1 averages are
# higher-is-better, but Hamming Loss is lower-is-better, so as a tie-breaker
# it is sorted ascending.
pd.DataFrame(scores_list).sort_values(
    ['Accuracy', 'F1 Micro Average', 'F1 Macro Average', 'Hamming Loss'],
    ascending = [False, False, False, True],
).reset_index(drop = True)