# **Insert Title Here**
**DATA103 S11 Group 4**
- GOZON, Jean Pauline D.
- JAMIAS, Gillian Nicole A.
- MARCELO, Andrea Jean C.
- REYES, Anton Gabriel G.
- VICENTE, Francheska Josefa

## Requirements and Imports

### Imports

**Basic Libraries**

* `numpy` contains a large collection of mathematical functions
* `pandas` contains functions that are designed for data manipulation and data analysis

In [None]:
import numpy as np
import pandas as pd
import datasets

**Natural Language Processing Libraries**
* `TfidfVectorizer` converts a collection of text documents into a matrix of TF-IDF features
* `CountVectorizer` converts a collection of text documents into a matrix of token counts

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

**Machine Learning Libraries**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    hamming_loss,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [None]:
import pickle

In [None]:
# Load the CSV (path is relative to the working directory) into a DataFrame
# and display it.
df = pd.read_csv ('cleaned_data_with_char_seq_removal.csv')
df

## Feature Engineering

### Splitting the Dataset into Train, Validation, and Test Sets

In [None]:
# Model input: the 'text' column of df.
X = df ['text']
X

In [None]:
# Target labels: the 'class' column of df.
y = df ['class']
y

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts; random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 42, 
                                                    shuffle = True)

In [None]:
# Carve the validation set out of the remaining training portion: 10% of the
# 80% kept above, i.e. roughly 8% of all rows, leaving roughly 72% for training.
# X_train / y_train are rebound to the reduced training portion.
X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                  y_train, 
                                                  test_size = 0.1,
                                                  stratify = y_train,
                                                  random_state = 42, 
                                                  shuffle = True)

In [None]:
# Report the sizes of the training inputs and labels.
print('Train input  shape: ', X_train.shape)
print('Train output shape: ', y_train.shape)

In [None]:
# Report the sizes of the validation inputs and labels.
print('Val input  shape: ', X_val.shape)
print('Val output shape: ', y_val.shape)

In [None]:
# Report the sizes of the test inputs and labels.
print('Test input  shape: ', X_test.shape)
print('Test output shape: ', y_test.shape)

In [None]:
# Put the training texts and labels side by side in one DataFrame and replace
# the row labels left over from the split with a fresh 0..n-1 index.
train_df = pd.concat([X_train, y_train], axis = 1).reset_index(drop = True)
train_df

In [None]:
# Same as above for the validation split: texts and labels in one DataFrame
# with a fresh 0..n-1 index.
val_df = pd.concat([X_val, y_val], axis = 1).reset_index(drop = True)
val_df

In [None]:
# Same as above for the test split: texts and labels in one DataFrame with a
# fresh 0..n-1 index.
test_df = pd.concat([X_test, y_test], axis = 1).reset_index(drop = True)
test_df

### Tokenizing with TF-IDF Vectorizer

We create a `TfidfVectorizer` object.

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

We create a second vectorizer whose vocabulary is limited to the 5,000 most frequent terms in the corpus it is fitted on.

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at 5000 features.
tfidf_vectorizer_5000 = TfidfVectorizer(max_features = 5000)

**Train Data**

In [None]:
# Fit the vectorizer on the training texts only and transform them in one step.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Same for the 5000-feature vectorizer: fit on the training texts, then transform them.
tfidf_train_5000 = tfidf_vectorizer_5000.fit_transform(X_train)

**Validation Data**

In [None]:
# Transform only (no refit): the validation texts are encoded with what was
# fitted on the training texts.
tfidf_val = tfidf_vectorizer.transform(X_val)

In [None]:
# Validation texts encoded with the already-fitted 5000-feature vectorizer.
tfidf_val_5000 = tfidf_vectorizer_5000.transform(X_val)

**Test Data**

In [None]:
# Transform only (no refit): the test texts are encoded with what was fitted
# on the training texts.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Test texts encoded with the already-fitted 5000-feature vectorizer.
tfidf_test_5000 = tfidf_vectorizer_5000.transform(X_test)

### Tokenizing with Count Vectorizer

We create a `CountVectorizer` object.

In [None]:
# Token-count vectorizer with scikit-learn's default settings.
count_vectorizer = CountVectorizer()

We create a second vectorizer whose vocabulary is limited to the 5,000 most frequent terms in the corpus it is fitted on.

In [None]:
# Token-count vectorizer whose vocabulary is capped at 5000 features.
count_vectorizer_5000 = CountVectorizer(max_features = 5000)

**Train Data**

In [None]:
# Fit the vectorizer on the training texts only and transform them in one step.
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
# Same for the 5000-feature vectorizer: fit on the training texts, then transform them.
count_train_5000 = count_vectorizer_5000.fit_transform(X_train)

**Validation Data**

In [None]:
# Transform only (no refit): the validation texts are encoded with what was
# fitted on the training texts.
count_val = count_vectorizer.transform(X_val)

In [None]:
# Validation texts encoded with the already-fitted 5000-feature vectorizer.
count_val_5000 = count_vectorizer_5000.transform(X_val)

**Test Data**

In [None]:
# Transform only (no refit): the test texts are encoded with what was fitted
# on the training texts.
count_test = count_vectorizer.transform(X_test)

In [None]:
# Test texts encoded with the 5000-feature count vectorizer, so the result has
# the same columns as count_train_5000 and count_val_5000.
count_test_5000 = count_vectorizer_5000.transform(X_test)

## Modeling and Evaluation

### Defining the Functions

In [None]:
def scores(y_true, y_pred):
    """Compute the evaluation metrics for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        tuple: ``(accuracy, f1_micro, f1_macro, hamming_loss)``, each value
        multiplied by 100 so it reads as a percentage.
    """
    to_percent = 100

    # One F1 score per averaging strategy, keyed by the strategy name.
    f1_by_average = {
        strategy: f1_score(y_true, y_pred, average=strategy) * to_percent
        for strategy in ('micro', 'macro')
    }

    return (
        accuracy_score(y_true, y_pred) * to_percent,
        f1_by_average['micro'],
        f1_by_average['macro'],
        hamming_loss(y_true, y_pred) * to_percent,
    )

In [None]:
def print_scores(accuracy, f1_micro_average, f1_macro_average, hamming_loss_score):
    """Print the four metrics, one per line, each followed by a percent sign.

    The macro-averaged F1 is printed before the micro-averaged F1, although
    the parameters arrive in micro-then-macro order.

    Args:
        accuracy: Accuracy value to display.
        f1_micro_average: Micro-averaged F1 value to display.
        f1_macro_average: Macro-averaged F1 value to display.
        hamming_loss_score: Hamming loss value to display.
    """
    report_lines = (
        ('Accuracy: ', accuracy),
        ('F1 Macro Average: ', f1_macro_average),
        ('F1 Micro Average: ', f1_micro_average),
        ('Hamming Loss: ', hamming_loss_score),
    )
    for label, value in report_lines:
        print(label, value, '%')

In [None]:
def train_model(base_model, X_train, y_train, X_test, y_test):
    """Fit ``base_model`` on the training data and predict the test data.

    The training-set metrics (via ``scores``) are printed with
    ``print_scores`` as a side effect.

    Args:
        base_model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            It is fitted in place; no copy is made.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict after fitting.
        y_test: Unused. Kept so existing call sites remain valid.

    Returns:
        tuple: ``(model, test_predictions)`` where ``model`` is the fitted
        ``base_model`` and ``test_predictions`` is ``model.predict(X_test)``.
    """
    model = base_model
    model.fit(X_train, y_train)

    # Report how well the model fits the data it was trained on.
    train_predictions = model.predict(X_train)
    print_scores(*scores(y_train, train_predictions))

    test_predictions = model.predict(X_test)

    return model, test_predictions

In [None]:
def tune_and_train_model(model, hyperparameters,
                          X_train, y_train, 
                          X_test, y_test, 
                          scoring='accuracy', cv = 5):
    """Grid-search ``hyperparameters`` for ``model`` and predict the test data.

    Prints the model being tuned, the best hyperparameters found, and the
    training-set metrics (same report as ``train_model``).

    Args:
        model: Estimator to tune.
        hyperparameters: Parameter grid passed to ``GridSearchCV``.
        X_train: Training features used for the cross-validated search.
        y_train: Training labels.
        X_test: Features to predict with the tuned model.
        y_test: Unused. Kept so existing call sites remain valid.
        scoring: Metric ``GridSearchCV`` optimizes.
        cv: Cross-validation setting passed to ``GridSearchCV``.

    Returns:
        tuple: ``(best_estimator, test_predictions)``.
    """
    print('Tuning', str(model) + '...')

    model_cv = GridSearchCV(model, hyperparameters, cv = cv, scoring = scoring)
    model_cv.fit(X_train, y_train)

    print('Best hyperparameters: ', model_cv.best_params_)

    # With GridSearchCV's default refit=True, predict() uses the best
    # estimator refitted on all of X_train.
    train_predictions = model_cv.predict(X_train)
    print_scores(*scores(y_train, train_predictions))

    test_predictions = model_cv.predict(X_test)

    return model_cv.best_estimator_, test_predictions

### Logistic Regression

#### Model Training 

In [None]:
# Baseline logistic regression with scikit-learn's default settings.
log_reg = LogisticRegression()

In [None]:
# Train on the TF-IDF matrices: X_train / X_test hold raw text, which the
# estimator cannot fit on directly.
log_reg, lr_test_predictions = train_model (log_reg, tfidf_train, y_train, tfidf_test, y_test)

In [None]:
# Plot the confusion matrix of the true test labels against the baseline
# model's predictions.
ConfusionMatrixDisplay(confusion_matrix(y_test, lr_test_predictions)).plot()

In [None]:
# Compute and print the baseline model's metrics on the test split.
accuracy, f1_micro_average, f1_macro_average, hamming_loss_score = scores (y_test, lr_test_predictions)   
print_scores (accuracy, f1_micro_average, f1_macro_average, hamming_loss_score)

#### Hyperparameter Tuning

In [None]:
# Hyperparameter grid for logistic regression: 4 values of C x 6 values of
# max_iter = 24 combinations.
lr_hp_space = [{
    'C' : [0.01, 0.1, 1, 10],
    'max_iter' : [50, 100, 300, 600, 900, 1100] 
}]

In [None]:
# Fresh, unfitted estimator for the grid search (rebinds log_reg).
log_reg = LogisticRegression()

In [None]:
# Tune on the TF-IDF matrices: X_train / X_test hold raw text, which the
# estimator cannot fit on directly.
lr_tuned_model, lr_tuned_test_predictions = tune_and_train_model (log_reg, lr_hp_space, 
                                                                  tfidf_train, y_train, 
                                                                  tfidf_test, y_test)

#### Evaluation

#### Feature Importance

### Multinomial Naive Bayes

#### Model Training 

#### Hyperparameter Tuning

#### Evaluation

#### Feature Importance

### Random Forest Classifier

#### Model Training 

#### Hyperparameter Tuning

#### Evaluation

#### Feature Importance