# Training Clause Detection Model

This folder is for those interested in the training process or wanting to retrain the model. If you just want to use the pre-trained model, it is accessible at `../clause_identifier_model.pkl`.

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tclp.clause_detector import detector_utils as du

## Loading Synthetic Data into a Dataframe

In [None]:
# Source folders for the labeled contract corpora loaded below (paths are
# relative to this notebook).
modified_data_folder = "../../data/synth_data/modified_real"
untouched_data_folder = "../../data/synth_data/untouched"
modified_gen_data_folder = "../../data/synth_data/modified_gen"

In [None]:
# Read the three labeled-contract folders. Both "modified" folders are loaded
# with modified=True; the untouched folder relies on the loader's default.
(
    texts,
    labels,
    contract_ids,
    contract_level_labels,
) = du.load_labeled_contracts(modified_data_folder, modified=True)

(
    texts_untouched,
    labels_untouched,
    contract_ids_untouched,
    contract_level_labels_untouched,
) = du.load_labeled_contracts(untouched_data_folder)

(
    texts_gen,
    labels_gen,
    contract_ids_gen,
    contract_level_labels_gen,
) = du.load_labeled_contracts(modified_gen_data_folder, modified=True)

In [None]:
# Concatenate the three sources field by field, using the same source order
# (modified-real, untouched, generated) for every field.
# NOTE: this rebinds the original names, so re-running this cell without first
# re-running the loading cell appends the untouched and generated data again.
texts, labels = (
    texts + texts_untouched + texts_gen,
    labels + labels_untouched + labels_gen,
)
contract_ids, contract_level_labels = (
    contract_ids + contract_ids_untouched + contract_ids_gen,
    contract_level_labels + contract_level_labels_untouched + contract_level_labels_gen,
)

In [None]:
# Build the base dataframe from the combined contract fields; per its name the
# helper also cleans the rows (details live in detector_utils).
data = du.create_and_clean_base_df(
    texts,
    labels,
    contract_ids,
    contract_level_labels,
)

In [None]:
# Display the contract dataframe for a quick visual check.
data

## Load Real Clauses for Training Data

In [None]:
# Folder of clause files that is passed to du.load_clauses.
clause_folder = "../../data/cleaned_clauses_detect"

In [None]:
# Load the clause data; the four returned sequences are used as the columns of
# the clause dataframe built next.
(
    clause_texts,
    clause_labels,
    clause_ids,
    clause_reality,
) = du.load_clauses(clause_folder)

In [None]:
# Arrange the clause data in the same form as the contract dataframe so the two
# can be combined. Every clause row gets a contract-level label of 1.
clause_columns = {
    "contract_ids": clause_ids,
    "text": clause_texts,
    "label": clause_labels,
    "contract_label": [1] * len(clause_ids),
    "real_clause": clause_reality,
}
clause_data = pd.DataFrame(clause_columns)

In [None]:
# Display the clause dataframe for a quick visual check.
clause_data

## Creating Full Dataframe

Now that my two dataframes are in the same form, I can combine them to create one data set. 

From there, I can create my training, testing, and validation data.

In [None]:
# Stack the contract rows and the clause rows into one frame; ignore_index=True
# discards both original indexes and renumbers the rows from 0.
full_data = pd.concat(objs=[data, clause_data], ignore_index=True)

In [None]:
# Display the combined dataframe for a quick visual check.
full_data

## Train Test Split

I will keep each contract's rows together in a single split (train, validation, or test) and ensure that anything with a positive 'real_clause' designation is in the training set.

In [None]:
# Split the combined frame into train / validation / test partitions, also
# keeping the index sets the helper returns for each partition.
# "real_clause" names the column that flags real clauses, which the notebook
# text says should end up in the training set.
(
    train_data,
    val_data,
    test_data,
    train_indices,
    val_indices,
    test_indices,
) = du.custom_train_test_split(full_data, "real_clause")

In [None]:
# Separate features (X) from targets (y) for each partition, calling the helper
# in train, validation, test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    du.X_y_split(partition) for partition in (train_data, val_data, test_data)
)

In [None]:
# Save the held-out test contracts for future use. The two paths are passed
# positionally; their exact roles are defined by du.save_test_data.
du.save_test_data(
    test_data,
    "../../data/synth_data/combined",
    "../../data/test_contracts",
)

## Training a Model

In [None]:
# Clause-level classifier: TF-IDF features feeding a logistic regression.
tfidf_step = TfidfVectorizer(
    max_features=None,  # no cap on vocabulary size
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,  # drop terms seen in fewer than two documents
)
logreg_step = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # reweight classes inversely to their frequency
    solver="liblinear",
    C=10,  # inverse regularization strength
)
model = Pipeline([("tfidf", tfidf_step), ("logreg", logreg_step)])
# Kept as the cell's last expression so the notebook still renders the fitted pipeline.
model.fit(X_train, y_train)

In [None]:
# Save the fitted pipeline to disk at the path the notebook intro gives for the
# pre-trained model. Note that this runs before the evaluation cells below.
du.save_model(model, "../clause_identifier_model.pkl")

In [None]:
# Clause-level evaluation on the validation split; the returned predictions are
# reused below to build the contract-level dataframe.
y_pred = du.evaluate_model_clause_level(model, X_val, y_val)

## Extrapolating Results to Contracts

In [None]:
# Combine the validation inputs, validation rows, and clause-level predictions
# into a per-contract dataframe (the aggregation lives in du.create_contract_df).
val_contract_df = du.create_contract_df(X_val, val_data, y_pred)

In [None]:
# Display the validation contract dataframe for a quick visual check.
val_contract_df

In [None]:
# Sweep integer thresholds 1 through 7 on the validation contracts with the F1 metric.
# NOTE(review): presumably the threshold is the number of flagged clauses needed
# to flag a contract — confirm in du.threshold_graphs.
f1s, f1_thresholds = du.threshold_graphs(
    val_contract_df, thresholds=range(1, 8), metric_type="f1"
)

In [None]:
# Print the contract-level classification report for the validation set, using
# the F1 scores and thresholds from the sweep.
du.print_contract_classification_report(val_contract_df, f1s, f1_thresholds)

## Test Set

In [None]:
# Predict clause labels for the test split, then build the per-contract frame
# with the same helper used for validation.
test_predictions = model.predict(X_test)
test_contract_df = du.create_contract_df(X_test, test_data, test_predictions)

In [None]:
# Sweep integer thresholds 1 through 6 on the test contracts with the F1 metric.
# This rebinds f1s / f1_thresholds, replacing the validation results.
# NOTE(review): the validation sweep uses range(1, 8) but this one uses
# range(1, 7) — confirm the difference is intentional.
f1s, f1_thresholds = du.threshold_graphs(
    test_contract_df, thresholds=range(1, 7), metric_type="f1"
)

In [None]:
# Print the contract-level classification report for the test set, using the
# F1 scores and thresholds from the test sweep.
du.print_contract_classification_report(test_contract_df, f1s, f1_thresholds)