# Light-weight TMF Classifier - Field Test

### Load the Job Data

In [1]:
import pandas as pd
import numpy as np

# Load the field-test job records from CSV (document titles with their artifact labels)
job = pd.read_csv("data/01/field_data.csv")
job.head()

Unnamed: 0,Title,Subartifact,Artifact #,email
0,RNPK1007 update,Relevant Communications,99.00.14,X
1,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,Minutes,99.00.19,
2,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,Minutes,99.00.19,
3,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,Minutes,99.00.19,
4,Total Change Order Fee,Invoices,05.04.07,


### Some of the job data is expected to be unlabeled

In [2]:
# Discard rows that are missing either the title text or the artifact label
required_columns = ['Title', 'Artifact #']
job.dropna(subset=required_columns, inplace=True)
job.shape

(93, 4)

## Preprocessing Unseen Data

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

from concurrent.futures import ThreadPoolExecutor
import time

# Custom transformer using NLTK PorterStemmer and tokenizer
class StemmingTransformer():
    """Stem every token of each input text with NLTK's PorterStemmer.

    Provides ``fit``/``transform`` methods; ``fit`` learns nothing.

    Attributes:
        stemmer: the ``PorterStemmer`` instance applied to each token.
        transform_time: seconds taken by the most recent ``transform`` call,
            or ``None`` if ``transform`` has not been called yet.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.transform_time = None

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def stem_text(self, text):
        """Tokenize ``text`` and return its stemmed tokens joined by single spaces."""
        return " ".join(self.stemmer.stem(token) for token in nltk.word_tokenize(text))

    def transform(self, X):
        """Stem every text in ``X``.

        Args:
            X: iterable of strings.

        Returns:
            list of stemmed strings, in the same order as ``X``.
        """
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
        started = time.perf_counter()

        # executor.map yields results in input order
        with ThreadPoolExecutor() as executor:
            X_transformed = list(executor.map(self.stem_text, X))

        self.transform_time = time.perf_counter() - started
        return X_transformed

# Custom transformer using NLTK lemmatizer and tokenizer
class LemmatizingTransformer():
    """Lemmatize every token of each input text with NLTK's WordNetLemmatizer.

    Provides ``fit``/``transform`` methods; ``fit`` learns nothing.

    Attributes:
        stemmer: the ``WordNetLemmatizer`` instance (the attribute name is kept
            as ``stemmer`` so existing callers keep working).
        transform_time: seconds taken by the most recent ``transform`` call,
            or ``None`` if ``transform`` has not been called yet.
    """

    def __init__(self):
        self.stemmer = WordNetLemmatizer()
        self.transform_time = None

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def stem_text(self, text):
        """Tokenize ``text`` and return its lemmatized tokens joined by single spaces."""
        return " ".join(self.stemmer.lemmatize(token) for token in word_tokenize(text))

    def transform(self, X):
        """Lemmatize every text in ``X``.

        Args:
            X: iterable of strings.

        Returns:
            list of lemmatized strings, in the same order as ``X``.
        """
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
        started = time.perf_counter()
        # Delegate to stem_text so the per-text logic lives in exactly one place
        X_transformed = [self.stem_text(text) for text in X]
        self.transform_time = time.perf_counter() - started
        return X_transformed

# Download the NLTK resources this notebook relies on (a no-op when already up to date):
#   punkt     - tokenizer models for word_tokenize
#   wordnet   - lexical database for WordNetLemmatizer
#   stopwords - word lists for stopwords.words('english')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')  # last expression: its return value is shown as the cell output
# NOTE(review): left disabled; possibly needed by some NLTK versions for WordNet - TODO confirm
# nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Text column to classify and label column to predict
input_var, target = 'Title', 'Artifact #'

# Stem the raw titles; transform() returns a plain list that is assigned back in row order
stemmer = StemmingTransformer()
job['stem'] = stemmer.transform(job[input_var])
print(f'Time lapsed: {stemmer.transform_time:.4f}')
print(f'{job.shape[0]} rows')

# Features stay a one-column DataFrame (double brackets); labels are a Series
X = job[['stem']]
y = job[target]

Time lapsed: 0.0560
93 rows


### Load the pre-trained Classifiers

In [5]:
from joblib import dump, load

# Model pre-trained with the TMF Reference Model specifications
version = '0.1'
model_filename =  f"TMF_classifier_v{version}.joblib"
# NOTE(review): joblib's load unpickles the file, so only load model files from a trusted source
tmf_classifier = load(model_filename)

# Model pre-trained with GPT-augmented, synthetic training data (same version tag)
gpt_model_filename =  f"TMF_classifier_gpt_v{version}.joblib"
gpt_classifier = load(gpt_model_filename)

### Performance Metrics

In [6]:
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` and split its predictions into hits and misses.

    Args:
        model: object exposing ``predict(X)`` and ``score(X, y)``.
        X: features passed unchanged to ``model.predict`` and ``model.score``.
        y: iterable of true labels, aligned with the rows of ``X``.

    Returns:
        tuple ``(score, hit, miss)`` where ``score`` is whatever
        ``model.score(X, y)`` returns, and ``hit`` / ``miss`` are DataFrames
        with columns ``'Actual'`` and ``'Predicted'`` listing the correctly and
        incorrectly predicted rows. Both frames always carry those two columns,
        even when they contain no rows.
    """
    predictions = model.predict(X)
    score = model.score(X, y)

    columns = ['Actual', 'Predicted']
    hit, miss = [], []
    for real, predicted in zip(y, predictions):
        bucket = hit if real == predicted else miss
        bucket.append({'Actual': real, 'Predicted': predicted})

    # Pass the columns explicitly so an empty bucket still yields the expected schema
    hit = pd.DataFrame(hit, columns=columns)
    miss = pd.DataFrame(miss, columns=columns)
    return score, hit, miss

# Score both pre-trained classifiers on the same stemmed job data (X, y).
# The *_hit / *_miss frames list the matching and mismatching predictions.
tmf_score, tmf_hit, tmf_miss = evaluate_model(tmf_classifier, X, y)
print(f"TMF model Score: {tmf_score:.6f}")

gpt_score, gpt_hit, gpt_miss = evaluate_model(gpt_classifier, X, y)
print(f"GPT-enhanced model Score: {gpt_score:.6f}")

TMF model Score: 0.440860
GPT-enhanced model Score: 0.419355


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

## Expanding Training Data from Job

In [8]:
# pd.DataFrame({'model': ['Logistic', 'Decision Tree', 'Bayes'], 
#              'best_params': ['', '', ''],
#              'best_score': ['', '', '']}).set_index('model');

In [9]:
# Working copy of the job data without the columns that are not used for training
unused_columns = ['Subartifact', 'email']
df = job.drop(columns=unused_columns).copy()

In [10]:
input_var = 'Title'

# Add a lemmatized version of every title next to the existing stemmed one
lemmatizer = LemmatizingTransformer()
lemmas = lemmatizer.transform(df[input_var])
df['lemma'] = lemmas
print(f'Time lapsed:{lemmatizer.transform_time}')
df.head()

Time lapsed:1.3975207805633545


Unnamed: 0,Title,Artifact #,stem,lemma
0,RNPK1007 update,99.00.14,rnpk1007 updat,RNPK1007 update
1,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,99.00.19,salix rnpk1007 ( ca25187 ) proiect statu date ...,Salix RNPK1007 ( CA25187 ) Proiect Status Date...
2,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,99.00.19,salix rnpk1007 ( ca25187 ) proiect statu date ...,Salix RNPK1007 ( CA25187 ) Proiect Status Date...
3,Salix RNPK1007 (CA25187) Proiect Status\nDate ...,99.00.19,salix rnpk1007 ( ca25187 ) proiect statu date ...,Salix RNPK1007 ( CA25187 ) Proiect Status Date...
4,Total Change Order Fee,05.04.07,total chang order fee,Total Change Order Fee


## Load and Tokenize Base Training Data

In [11]:
# Read the labeled base training sets (Excel columns B and C of each sheet)
# and stack them into a single frame.
labeled_data_file = "data/labeled_training_sets.xlsx"
base_training_sets = ['TMF 3.3', 'GPT-1-5']

# Collect one frame per sheet and concatenate once: avoids re-copying the
# accumulated frame on every iteration and avoids concatenating onto an empty
# placeholder DataFrame.
sheet_frames = [
    pd.read_excel(labeled_data_file,
                  sheet_name=training_set,
                  usecols="B, C", header=0)
    for training_set in base_training_sets
]
# Each sheet keeps its own row index, so index values repeat across sheets
labeled_data = pd.concat(sheet_frames) if sheet_frames else pd.DataFrame()
labeled_data

Unnamed: 0,Artifact #,Subartifact Title
0,01.01.01,document transfer documentation
1,01.01.01,evidence of quality review
2,01.01.01,request to lock tmf
3,01.01.01,trial master file plan
4,01.01.01,trial master file index
...,...,...
89,11.03.09,study-level submission dataset
90,11.04.01,interim statistical summary
91,11.04.01,mid-trial statistical report
92,11.04.02,final statistical summary


In [12]:
# --- Base (labeled) training data ---
# Give the text column the same name as in the job data
labeled_data.rename(columns={'Subartifact Title': 'Title'}, inplace=True)
input_var, target = 'Title', 'Artifact #'

stemmer = StemmingTransformer()
base_stems = stemmer.transform(labeled_data[input_var])
labeled_data['stem'] = base_stems
print(f'Stemminig time lapsed: {stemmer.transform_time:.4f}')

# lemmatizer = LemmatizingTransformer()
# labeled_data['lemma'] = lemmatizer.transform(labeled_data[input_var])
# print(f'Lemmatization time lapsed:{lemmatizer.transform_time:4f}')

print(f'{labeled_data.shape[0]} rows')

X_base = labeled_data[['stem']]
y_base = labeled_data[target]

# --- Job data ---
# Use the stemmed column as the single feature, after dropping any incomplete rows
column_name = 'stem'
df.dropna(inplace=True)
X = df[[column_name]]
y = df[target]

# Not enough training data for grid_search
# X_base = pd.concat([X_base, X_base, X_base, X_base, X_base])
# y_base = pd.concat([y_base, y_base, y_base, y_base, y_base])

Stemminig time lapsed: 0.1258
624 rows


Unnamed: 0,stem
0,rnpk1007 updat
1,salix rnpk1007 ( ca25187 ) proiect statu date ...
2,salix rnpk1007 ( ca25187 ) proiect statu date ...
3,salix rnpk1007 ( ca25187 ) proiect statu date ...
4,total chang order fee
...,...
91,transfer of regulatori oblig
92,trial-specif sop plan
93,key vendor personnel approv
94,certif of liabil insur


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# Extra domain stop words.
# NOTE(review): this list is defined but not passed to the vectorizer below,
# which uses NLTK's English stop-word list only.
my_stop_words = ['a', 'of', 'and', 'for', 'to', 
                 'document', 'documentation', 'plan', 'letter', 'form',
                 'information'
                ]  

# Shared seed so both the split and the forest are reproducible across runs
RANDOM_STATE = 32

# TF-IDF over unigrams and bigrams of the 'stem' column
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), 
                                  stop_words=stopwords.words('english'),
                                  lowercase=True), 'stem')
    ],
    remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Seeded: an unseeded forest gives different scores on every run
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
     ])


# Refit on the base training data plus a growing share of the job data and
# score on the job rows that were held out.
# NOTE(review): the held-out set shrinks as the sampling rate grows, so the
# scores are measured on different (and increasingly small) test sets.
score_records = []
for sampling_rate in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
    # Use a different sampling rate each time
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        train_size=sampling_rate,
                                                        random_state=RANDOM_STATE
                                                       )

    # Add labeled base training data; reset the index on both so rows and labels stay aligned
    X_train = pd.concat([X_base, X_train], axis=0, ignore_index=True)
    y_train = pd.concat([y_base, y_train], axis=0, ignore_index=True)
    best_model = pipeline.fit(X_train, y_train)
    test_score = best_model.score(X_test, y_test)
    score_records.append({'Sampling Rate': sampling_rate, 'Test Score': test_score})
    print(f"Sampling rate = {sampling_rate:.2f}, Accuracy score = {test_score:.2f}")
score = pd.DataFrame(score_records)
score
# # Missed predictions
# if pre_train_score < 1:
#     print("Missed Predictions:")
#     preditions = best_model.predict(X)
#     missed = []
#     for real, predicted in zip (y, preditions):
#         if real != predicted:
#             missed.append({'Actual': real, 'Predicted': predicted})

#     missed = pd.DataFrame(missed)
#     print(missed)

Sampling rate = 0.10, Accuracy score = 0.56
Sampling rate = 0.20, Accuracy score = 0.63
Sampling rate = 0.30, Accuracy score = 0.65
Sampling rate = 0.40, Accuracy score = 0.71
Sampling rate = 0.50, Accuracy score = 0.72
Sampling rate = 0.60, Accuracy score = 0.79
Sampling rate = 0.70, Accuracy score = 0.68
Sampling rate = 0.80, Accuracy score = 0.68


Unnamed: 0,Sampling Rate,Test Score
0,0.1,0.559524
1,0.2,0.626667
2,0.3,0.651515
3,0.4,0.714286
4,0.5,0.723404
5,0.6,0.789474
6,0.7,0.678571
7,0.8,0.684211
