# Model & Evaluation

## 1. Load Packages and Datasets

In [1]:
import json
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
import time
from sklearn.metrics import accuracy_score

from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, PatternRecognizer, Pattern, RecognizerResult
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from sklearn.model_selection import GridSearchCV
#from presidio_analyzer.predefined_recognizers import EmailRecognizer, UrlRecognizer, PhoneRecognizer

from tqdm.auto import tqdm
from dateutil import parser

In [48]:
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

NameError: name 'warnings' is not defined

In [2]:
# Load pre-processed train and test set from EDA notebook
train_df = pd.read_json('/Users/Kayan/Desktop/ADS-599-Capstone-Project/data/preprocess_train.json')
test_df = pd.read_json('/Users/Kayan/Desktop/ADS-599-Capstone-Project/data/preprocess_test.json')

raw_train_df = pd.read_json('/Users/Kayan/Desktop/ADS-599-Capstone-Project/data/train.json')
raw_test_df = pd.read_json('/Users/Kayan/Desktop/ADS-599-Capstone-Project/data/test.json')

## 2. Model Building

### 2.1 Topic Model

In [6]:
# Sentiment analysis
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Define a function to perform sentiment analysis on a single piece of text
def analyze_sentiment(text):
    # Get sentiment scores
    scores = sid.polarity_scores(text)

    # Classify sentiment
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to your text data
train_df['sentiment'] = train_df['full_text'].apply(analyze_sentiment)
test_df['sentiment'] = test_df['full_text'].apply(analyze_sentiment)

# Print some sample results
print("Sample results from training data:")
print(train_df[['full_text', 'sentiment']].head())

print("\nSample results from test data:")
print(test_df[['full_text', 'sentiment']].head())

NameError: name 'nltk' is not defined

In [7]:
# NMF Model
corpus_train = train_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))
corpus_test = test_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))
corpus = pd.concat([corpus_train, corpus_test], ignore_index=True)

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_X = tfidf_vectorizer.fit_transform(corpus)

# Count Vectorizer
count_vectorizer = CountVectorizer(max_features=1000)
count_X = count_vectorizer.fit_transform(corpus)

# Initialize and fit NMF model
num_topics = 5
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_X)

def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1]
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print(" %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))


display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())


Topic 00
 launch (2.04)
 learning (1.69)
 team (1.22)
 product (0.97)
 customers (0.82)

Topic 01
 storytelling (1.88)
 story (1.86)
 people (1.17)
 stories (1.12)
 audience (0.60)

Topic 02
 mind (3.08)
 mapping (2.43)
 ideas (1.18)
 map (1.17)
 tool (0.86)

Topic 03
 graphic (2.91)
 visualization (1.92)
 group (1.77)
 problem (1.66)
 straw (1.35)

Topic 04
 students (9.79)
 student (2.16)
 school (2.15)
 teachers (1.56)
 class (1.37)


In [9]:
#LSA Model
lsa_model = TruncatedSVD(n_components=num_topics, random_state=42)
lsa_model.fit(tfidf_X)

# Display topics
display_topics(lsa_model, tfidf_vectorizer.get_feature_names_out())


Topic 00
 tool (0.64)
 team (0.59)
 mind (0.59)
 learning (0.50)
 design (0.50)

Topic 01
 story (42.47)
 storytelling (41.01)
 stories (27.16)
 people (18.62)
 telling (12.54)

Topic 02
 mind (85.48)
 mapping (66.68)
 map (33.79)
 graphic (26.18)
 ideas (25.05)

Topic 03
 graphic (18.38)
 visualization (9.30)
 group (8.78)
 straw (8.70)
 man (7.74)

Topic 04
 students (56.99)
 learning (19.50)
 launch (14.99)
 school (12.87)
 student (12.75)


In [8]:
# LDA Model
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(count_X)

# Display topics
display_topics(lda_model, count_vectorizer.get_feature_names_out())


Topic 00
 storytelling (1.91)
 story (1.83)
 people (1.80)
 tool (1.07)
 one (0.94)

Topic 01
 visualization (2.11)
 process (1.60)
 tool (1.38)
 team (1.35)
 would (1.04)

Topic 02
 mind (4.53)
 mapping (3.24)
 ideas (2.01)
 tool (1.89)
 design (1.82)

Topic 03
 group (3.27)
 problem (2.96)
 graphic (2.61)
 insights (2.05)
 identify (1.74)

Topic 04
 learning (2.32)
 launch (2.17)
 team (1.66)
 product (1.35)
 customers (1.24)


### 2.2 Random Forest Model

In [3]:
# Convert list of tokens back to strings
train_df['tokens_joined'] = train_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))

# Extract features and labels
X = train_df['tokens_joined']
y = train_df['labels_processed']

mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y)

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X_tfidf, y_bin, test_size=0.2, random_state=599)

In [22]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=599)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = rf_classifier.predict(X_valid)

# Evaluate the model
print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00        12
           2       1.00      0.01      0.01       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       1.00      0.78      0.88      1744
   macro avg       0.15      0.08      0.08      1744
weighted avg       0.89      0.78      0.78      1744
 samples avg       1.00      0.91      0.93      1744



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3 KNN Model

In [10]:
knn_classifier = KNeighborsClassifier()

# Record start time
start_time = time.time()

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_knn = knn_classifier.predict(X_valid)

# Record end time
end_time = time.time()

# Calculate runtime
runtime = end_time - start_time
print("Runtime:", runtime, "seconds")

# Calculate accuracy
accuracy = accuracy_score(y_valid, y_pred_knn)
print("Accuracy:", accuracy)

# Evaluate the model
print(classification_report(y_valid, y_pred_knn))

Runtime: 27.638978242874146 seconds
Accuracy: 0.8531571218795888
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      0.08      0.15        12
           2       0.38      0.04      0.08       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.22      0.02      0.04       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       0.98      0.79      0.87      1744
   macro avg       0.20      0.09      0.10      1744
weighted avg       0.85      0.79      0.79      1744
 samples avg   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
param_grid_knn = {
    'n_neighbors': range(1, 11, 1), 
    'leaf_size': range(20, 41, 1),   
    'p': [1, 2],                       
    'weights': ['uniform', 'distance'],  
    'metric': ['minkowski', 'chebyshev']  
}
start_time_knn = time.time()

# Instantiate GridSearchCV
grid_search = GridSearchCV(knn_classifier, param_grid_knn, cv=5, scoring='accuracy')

# Perform grid search to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
final_knn_model = KNeighborsClassifier(**best_params)

# Train the model on the training data
final_knn_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_knn_param = final_knn_model.predict(X_valid)

end_time_knn = time.time()

In [None]:
# Evaluate the model
runtime_knn_param = end_time_knn - start_time_knn
print("Model runtime:", runtime_knn_param, "seconds")
evaluate(y_valid, y_pred_knn_param)

### 2.5 BERT with MS Presidio Model

In [20]:
#Only using TRAIN Data and splitting in 80/20

train = pd.read_json('./../data/train.json')
#test = pd.read_json('./../data/test.json')

preprocessed_train = pd.read_json('./../data/preprocess_train.json')
#preprocessed_test = pd.read_json('./../data/preprocess_test.json')

In [21]:
# Functions
def token_index(row):
    tokens  = row['tokens']
    start_ind = []
    end_ind = []
    prev_ind = 0
    for tok in tokens:
        start = prev_ind + row['full_text'][prev_ind:].index(tok)
        end = start+len(tok)
        start_ind.append(start)
        end_ind.append(end)
        prev_ind = end
    return start_ind, end_ind

def find_larger(arr, target):
    left, right = 0, len(arr) - 1

    while left <= right:
        mid = (left + right) // 2

        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return left

def count_whitespaces(word):
    return len(word) - len(word.rstrip())

def date_check(text):
    try:
        parsed_date = parser.parse(text)
        return True
    except:
        return False
    

def pii_fbeta_score(pred_df, gt_df,beta=5):
    df = pred_df.merge(gt_df,how='outer',on=['document',"token"],suffixes=('_pred','_gt'))

    df['cm'] = ""

    df.loc[df.label_gt.isna(),'cm'] = "FP"
    df.loc[df.label_pred.isna(),'cm'] = "FN"
    df.loc[(df.label_gt.notna()) & (df.label_gt!=df.label_pred),'cm'] = "FN"

    df.loc[(df.label_pred.notna()) & (df.label_gt.notna()) & (df.label_gt==df.label_pred),'cm'] = "TP"
    
    FP = (df['cm']=="FP").sum()
    FN = (df['cm']=="FN").sum()
    TP = (df['cm']=="TP").sum()
    
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    f1 = precision * recall / (precision + recall)
    
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1-Score: " + str(f1))

    s_micro = (1+(beta**2))*TP/(((1+(beta**2))*TP) + ((beta**2)*FN) + FP)

    return s_micro

In [22]:
# Modeling
x = pd.DataFrame(preprocessed_train)
y = x['labels']
x = x.drop(columns='labels')
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.25, random_state=0)

In [23]:
ALLOW_LIST = []
DENY_LIST_EMAIL = []
DENY_LIST_ADDRESS = []
DENY_LIST_URL = []
DENY_LIST_NAME = []
DENY_LIST_PHONE = []
DENY_LIST_ID = []

In [24]:
all_stopwords = list(stopwords.words())
words = Counter()
for doc in preprocessed_train.tokens:
    words.update(doc)
#for doc in preprocessed_test.tokens:
#    words.update(doc)
all_stopwords  += [str(w).lower() for w, i in words.items() if i > 55]
all_stopwords = list(sorted(set(all_stopwords)))
del words

ALLOW_LIST.extend(all_stopwords)

In [25]:
PHONE_ALLOW_LIST = ['phone', 'number', 'telephone', 'cell', 'cellphone',
              'mobile', 'call', 'ph', 'tel', 'mobile', 'Email']
URL_DENY_LIST = ["wikipedia", "coursera", ".pdf", ".PDF", "article",
             ".png",".gov", ".work", ".ai", ".firm", ".arts",
             ".store", ".rec", ".biz", ".travel", '.ru', 'designabetterbusiness', '.tools', 'designorate',
                       'designresearchtechniques', 'ec', '.europa', 'forbes', 'google',
                       'ideas', 'trello', '.edu']

In [26]:
#Prepping the list of datasets for PII
DENY_LIST_NAME.extend(us_names)
DENY_LIST_NAME.extend(nltk_female)
DENY_LIST_NAME.extend(nltk_male)
DENY_LIST_NAME.extend(french_dept)
DENY_LIST_NAME.extend(french_nat)

tokens = train_x['tokens'].apply(pd.Series).stack().reset_index(drop=True).tolist()
labels = train_y.apply(pd.Series).stack().reset_index(drop=True).tolist()

#tokens = train['tokens'].apply(pd.Series).stack().reset_index(drop=True).tolist()
#labels = train['labels'].apply(pd.Series).stack().reset_index(drop=True).tolist()

for i in set(labels):
    indices = [j for j in range(len(labels)) if labels[j] == i]
    if i == 'O':
        ALLOW_LIST.extend([tokens[i] for i in indices])
    if i == 'B-EMAIL':
        DENY_LIST_EMAIL.extend([tokens[i] for i in indices])
    elif i in ['B-STREET_ADDRESS', 'I-STREET_ADDRESS']:
        DENY_LIST_ADDRESS.extend([tokens[i] for i in indices])
    elif i in ['B-URL_PERSONAL', 'I-URL_PERSONAL']:
        DENY_LIST_URL.extend([tokens[i] for i in indices])
    elif i in ['B-NAME_STUDENT', 'I-NAME_STUDENT']:
    #elif i in ['I-NAME_STUDENT']:
        DENY_LIST_NAME.extend([tokens[i] for i in indices])
    elif i in ['B-PHONE_NUM', 'I-PHONE_NUM']:
        DENY_LIST_PHONE.extend([tokens[i] for i in indices])
    elif i in ['B-ID_NUM', 'I-ID_NUM']:
        DENY_LIST_ID.extend([tokens[i] for i in indices])
    else:
        continue


In [27]:
id_regex = r'([A-Za-z]{2}[.?]:)?\d{12,12}'
id_pattern = Pattern(name="id", regex=id_regex, score = 0.5)
id_recognizer = PatternRecognizer(supported_entity="ID_CUSTOM", patterns = [id_pattern])

address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(cir(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
address_pattern = Pattern(name="address", regex=address_regex, score=0.5)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM", patterns = [address_pattern], context=["st", "Apt"])

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_pattern = Pattern(name="email address", regex=email_regex, score=0.5)
email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM", patterns = [email_pattern])

url_regex = r'((https?)|(http?)|(ftp?))://\S+|www\.\S+'
url_pattern = Pattern(name="url", regex=url_regex, score=0.5)
url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM", patterns = [url_pattern])

phone_regex = r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$'
phone_pattern = Pattern(name='phone', regex=phone_regex, score=0.5)
phone_recognizer = PatternRecognizer(supported_entity='PHONE_CUSTOM', patterns=[phone_pattern])

In [28]:
class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts: NlpArtifacts) -> list[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results

new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [29]:
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

In [30]:
dictionary = RecognizerRegistry()
dictionary.load_predefined_recognizers()
dictionary.add_recognizer(address_recognizer)
dictionary.add_recognizer(email_recognizer)
dictionary.add_recognizer(url_recognizer)
dictionary.add_recognizer(phone_recognizer)
dictionary.add_recognizer(id_recognizer)

In [31]:
analyzer = AnalyzerEngine(supported_languages=['en'],
                          registry=dictionary,
                          nlp_engine=nlp_engine,
                          context_aware_enhancer=LemmaContextAwareEnhancer(
                              context_similarity_factor=0.6,
                              min_score_with_context_similarity=0.4
                          ))

In [32]:
# Training/Testing
preds = []
#test = preprocessed_test
test = val_x
temp = test.apply(lambda x: token_index(x), axis=1)
test['start'] = temp.apply(lambda x: x[0])
test['end'] = temp.apply(lambda x: x[1])

In [33]:
for i, d in enumerate(tqdm(test.iterrows())):
    results = analyzer.analyze(text=d[1]['full_text'],
                               entities=["PHONE_CUSTOM", "PERSON", "URL_CUSTOM", "EMAIL_ADDRESS",
                                         "EMAIL_CUSTOM", "ADDRESS_CUSTOM", "US_SSN", "US_ITIN",
                                         "US_PASSPORT", "US_BANK_NUMBER", "USERNAME", "ID_CUSTOM"],
                               allow_list=ALLOW_LIST,
                               language='en', 
                               score_threshold=0.005)
    pre_preds = []
    for r in results:
        s = find_larger(d[1]['start'], r.start)
        end = r.end
        word = d[1]['full_text'][r.start:r.end]
        end = end - count_whitespaces(word)
        temp_preds = [s]
        try:
            while d[1]['end'][s+1] <= end:
                temp_preds.append(s+1)
                s +=1
        except:
            pass
        
        tmp = False
        
        if r.entity_type == 'PHONE_CUSTOM':
            if date_check(word):
                continue
            for w in PHONE_ALLOW_LIST:
                if w in d[1]['full_text'][max(r.start-50, 0):min(r.end+50, len(d[1]['full_text']))]:
                    tmp = False
                    break
                else:
                    tmp = True 
            label =  'PHONE_NUM'
        if r.entity_type == 'PERSON':
            if str(i).upper() in wikipedia:
                tmp = True
                break
            label =  'NAME_STUDENT'
        if r.entity_type == 'URL_CUSTOM':
            for w in URL_DENY_LIST:
                if w in word:
                    tmp = True
                    break
            label = 'URL_PERSONAL'
        if r.entity_type == 'EMAIL_ADDRESS' or r.entity_type == 'EMAIL_CUSTOM':
            label = "EMAIL"
        if r.entity_type == 'ADDRESS_CUSTOM':
            label = 'STREET_ADDRESS'
        if r.entity_type in ['US_SSN', 'US_ITIN', 'US_PASSPORT', 'US_BANK_NUMBER', 'ID_CUSTOM']:
            label = 'ID_NUM'
        if r.entity_type == 'USERNAME':
            label =  'USERNAME'
        if tmp:
            continue
        for p in temp_preds:
            if len(pre_preds) > 0:
                if pre_preds[-1]['rlabel'] == r.entity_type and ((p - pre_preds[-1]['token'])==1):
                    label_f = "I-"+label
                else:
                    label_f = "B-"+label
            else:
                label_f = "B-"+label
            pre_preds.append(({
                    "document":d[1]['document'],
                    "token":p,
                    "label":label_f,
                    "rlabel":r.entity_type
                }))
    preds.extend(pre_preds)

0it [00:00, ?it/s]1702it [03:18,  8.56it/s]


In [34]:
predicted_results = pd.DataFrame(preds).iloc[:,:-1].reset_index()
predicted_results.columns = ['row_id','document', 'token', 'label']

temp = val_x[['document']].join(val_y)
dictionary = temp['labels'].apply(lambda x: {'indx': list(range(len(x))), 'vals': x})
indices = dictionary.apply(lambda x: x['indx']).explode()
values = dictionary.apply(lambda x: x['vals']).explode()

ground_truth = pd.concat([indices, values], axis=1).reset_index()
ground_truth['document'] = ground_truth['index'].apply(lambda x: temp['document'][x])
ground_truth = ground_truth.drop(columns='index')
ground_truth.columns = ['token', 'label', 'document']
ground_truth = ground_truth[ground_truth['label'] != 'O']
ground_truth = ground_truth.reset_index(names=['row_id'])


In [37]:
print("FBeta Score: " + str(pii_fbeta_score(predicted_results, ground_truth, 5)))

Precision: 0.27905004240882103
Recall: 0.8255959849435383
F1-Score 0.20855784469096672
FBeta Score: 0.7677601759188619
