**Please also type in the following command line/terminal installations**:

* `brew install libomp` 
* `pip install xgboost`
* `pip install plotly`


**For Interactive Visualizations for Jupyter Notebooks:**
* `pip install "notebook>=5.3" "ipywidgets>=7.2"`

**For Interactive Visualizations for Jupyter Labs:**
* Install node @ https://nodejs.org/en/, then in command line/terminal:
* `pip install jupyterlab "ipywidgets>=7.5"`
* `jupyter labextension install jupyterlab-plotly@4.8.2`

Source: https://plotly.com/python/getting-started/?utm_source=mailchimp-jan-2015&utm_medium=email&utm_campaign=generalemail-jan2015&utm_term=bubble-chart

**For sklearn multiclass and spaCy NLP:**
* `pip install scikit-multilearn`
* `pip install -U spacy`
* `python -m spacy download en_core_web_sm`

In [1]:
#usual imports for sklearn modelling 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score

#for multilabel classification and oversampling
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import SMOTE

#for word vectorization
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

#advanced supervised classification models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
from xgboost import XGBClassifier

#classification metrics imports 
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, hinge_loss
from sklearn.metrics import f1_score
from sklearn import metrics

In [2]:
#We will then be importing these libraries: 

from plotly import __version__
import plotly.offline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px

print(__version__)
#learnt this from 
#https://towardsdatascience.com/interactive-distribution-plots-with-plotly-ea58efc78885

import cufflinks as cf
#we have to make sure this code is run for Jupyter Notebooks otherwise the plots may not happen:

init_notebook_mode(connected=True)

#we have to make sure this code is run otherwise the visualizations won't run offline:
cf.go_offline()

4.8.1


In [3]:
#pre-process text for EDA and later modelling too using spacy and string libraries

import spacy
from spacy.lang.en import English # updated

import re
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy
import spacy.cli
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy import displacy

from collections import Counter

In [4]:
df = pd.read_csv("../data/df_clean_draft_2.csv", index_col=0)

## Text Pre-Processing for Bag of Words Models using `sklearn`

In [5]:
#from https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/
#stopwords removal

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    
    #remove non-words
    tokens = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = re.sub("[0-9]+", "", tokens)
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    tokens = parser(tokens)

    # Lemmatizing each token and converting each token into lowercase
    tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens ]

    # Removing stop words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations]
    
    # return preprocessed list of tokens
    return(" ".join(tokens))

In [6]:
df["bag_cleaned_text"]=df["clause_text"].apply(spacy_tokenizer)

In [7]:
# df = pd.get_dummies(df, columns=["clause_type"])

In [8]:
#we drop the "other clauses" column because if automatic renewals = 0 and renewal options = 0, 
#by definition, it's another kind of clause since the 3 columns are mutually exclusive
# df.drop(['clause_type_other_clauses'], axis=1, inplace = True)

In [9]:
# df.rename(columns={"clause_type_renewal_option_clause": "renewal_option", "clause_type_automatic_renewal_clause": "automatic_renewal", "clause_type_other_clauses": "other_clauses"}, inplace=True)

In [10]:
df.head()

Unnamed: 0,clause_text,clause_type,clause_word_count,bag_cleaned_text
0,Upon the expiration of the original term or an...,automatic_renewal_clause,81,expiration original term renewal term employme...
1,This Agreement shall be automatically extended...,automatic_renewal_clause,49,agreement shall automatically extended additio...
2,"This Agreement shall renew automatically, with...",automatic_renewal_clause,149,agreement shall renew automatically respect se...
3,If a Holder of such Security has not delivered...,automatic_renewal_clause,433,holder security delivered repayment election r...
4,This Agreement shall be renewed automatically ...,automatic_renewal_clause,50,agreement shall renewed automatically succeedi...


In [11]:
df["clause_type"].replace({"automatic_renewal_clause": 0, "renewal_option_clause": 1, "other_clauses":2}, inplace=True)

In [12]:
df.head()

Unnamed: 0,clause_text,clause_type,clause_word_count,bag_cleaned_text
0,Upon the expiration of the original term or an...,0,81,expiration original term renewal term employme...
1,This Agreement shall be automatically extended...,0,49,agreement shall automatically extended additio...
2,"This Agreement shall renew automatically, with...",0,149,agreement shall renew automatically respect se...
3,If a Holder of such Security has not delivered...,0,433,holder security delivered repayment election r...
4,This Agreement shall be renewed automatically ...,0,50,agreement shall renewed automatically succeedi...


In [13]:
#backing up our modified dataframe again
df.to_csv("../data/df_clean_draft_3.csv")

In [14]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))

In [15]:
X = df["bag_cleaned_text"]
X = vectorizer.fit_transform(X)

y = df["clause_type"]

In [16]:
# df1 = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
# df1

In [17]:
oversample = SMOTE(sampling_strategy="auto", random_state=42)
X_rs, y_rs = oversample.fit_resample(X, y)

In [18]:
X_rs

<27042x205953 sparse matrix of type '<class 'numpy.float64'>'
	with 2813418 stored elements in Compressed Sparse Row format>

In [19]:
y_rs_df=pd.DataFrame(y_rs)

y_rs_df

Unnamed: 0,clause_type
0,0
1,0
2,0
3,0
4,0
...,...
27037,1
27038,1
27039,1
27040,1


In [20]:
y_rs_df.value_counts()

clause_type
2              9014
1              9014
0              9014
dtype: int64

In [21]:
y_rs_df.replace({0:"automatic_renewal_clause",1:"renewal_option_clause",2:"other_clauses"}, inplace=True)

In [22]:
y_rs_df

Unnamed: 0,clause_type
0,automatic_renewal_clause
1,automatic_renewal_clause
2,automatic_renewal_clause
3,automatic_renewal_clause
4,automatic_renewal_clause
...,...
27037,renewal_option_clause
27038,renewal_option_clause
27039,renewal_option_clause
27040,renewal_option_clause


In [23]:
y_again=pd.get_dummies(y_rs_df)

In [24]:
y_again.rename(columns={"clause_type_renewal_option_clause": "renewal_option", "clause_type_automatic_renewal_clause": "automatic_renewal", "clause_type_other_clauses": "other_clauses"}, inplace=True)

In [25]:
# y_again.drop(['clause_type_other_clauses'], axis=1, inplace = True)

In [26]:
y_again.isnull().sum()

automatic_renewal    0
other_clauses        0
renewal_option       0
dtype: int64

In [27]:
y_again

Unnamed: 0,automatic_renewal,other_clauses,renewal_option
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
27037,0,0,1
27038,0,0,1
27039,0,0,1
27040,0,0,1


In [28]:
# X_resampled_df = pd.DataFrame(X_rs, columns=X.columns)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_rs, y_again, test_size=0.2, random_state=42)

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

## Why we don't apply Standard Scaler in Text-based Classification

https://datascience.stackexchange.com/questions/33730/should-i-rescale-tfidf-features

In [31]:
# model_to_set = OneVsRestClassifier(SVC(class_weight = "balanced"))

# parameters = {"estimator__C": [1,2,4,8],
#               "estimator__kernel": ["poly","rbf"],
#               "estimator__degree":[1, 2, 3, 4],}

# model_tunning = GridSearchCV(model_to_set, param_grid=parameters, cv=3, n_jobs=-1, verbose=1, 
#                              scoring = "roc_auc")

# model_tunning.fit(X_train, y_train)

# print (model_tunning.best_score_)
# print (model_tunning.best_params_)

In [32]:
def grid_modeller_val_scorer(classifier): #takes arguments "knn", "rf", "gb", "xgb" "svc"
    
    #all 5 classifier models correspond respectively to these 5 instantiated models:
    clf_rf  = OneVsRestClassifier(RandomForestClassifier())
    clf_gb = OneVsRestClassifier(GradientBoostingClassifier())
    clf_svc = OneVsRestClassifier(SVC(class_weight="balanced")) #i did this to account for the uneven classes
    clf_xgb = OneVsRestClassifier(XGBClassifier())
    
    if classifier == "rf":
        
        #run grid search on Random Forest params:
        param_grid_rf  = {'estimator__n_estimators': [10, 50, 100, 250, 500],
                          'estimator__min_samples_leaf': [1, 3, 5],
                          'estimator__max_features': ['sqrt', 'log2'],} 
    
        gs = GridSearchCV(clf_rf, param_grid_rf, cv=3, n_jobs=-1, verbose=1,
                          scoring = "roc_auc") 
        gs.fit(X_train, y_train)
                            
    elif classifier == "gb":
        #run grid search on Gradient Boosting Params
        param_grid_gb = {'estimator__max_depth': [2,3,4,5],
                         'estimator__n_estimators': [100, 125, 150, 200],
                         'estimator__learning_rate': [.08, .1, .12]}
    
        gs = GridSearchCV(clf_gb, param_grid_gb, cv=3, n_jobs=-1, verbose=1,
                          scoring = "roc_auc") 
        gs.fit(X_train, y_train)
    
    elif classifier == "xgb":
        #create Gradient Boosting pipeline:
        pipe_xgb = Pipeline([('clf_xgb',clf_xgb),])
        
        #run grid search on Xtreme Gradient Boosting Params
        param_grid_xgb = {'estimator__max_depth': [2,3,4,5,6,7,8,9,10],
                          'estimator__n_estimators': [100, 125, 150, 200, 250],
                          'estimator__learning_rate': [.1, .01, .05],
                         }
                        
        gs = GridSearchCV(pipe_xgb, param_grid_xgb, cv=3, n_jobs=-1, verbose=1,
                          scoring = "roc_auc") 
        gs.fit(X_train, y_train)
                            
    elif classifier == "svc":
        #create SVM pipeline for classification and scaling
        pipe_svm = Pipeline([("clf_svc", clf_svc), ])
        
        #run grid search on SVC paramaters
        param_grid_svm = {"estimator__C":[1,10], 
                          "estimator__gamma":[0.001, 0.01, 0.1, 1], 
                          "estimator__kernel":('linear', 'rbf','sigmoid','poly')}  
        
        gs = GridSearchCV(pipe_svm, param_grid_svm, cv=3, n_jobs=-1, verbose=1,
                          scoring = "roc_auc") 
        gs.fit(X_train, y_train)
    
    #get scores
    train_score = gs.score(X_train, y_train)
    val_score = gs.score(X_val, y_val)    
    y_pred = gs.predict(X_val)
        
    #get ROC AUC and F1 Score
    auc = roc_auc_score(y_val, y_pred, average = "micro")
    f1_score = metrics.f1_score(y_val, y_pred, average = "micro")
    
    #sklearn's advice: "micro" gives each sample-class pair 
    #an equal contribution to the overall metric. 
    #Rather than summing the metric per class, this sums the dividends and divisors that make up the per-class metrics 
    #to calculate an overall quotient. 
    #Micro-averaging may be preferred in multilabel settings, including multiclass classification where a majority class is to be ignored.
    
    #get multiclass metrics 
    hinge_loss = metrics.hinge_loss(y_val, y_pred)
    balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred)
    
    metrics_list= [train_score, val_score, gs.best_score_, auc, f1_score, hinge_loss, balanced_accuracy]

    
    #print out accuracy, estimator and parameters from GridSearchCV
    print(f'best train accuracy score = {train_score}')
    print(f'best validation accuracy score = {val_score}')
    print(f'best grid search score = {gs.best_score_}')
    print(f'ROC AUC score = {auc}')
    print(f'f1_score={f1_score}')
    print(f'best estimator = {gs.best_estimator_}')
    print(f'best parameters = {gs.best_params_}')
    print(f"train_score, val_score, grid best score, auc, f1, hinge loss, balanced accuracy for {classifier}:", metrics_list)

In [33]:
grid_modeller_val_scorer("rf")

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.1min

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 18.6min finished


ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted', 'samples'].

In [None]:
grid_modeller_val_scorer("svc")

In [None]:
grid_modeller_val_scorer("gb")

In [None]:
grid_modeller_val_scorer("xgb")

### Recombining Initial Train and Validation Sets

In [None]:
# #we're now going to retrain our model on the full training set 

# X_train, X_test, y_train, y_test = train_test_split(X_rs, y_again, test_size=0.2, random_state=42)

In [None]:
# #firstly, we run the Robust Scaler that worked out well in our GridSearch

# rs = RobustScaler()
# rs.fit(X_train)
# X_train = rs.transform(X_train)
# X_test = rs.transform(X_test)