In [1]:
#Usual Imports 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

You will need to install the following in your command line/terminal:

`pip install -U spacy`

`python -m spacy download en_core_web_sm`

In [36]:
from __future__ import unicode_literals, print_function
from spacy.lang.en import English # updated

import spacy

In [3]:
#plotly, squarify and cufflinks interactive chart imports

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
import plotly.express as px #another related library of plotly that makes it easy to draw multiple plots
import squarify

#we have to make sure this code is run for Jupyter Notebooks otherwise the plots may not happen offline:
init_notebook_mode(connected=True)
cf.go_offline()

In [4]:
#set column size to be larger
pd.set_option("display.max_colwidth", 1000)

In [5]:
#for loop to open up software agreement folder and turn it into DataFrame
import os

file_names = os.listdir('../data/lexpredict-contraxsuite-samples-master/agreements/software_license/')
# Create Dictionary for File Name and Text
file_name_and_text = {}
for file in file_names:
    with open('../data/lexpredict-contraxsuite-samples-master/agreements/software_license/' + file, "r") as target_file:
         file_name_and_text[file] = target_file.read()
df = (pd.DataFrame.from_dict(file_name_and_text, orient='index')
             .reset_index().rename(index = str, columns = {'index': 'file_name', 0: 'text'}))

In [6]:
#checking out what our dataframe looks like
df.head(3)

Unnamed: 0,file_name,text
0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"\n\n\n\n\n\n\n\n\n\nExhibit 10.4\n\n\nEXCLUSIVE SOFTWARE LICENSE AGREEMENT\n\nThis Agreement is made as of March 31, 1996, by and between\nCARE CORPORATION LIMITED, a company incorporated in the British\nVirgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a\nDelaware corporation (""WARNER""), and, for the limited purposes of\njoining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL\nSYSTEMS, INC., a Delaware corporation and wholly owned subsidiary\nof WARNER (""COVER-ALL"").\n\nRECITALS\n\nA. CCL is the exclusive worldwide owner, except in the\nCommonwealth of Australia, the Dominion of New Zealand, and the\nUnited States of America, of all rights in certain computer\nsoftware and related documentation pertaining to the\nadministration of worker's compensation, as set forth and\ndescribed in Attachment A hereto (hereinafter referred to as the\n""CARE Software"").\n\nB. Pursuant to a separate agreement by and between CCL and\nWARNER, entered into contemporaneously with this Agr..."
1,946822_1999-12-01_EXHIBIT 10.26 - SOFTWARE LIC AGREE -RURAL CELLULAR.txt,"\n\n\n\n\n\n\n\n\n\n\nEXHIBIT 10.26\n\n\nSOFTWARE LICENSE AGREEMENT\n\nThis Software License Agreement is made as of this 25th day of\nSeptember, 1999, between Preferred Voice, Inc., a Delaware corporation\n(""Licensor"") and Rural Cellular Corporation, a Minnesota corporation, on behalf\nof itself and its wholly owned subsidiaries and affiliates (""Licensee"").\nLicensor and Licensee are collectively referred to in this Agreement as the\n""Parties.""\n\nBackground Information\n\nLicensor has developed a system (the ""System"") that when interconnected\nwith a telecommunications switching system is capable of performing the services\n(the ""Services"") described in a Marketing Agreement between Licensor and\nLicensee of even date (the ""Marketing Agreement""). Each System consists of the\nhardware, certain third party software (the ""Third Party Software"") and certain\nproprietary application software developed by Licensor (the ""Application\nS..."
2,"1414043_2016-10-14_SOFTWARE LICENSE AGREEMENT DATED OCTOBER 7, 2016 BY AND BETWEEN THE COMPANY AND HANG WITH, INC..txt","\n\n\n\n\n\n\nBlueprint\n\n\n\n\n\n\n\n\nEXHIBIT\n10.5\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSOFTWARE LICENSE AGREEMENT\n\n\n\n\n\n\nThis SOFTWARE LICENSE AGREEMENT (this ""Agreement"") is effective as\nof October 7, 2016 (the\n""Effective Date"") by and between Friendable, Inc., a Nevada corporation with offices at 1821 S\nBascom Ave., Suite 353, Campbell, California 95008\n(""FRIEND""), and Hang With,\nInc., a Nevada corporation with\nits principal office at 7 Studebaker, 1st\nFloor, Irvine, CA 92618\n(""LICENSOR"") (FRIEND and LICENSOR each, a ""Party""; together,\nthe ""Parties"").\n\n\n\n\n\n\nRECITALS\n\n\n\n\n\nWHEREAS, LICENSOR is the sole owner of all rights,\ntitle, and interest in and to the proprietary live-streaming mobile\nand social application platform generally known as Hang w/,\ndescribed in further detail in Exhibit A\nattached hereto and incorporated\nherein by this reference (the ""App""), as well as the ""Licensed\nSoftware"" (defined below), which includes any and all related\ndocume..."


In [7]:
#since these are all contract licenses lets create a new column to name them as such for future contract meta-classification

df["contract_type"]="software_license"

In [8]:
#backing up .csv file

df.to_csv("../data/software_agmts_raw.csv")

In [9]:
#let's get rid of all the unnecessary new line breaks like \n and extra spaces

def space_remover(mess):
    
    splitted = mess.split()
    
    return (" ".join(splitted))

In [10]:
#applying our simple cleaning function

df["text"] = df["text"].apply(space_remover)

In [11]:
#the text definitely looks better.

df.head(2)

Unnamed: 0,file_name,text,contract_type
0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license
1,946822_1999-12-01_EXHIBIT 10.26 - SOFTWARE LIC AGREE -RURAL CELLULAR.txt,"EXHIBIT 10.26 SOFTWARE LICENSE AGREEMENT This Software License Agreement is made as of this 25th day of September, 1999, between Preferred Voice, Inc., a Delaware corporation (""Licensor"") and Rural Cellular Corporation, a Minnesota corporation, on behalf of itself and its wholly owned subsidiaries and affiliates (""Licensee""). Licensor and Licensee are collectively referred to in this Agreement as the ""Parties."" Background Information Licensor has developed a system (the ""System"") that when interconnected with a telecommunications switching system is capable of performing the services (the ""Services"") described in a Marketing Agreement between Licensor and Licensee of even date (the ""Marketing Agreement""). Each System consists of the hardware, certain third party software (the ""Third Party Software"") and certain proprietary application software developed by Licensor (the ""Application Software""). Licensee is a wireless carrier that is currently providing telecommunications service in...",software_license


In [12]:
#using spaCy to split the text into sentences for our classification work 

#we first begin by instantiating spaCy's inbuilt sentence tokenizer
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated

In [13]:
#we then create another function to deconstruct large strings of contracts into a list of sentence tokens.

def sentences_lister(text):
    doc = nlp(text)
    return [sent.string.strip() for sent in doc.sents]

In [14]:
#applying our sentences lister function into a new panda columns
df['sentences_lists'] = df['text'].apply(sentences_lister)

In [15]:
#we want to explode our list of sentences into new rows which is just what we do now:

df = df.explode('sentences_lists')

In [16]:
#how big is this new dataframe?
df.shape

(17000, 4)

Our goal now is to pull out all sentences with the word "renewal" in them. These are potential "renewal" clauses.

We want to be able to identify the general renewal clauses AND the automatic renewal clauses to help contract managers and GCs know which of their customer contracts have to be manually re-signed to renew the software license, and which don't.

In [17]:
df.head(2)

Unnamed: 0,file_name,text,contract_type,sentences_lists
0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC.,"
0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license,"a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC.,"


In [18]:
#all the index numbers are cloned, which isn't ideal for indexing 
#so we reset the dataframe index

df.reset_index(inplace = True)

In [19]:
#backing up our exploded sentence split dataframe
df.to_csv("../data/software_agmts_sentencized.csv")

I'm now going to go through each of the 91 potential renewal clauses manually to annotate those that are actually renewal clauses, based on my law school education.

In [20]:
potential_renewal_clauses = df[df['sentences_lists'].str.contains("renew")]

In [21]:
potential_renewal_clauses.shape

(91, 5)

In [22]:
#going through index of all 91 rows to see what's a manual or automatic renewal clause
potential_renewal_clauses[["file_name","sentences_lists"]][81:90]

Unnamed: 0,file_name,sentences_lists
15820,922285_2001-08-14_SOFTWARE LICENSING AGREEMENT.txt,"Thereafter, this Agreement may be renewed for successive terms of 12 months (Renewal terms) each unless otherwise terminated as provided herein."
15904,948708_1996-11-14_AMENDED AND RESTATED SOFTWARE LICENSING AGREEMENT.txt,"SMSI shall have exclusive rights to duplicate floppy diskettes for one year for the Licensed Products in Appendix A. The term of this Agreement shall be one (1) year from the date hereof, and shall renew automatically for additional one year periods unless either party gives the other party written notice of cancellation at least sixty (60) days prior to the end of the then current term."
16417,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,"b) Support Agreement Renewals: Cisco shall be responsible for generating all renewal sales and shall pay Licensor [***] of net revenue for each Support Agreement Renewal, provided that Licensor submits a report to Cisco within forty-five (45) days after each Cisco fiscal quarter end, detailing a list of Customers and Cisco support contracts expiring in the following quarter."
16420,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,"3 <PAGE> Support Agreement number, term of support, effective date of support, the Software identification numbers being renewed, and the total list price."
16421,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,Cisco shall thereafter submit a quarterly renewal report to Licensor in accordance with subparagraph (e) below.
16422,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,Licensor's obligation to submit support renewal reports to Cisco is contingent upon Cisco providing report information identified in subparagraph (f) below to Licensor for Licensor to complete support renewal reports for otherwise new support contracts.
16423,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,Licensor's obligations to submit renewal reports shall cease when Cisco has implemented an automated ability to track Support Agreement renewals and has provided Licensor thirty (30) days' advance written notice of such automation. (
16424,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,"c) [***] (d) Shared Support: Although Cisco will endeavor to discourage Support Agreement sales and Renewals to Customers by a Cisco service integrator through Cisco's SIS'98 or other shared support programs (""Shared Support"") when such sale or renewals are identified and accepted, the same percentage compensation noted in subparagraph (b) above shall be paid to Licensor. ("
16460,1000495_2003-03-26_AMENDMENT TO OEM-IN SOFTWARE LICENSE AGREEMENT.txt,"6 <PAGE> ""Support Agreement"" means a support contract between Cisco and a Customer for the provision by Cisco of support and maintenance for the Software and shall include contracts for the initial period of support, renewal of such support, and reinstatement of expired support. """


After eyeballing each of the 91 clauses manually, 11 clauses are manual renewals, 29 are automatic and the rest are neither.

In [23]:
manual_renewal_row_list=[223, 3033, 4467, 4837, 6933, 7835, 8860, 9087, 9483, 12081, 14991]

In [24]:
automatic_renewal_row_list=[925, 1058, 1103,1451,1763,1888,1890, 
                            2179,2816, 3261, 3521, 4929, 5471, 5860, 6188, 
                            7156, 7592, 7836, 8095, 9264, 9285, 9306, 11105, 12557, 13701,
                            14121, 15820, 15904]

In [25]:
#we create 3 new columns with placeholder values to describe the nature of the clause.
df["automatic_renewal"] = 0

In [26]:
df["manual_renewal"] = 0

In [27]:
df["other_clause"] = 1

In [28]:
#we write a for loop that will annotate our 2 kinds of renewal clauses accordingly. 

for i in manual_renewal_row_list:
    df.at[i, "manual_renewal"] = 1
    df.at[i, "other_clause"] = 0

In [29]:
#looks okay
df.iloc[223]

index                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [30]:
#for loop for automatic renewal clause annotation
for i in automatic_renewal_row_list:
    df.at[i, "automatic_renewal"] = 1
    df.at[i, "other_clause"] = 0

In [31]:
#checking if annotation worked - it did
df.iloc[7836]

index                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [32]:
#checking out our cleaned dataframe:

df.head(2)

Unnamed: 0,index,file_name,text,contract_type,sentences_lists,automatic_renewal,manual_renewal,other_clause
0,0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC.,",0,0,1
1,0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license,"a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC.,",0,0,1


## Pre-Processing

In [33]:
#pre-process text for EDA and later modelling too using spacy and string libraries

import re
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [34]:
#there are some inconsistencies with importing the English library for spacy in Jupyter. 
#importing this other library and redownloading helps fix that:
import sys
!{sys.executable} -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/grahamlim/opt/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
/Users/grahamlim/opt/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


We’ll create a `spacy_tokenizer()` function that accepts a sentence as input and processes the sentence into tokens, performing lemmatization, lowercasing, and removing stop words. 

In [37]:
#from https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/
#stopwords removal

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    # return preprocessed list of tokens
    return mytokens

In [38]:
#remove numbers
def number_punctuation_remover(mess):
    no_numbers = re.sub('[0-9]+', '', mess)
    cleaned = re.sub("[^a-zA-Z]", " ", no_numbers)
    return cleaned

In [39]:
df["numberless_sentence"] = df["sentences_lists"].apply(number_punctuation_remover)

In [40]:
df.head(1)

Unnamed: 0,index,file_name,text,contract_type,sentences_lists,automatic_renewal,manual_renewal,other_clause,numberless_sentence
0,0,737300_1996-04-08_EXHIBIT 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT.txt,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC., a Delaware corporation (""WARNER""), and, for the limited purposes of joining in Sections 4, 6, 8, 10, 11, 21, and 22, COVER-ALL SYSTEMS, INC., a Delaware corporation and wholly owned subsidiary of WARNER (""COVER-ALL""). RECITALS A. CCL is the exclusive worldwide owner, except in the Commonwealth of Australia, the Dominion of New Zealand, and the United States of America, of all rights in certain computer software and related documentation pertaining to the administration of worker's compensation, as set forth and described in Attachment A hereto (hereinafter referred to as the ""CARE Software""). B. Pursuant to a separate agreement by and between CCL and WARNER, entered into contemporaneously with this Agreement (the ""Stock Purchase Agreement""), CCL and WARNE...",software_license,"Exhibit 10.4 EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March 31, 1996, by and between CARE CORPORATION LIMITED, a company incorporated in the British Virgin Islands (""CCL""), WARNER INSURANCE SERVICES, INC.,",0,0,1,Exhibit EXCLUSIVE SOFTWARE LICENSE AGREEMENT This Agreement is made as of March by and between CARE CORPORATION LIMITED a company incorporated in the British Virgin Islands CCL WARNER INSURANCE SERVICES INC


# EDA

In [41]:

#import the TFIDF vectorizer,

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import scipy.sparse as sp



In [42]:
#intialize it and assign as vectorizer 
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,2))

In [43]:
all_clauses = df["numberless_sentence"]
auto_renewals_only = df["numberless_sentence"][df["automatic_renewal"]==1]
manual_renewals_only = df["numberless_sentence"][df["manual_renewal"]==1]

In [44]:
#turn a series into vectors and then plot the distribution of the top 25 words

def vector_barplotter(series, graph_title, x_label):
    vectorized_series = vectorizer.fit(series)
    series_df = pd.DataFrame(vectorizer.transform(series).todense(),
                              columns=vectorizer.get_feature_names())
    #we slice the top 25 words 
    top_25_words=series_df.sum().sort_values(ascending = False).head(25)
    
    #we plot this on a plotly bar chart by frequency of words.
    fig = px.bar(top_25_words, orientation='v', title = graph_title, 
             labels={"index":x_label})
    fig.show()

In [45]:

#to plot a square word treemap, I have to modify our word array into a special dataframe; 
#Adapted from https://python-graph-gallery.com/200-basic-treemap-with-python/
#We plot a square treemap of word frequencies, as an alternative to word clouds
#to save time I turned this into a function too.

def vector_treemap_plotter(series):
    vectorized_series = vectorizer.fit(series)
    series_df = pd.DataFrame(vectorizer.transform(series).todense(),
                              columns=vectorizer.get_feature_names())
    #we slice the top 25 words and turn that into a squarify-friendly dataframe
    top_25_words=series_df.sum().sort_values(ascending = False).head(25)
    top_25_words_df = pd.DataFrame(top_25_words)
    top_25_words_df.columns = ["count"]
    top_25_words_df["word"]=top_25_words_df.index
    top_25_words_df = top_25_words_df.reset_index(drop=True)
    
    count = top_25_words_df["count"]
    word = top_25_words_df["word"]
    
    #I then create a hybrid heatmap and treemap
    #it is like a word cloud, but it looks cleaner.
    #frequent words are deeper blues with bigger squares.

    cmap = matplotlib.cm.Blues
    mini=min(count)
    maxi=max(count)
    norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
    colors = [cmap(norm(value)) for value in count]

    squarify.plot(sizes=count, norm_x=100, norm_y=100, label=word, 
                  color=colors,alpha=.9 )

    plt.axis('off')
    plt.show()

In [46]:
vector_barplotter(all_clauses, "Top Words in All Clauses", "Word")


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [47]:
vector_barplotter(auto_renewals_only, "Top Words in Automatic Renewal Clauses", "Word")


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [48]:
vector_barplotter(manual_renewals_only, "Top Words in Manual Renewal Clauses", "Word")


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [49]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm, linear_model, metrics
from sklearn.metrics import confusion_matrix, classification_report

please run `pip install imblearn` in your command line/terminal interface

In [51]:
#applying SMOTE because of imbalanced classes:
from sklearn.preprocessing import RobustScaler

In [52]:
from imblearn.over_sampling import SMOTE

In [61]:
def grid_modeller_scorer(predictor, predictee, classifier):
    X = df[predictor]
    y = df[predictee]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)
    
    if classifier == 1:
        #create Multinomial Bayes pipeline using Count Vectorizer and TFIDF Vectorizer
        pipeline_bayes = Pipeline([("wordbag",TfidfVectorizer()),
                                    ("classifier",MultinomialNB())])
        #run grid search on TFIDF Vectorizer parameters
        param_grid = {'wordbag__ngram_range':[(1,1),(1,2),(2,2)],'wordbag__max_features': [25, 50, 100,1000,10000]}
        gs = GridSearchCV(pipeline_bayes, param_grid, cv=3, n_jobs=-1, verbose=1) 
        gs.fit(X_train, y_train)
        
        #get scores
        gb_train_score = gs.score(X_train, y_train)
        gb_test_score = gs.score(X_test, y_test)
        
        #get confusion matrix and classification metrics
        y_pred = gs.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(cm, columns=["pred_automatic_renewal","pred_other_clause"], index=["actual_automatic","actual_other_clause"])
        
        #print out accuracy, estimator and parameters from GridSearchCV
        print(f'grid bayes best train score = {gb_train_score}')
        print(f'grid bayes best test score = {gb_test_score}')
        print(f'grid bayes best estimator = {gs.best_estimator_}')
        print(f'grid bayes best parameters = {gs.best_params_}')
        print(f"MNB Accuracy - how often is model correct?:",metrics.accuracy_score(y_test, y_pred))
        print(f"MNB Precision - ability not to label as positive a sample that is negative:",metrics.precision_score(y_test, y_pred))
        print(f"MNB Sensitivity or Recall - ability to find all the positive samples.:",metrics.recall_score(y_test, y_pred))
                
        #turns classification report into a dictionary from which we can print other metrics
        #then returns the actual dataframe of the confusion matrix        
       
        return cm_df
    
    elif classifier == 2:
        #create SVM pipeline using Count Vectorizer and TFIDF Transfformer
        pipeline_svm = Pipeline([("wordbag",TfidfVectorizer()), #word vectorizing into weighted token integers
                                 ("classifier",svm.SVC()), #train on TF-IDF w SVC
                                ])
        #run grid search on TFIDF Vectorizer and SVM parameters
        param_grid = {"wordbag__ngram_range":[(1,1),(1,2),(2,2)],"wordbag__max_features": [25, 50, 100,1000,10000], 
                      "classifier__C":[1,10], "classifier__gamma":[0.001, 0.01, 0.1, 1], 
                      "classifier__kernel":('linear', 'rbf','sigmoid','poly')}
        
        gs = GridSearchCV(pipeline_svm, param_grid, cv=3, n_jobs=-1, scoring = "accuracy", verbose=1) 
        gs.fit(X_train, y_train)
        
        #get scores
        svm_train_score = gs.score(X_train, y_train)
        svm_test_score = gs.score(X_test, y_test)
        
        #get confusion matrix and classification metrics
        y_pred = gs.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(cm, columns=["pred datascience","pred analytics"], index=["actual datascience","actual analytics"])
        
        #print out accuracy, estimator and parameters from GridSearchCV
        print(f'grid SVM best train score = {svm_train_score}')
        print(f'grid SVM best test score = {svm_test_score}')
        print(f'grid SVM best estimator = {gs.best_estimator_}')
        print(f'grid SVM best parameters = {gs.best_params_}')    
        #taken from datacamp tutorial: https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python
        print("SVM Accuracy - how often is model correct?:",metrics.accuracy_score(y_test, y_pred))
        print("SVM Precision - ability not to label as positive a sample that is negative:",metrics.precision_score(y_test, y_pred))
        print("SVM Sensitivity or Recall - ability to find all the positive samples.:",metrics.recall_score(y_test, y_pred))
        
        return cm_df

In [56]:
# rs = RobustScaler()
# X_train = rs.fit_transform(X_train)
# X_test = rs.transform(X_test)
    
# sm = SMOTE(random_state=42)
# X_train, y_train = sm.fit_sample(X_train, y_train)

In [59]:
grid_modeller_scorer("numberless_sentence", "automatic_renewal", 1)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    6.8s finished


grid bayes best train score = 0.9983823529411765
grid bayes best test score = 0.9982352941176471
grid bayes best estimator = Pipeline(steps=[('wordbag', TfidfVectorizer(max_features=25)),
                ('classifier', MultinomialNB())])
grid bayes best parameters = {'wordbag__max_features': 25, 'wordbag__ngram_range': (1, 1)}
MNB Accuracy - how often is model correct?: 0.9982352941176471
MNB Precision - ability not to label as positive a sample that is negative: 0.0
MNB Sensitivity or Recall - ability to find all the positive samples.: 0.0



Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



Unnamed: 0,pred_automatic_renewal,pred_other_clause
actual_automatic,3394,0
actual_other_clause,6,0


In [62]:
grid_modeller_scorer("numberless_sentence", "automatic_renewal", 2)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  5.0min finished


grid SVM best train score = 1.0
grid SVM best test score = 0.9994117647058823
grid SVM best estimator = Pipeline(steps=[('wordbag',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
                ('classifier', SVC(C=10, gamma=1, kernel='sigmoid'))])
grid SVM best parameters = {'classifier__C': 10, 'classifier__gamma': 1, 'classifier__kernel': 'sigmoid', 'wordbag__max_features': 10000, 'wordbag__ngram_range': (1, 2)}
SVM Accuracy - how often is model correct?: 0.9994117647058823
SVM Precision - ability not to label as positive a sample that is negative: 0.75
SVM Sensitivity or Recall - ability to find all the positive samples.: 1.0


Unnamed: 0,pred datascience,pred analytics
actual datascience,3392,2
actual analytics,0,6
