# Importing necessary libraries for the project 

In [1]:
import numpy as np
import pandas as pd 
import re 
import glob 
import xmltodict, json
import string 
import nltk
import gensim
import itertools
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.tfidfmodel import TfidfModel
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora


# Defining a function that extracts the content in the xml, converts into JSON and returns string as case_name_text and case_sentence_text 

Now, the xml file we are loading contains some error files which needed to be fixed. 

At first, when I tried loading it into pandas using the xmlparse library, I encountered an error. After validating the xml file online, I found that catchphrase tag had an error. 

So, I read the file as a text and split the text on the word "catchphrases" and joined the strings. Then, xml file doesn't support ampersand which needed to be taken care of. 

Now, for returning the clean text, I also remove the end of line characters. 

In [2]:
def open_file(file):
    fopen = open(file).read()
    fopen.split('catchphrases')[0]
    clean_xml = fopen.split('catchphrases')[0][:-1] + fopen.split('catchphrases')[2][1:]
    
    text = ''
    for i in clean_xml:
        i = i.lower()
        i = i.replace('&', '')
        i = i.replace('\n', '')
        text += i
    
    o = xmltodict.parse(text)
    text_injson_clean = json.dumps(o) 
    df = pd.read_json(text_injson_clean)   

    for i in df.iloc[2,:]:
        sentences = i['sentence']
    
    final_sentence = ''.join([i['#text'] for i in sentences])

    for j in df.iloc[1,:]:
        name = j[:]
    return final_sentence, name

# Reading all the files that endswith .xml extension inside full text folder and loading it into pandas 

Now, for importing multiple xml file, I used glob function. 

This helped to fetch all the files that endswith ".xml". After loading the xml file, passing it into open_file function which returns the list of the sentences text and list of the name of all cases. 

Than, I used regular expressions to remove all the digits and numbers from the text and also defined new column which has length of all the characters of the text which in short gives us length of each case. 

In [3]:
case_name = []
case_data = []
for file in glob.glob('/Users/deeplakkad/Downloads/corpus/fulltext/*.xml'):
    cs_str, cs_name = open_file(file)
    case_name.append(cs_name)
    case_data.append(cs_str)

pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame({'case_name': case_name, 'case_data': case_data})
df['case_data'] = df['case_data'].apply(lambda x: re.sub('\d+', '', x))
df['case_length'] = df['case_data'].apply(lambda x: len(x) - x.count(' '))
df.head(2)
print(df.shape)
df.head()

(0, 3)


Unnamed: 0,case_data,case_name,case_length


# Removing the punctuation from the sentence text 

Now, just for the sake of simplicity, I created a new function which helps us to remove the punctuation characters in the text 

In [4]:
 def remove_punc(text):
    text_nopun = ''.join([char for char in text if char not in string.punctuation])
    return text_nopun

df['case_data'] = df['case_data'].apply(lambda x: remove_punc(x))
df.head()

Unnamed: 0,case_data,case_name,case_length
0,on april the application brought by the university of western australia uwa against dr bruce ...,university of western australia v gray (no 21) [2008] fca 1056 (16 july 2008),25406
1,introduction the applicant seeks review of the respondents decision under s of the administrat...,parker v parker [2009] fca 930 (25 august 2009),23995
2,the applicant csl limited csl seeks an interlocutory injunction to restrain the respondent glax...,csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006),46188
3,background to the appeal this is an appeal against orders of the federal magistrates court of ...,field v st george bank limited [2009] fca 1042 (17 september 2009),51026
4,this matter came on for hearing before me yesterday on referral from the district registrars co...,deputy commissioner of taxation v bk ganter holdings pty ltd [2008] fca 1730 (18 november 2008),18401


# Converting the sentence text into the list of individual words by tokenization

Then, the next step is to perform tokenization. So, basically this splits the text separated by comma and returns the list of individual words. 

In [5]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['cd_tokenize'] = df['case_data'].apply(lambda x: tokenize(x.lower()))
df.head()

Unnamed: 0,case_data,case_name,case_length,cd_tokenize
0,on april the application brought by the university of western australia uwa against dr bruce ...,university of western australia v gray (no 21) [2008] fca 1056 (16 july 2008),25406,"[, on, april, the, application, brought, by, the, university, of, western, australia, uwa, again..."
1,introduction the applicant seeks review of the respondents decision under s of the administrat...,parker v parker [2009] fca 930 (25 august 2009),23995,"[introduction, the, applicant, seeks, review, of, the, respondents, decision, under, s, of, the,..."
2,the applicant csl limited csl seeks an interlocutory injunction to restrain the respondent glax...,csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006),46188,"[, the, applicant, csl, limited, csl, seeks, an, interlocutory, injunction, to, restrain, the, r..."
3,background to the appeal this is an appeal against orders of the federal magistrates court of ...,field v st george bank limited [2009] fca 1042 (17 september 2009),51026,"[background, to, the, appeal, this, is, an, appeal, against, orders, of, the, federal, magistrat..."
4,this matter came on for hearing before me yesterday on referral from the district registrars co...,deputy commissioner of taxation v bk ganter holdings pty ltd [2008] fca 1730 (18 november 2008),18401,"[, this, matter, came, on, for, hearing, before, me, yesterday, on, referral, from, the, distric..."


# Removing all the common words in the text i.e. Dropping stopwords

It is very important to remove the stopwords (i.e. the, by, on, etc.) because these words don't add information to our text that we are trying to get from it. 

In [6]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

stopword = nltk.corpus.stopwords.words('english')
df['cd_no_stop'] = df['cd_tokenize'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,case_data,case_name,case_length,cd_tokenize,cd_no_stop
0,on april the application brought by the university of western australia uwa against dr bruce ...,university of western australia v gray (no 21) [2008] fca 1056 (16 july 2008),25406,"[, on, april, the, application, brought, by, the, university, of, western, australia, uwa, again...","[, april, application, brought, university, western, australia, uwa, dr, bruce, gray, sirtex, me..."
1,introduction the applicant seeks review of the respondents decision under s of the administrat...,parker v parker [2009] fca 930 (25 august 2009),23995,"[introduction, the, applicant, seeks, review, of, the, respondents, decision, under, s, of, the,...","[introduction, applicant, seeks, review, respondents, decision, administrative, decisions, judic..."
2,the applicant csl limited csl seeks an interlocutory injunction to restrain the respondent glax...,csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006),46188,"[, the, applicant, csl, limited, csl, seeks, an, interlocutory, injunction, to, restrain, the, r...","[, applicant, csl, limited, csl, seeks, interlocutory, injunction, restrain, respondent, glaxosm..."
3,background to the appeal this is an appeal against orders of the federal magistrates court of ...,field v st george bank limited [2009] fca 1042 (17 september 2009),51026,"[background, to, the, appeal, this, is, an, appeal, against, orders, of, the, federal, magistrat...","[background, appeal, appeal, orders, federal, magistrates, court, june, making, sequestration, o..."
4,this matter came on for hearing before me yesterday on referral from the district registrars co...,deputy commissioner of taxation v bk ganter holdings pty ltd [2008] fca 1730 (18 november 2008),18401,"[, this, matter, came, on, for, hearing, before, me, yesterday, on, referral, from, the, distric...","[, matter, came, hearing, yesterday, referral, district, registrars, corporations, list, novembe..."


# Performing Lemmatization for converting the words to their roots using machine learning approach 

Now, there are two methods to check each word individually and return the root of that word. 

First one is stemming and second one is lemmatization. 

Although stemming is faster, it doesn't take the context of the word into account and replace with the root word whereas lemmatization is a more enhanced approach which takes the word into account and replace the word to it's root by taking the context into consideration. 

However, the two words differ in their flavor. 

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. 

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma . 

If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. 

The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma.

In [7]:
def lemmatize(tokenized_nostop):
    text = [wn.lemmatize(word) for word in tokenized_nostop]
    return text

wn = nltk.WordNetLemmatizer()

df['cd_lemmatized'] = df['cd_no_stop'].apply(lambda x: lemmatize(x))

df.head()

Unnamed: 0,case_data,case_name,case_length,cd_tokenize,cd_no_stop,cd_lemmatized
0,on april the application brought by the university of western australia uwa against dr bruce ...,university of western australia v gray (no 21) [2008] fca 1056 (16 july 2008),25406,"[, on, april, the, application, brought, by, the, university, of, western, australia, uwa, again...","[, april, application, brought, university, western, australia, uwa, dr, bruce, gray, sirtex, me...","[, april, application, brought, university, western, australia, uwa, dr, bruce, gray, sirtex, me..."
1,introduction the applicant seeks review of the respondents decision under s of the administrat...,parker v parker [2009] fca 930 (25 august 2009),23995,"[introduction, the, applicant, seeks, review, of, the, respondents, decision, under, s, of, the,...","[introduction, applicant, seeks, review, respondents, decision, administrative, decisions, judic...","[introduction, applicant, seek, review, respondent, decision, administrative, decision, judicial..."
2,the applicant csl limited csl seeks an interlocutory injunction to restrain the respondent glax...,csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006),46188,"[, the, applicant, csl, limited, csl, seeks, an, interlocutory, injunction, to, restrain, the, r...","[, applicant, csl, limited, csl, seeks, interlocutory, injunction, restrain, respondent, glaxosm...","[, applicant, csl, limited, csl, seek, interlocutory, injunction, restrain, respondent, glaxosmi..."
3,background to the appeal this is an appeal against orders of the federal magistrates court of ...,field v st george bank limited [2009] fca 1042 (17 september 2009),51026,"[background, to, the, appeal, this, is, an, appeal, against, orders, of, the, federal, magistrat...","[background, appeal, appeal, orders, federal, magistrates, court, june, making, sequestration, o...","[background, appeal, appeal, order, federal, magistrate, court, june, making, sequestration, ord..."
4,this matter came on for hearing before me yesterday on referral from the district registrars co...,deputy commissioner of taxation v bk ganter holdings pty ltd [2008] fca 1730 (18 november 2008),18401,"[, this, matter, came, on, for, hearing, before, me, yesterday, on, referral, from, the, distric...","[, matter, came, hearing, yesterday, referral, district, registrars, corporations, list, novembe...","[, matter, came, hearing, yesterday, referral, district, registrar, corporation, list, november,..."


# Select the case and find the most occurring words in that case 

Now, counter creates the dictionary of each word as key and frequency as it's value. This helps to identify the words with most occurrences in our text 

In [8]:
case_number = int(input('Enter the case number: '))
counter = Counter(df['cd_lemmatized'][case_number])
counter.most_common(10)

Enter the case number: 2


[('gska', 104),
 ('cervarix', 79),
 ('csl', 76),
 ('information', 68),
 ('gardasil', 65),
 ('hpv', 64),
 ('vaccine', 62),
 ('sheet', 53),
 ('cancer', 51),
 ('would', 50)]

# Using gensim for creating a bag of words

Now, I have used gensim to create a bag of words. It takes unique words into our tokenized list and converts into numbers to perform mathematical operations like SVM, Naive Bayes for classification problem, etc. or to perform TF-IDF method as equations can be performed on numbers rather than on text. So, this helps us to create a corpus and each unique word is identified in terms of 0,1,2, etc. and value of occurence in a given sentence. 


The function doc2bow() simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector. The sparse vector [(0, 1), (1, 1)] therefore reads: in the document “Human computer interaction”, the words computer (id 0) and human (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.

In [9]:
tokenized_docs = [word_tokenize(doc.lower()) for doc in df['case_data']]
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Using TF-IDF method on our corpus

A VERY GOOD DEFINITION: http://www.tfidf.com/

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

Consider a document containing 100 words wherein the word cat appears 3 times. The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10 million documents and the word cat appears in one thousand of these. Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4. Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.



In [10]:
case_number = int(input('Select the case number: '))
case_name = df['case_name'][case_number]
doc = corpus[case_number]
print('Name: {}'.format(case_name))
tfidf = TfidfModel(corpus)

tfidf_weights = tfidf[doc]

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 10 weighted words
for term_id, weight in sorted_tfidf_weights[:10]:
    print(dictionary.get(term_id), weight)

Select the case number: 2
Name: csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006)
gska 0.552368427388354
cervarix 0.4195875554199997
gardasil 0.34523026711772126
hpv 0.33991903223898706
csl 0.3020946312661078
vaccine 0.2164461311281897
cervical 0.14704101188738186
tga 0.11582977313479494
cancer 0.10678933379412729
gskas 0.10091346269594928



# Topic Modeling - Implementing LDA on our corpus 

WIKIPEDIA LINK: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In natural language processing, latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's creation is attributable to one of the document's topics

In [22]:
case_numbers = input('Select the case numbers you want to study separated by comma: ')

list_of_cases = case_numbers.split(',')
for i in list_of_cases:
    i = int(i)
    j = i
    corpus = [dictionary.doc2bow(doc) for doc in df['cd_lemmatized'][i-1:j]]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, update_every=4, chunksize=10000, passes=1)
    print('Case : {}\t{}'.format(j,df['case_name'][i]))
    topic = re.sub('\d+\W+','',lda.print_topics(1)[0][1]).split('+')
    print('Keywords: {}'.format(topic))
    print('\n')

Select the case numbers you want to study separated by comma: 2
Case : 2	csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006)
Keywords: ['decision" ', ' act" ', ' applicant" ', ' tax" ', ' made" ', ' respondent" ', ' review" ', ' exercise" ', ' commissioner" ', ' adjr"']




# Applying Parts of Speech tagger

Now, as noticed in the above output generated by Latent Dirichlet Allocation, we can see that are words that includes NOUN i.e. 'applicant', 'respondent', etc. which aren't useful keywords for us because they tell nothing about the article. 

So, after applying pos_tag, we can see that model has highly improvised. 

In [23]:
def pos_tagging(case_number):
    data = []
    cd_tokenize = []
    tags = ['NNP', 'VBD']

    for word in nltk.sent_tokenize(df.case_data[case_number]):
        data = data + nltk.pos_tag(nltk.word_tokenize(word))

        for word in data:
            if word[1] in tags:
                cd_tokenize.append('')
            else:
                cd_tokenize.append(word[0])
    cd_no_stop = remove_stopwords(cd_tokenize)
    cd_lemmatize = lemmatize(cd_no_stop)
    terms = [cd_lemmatize]
    corpus = [dictionary.doc2bow(doc) for doc in terms]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, update_every=4, chunksize=10000, passes=1)
    print('Case : \t{}'.format(df['case_name'][case_number]))
    topic = re.sub('\d+\W+','',lda.print_topics(1)[0][1]).split('+')
    print('Keywords: {}'.format(topic))
    print('\n')
    
        
case_number = int(input('Select the case number: '))
pos_tagging(case_number)

Select the case number: 2
Case : 	csl limited v glaxosmithkline australia pty ltd [2006] fca 1301 (3 october 2006)
Keywords: ['gska" ', ' hpv" ', ' csl" ', ' information" ', ' cervarix" ', ' sheet" ', ' vaccine" ', ' gardasil" ', ' cancer" ', ' cervical"']




# Future Goal: 

First: 

PERFORMING BIGRAMS: 

We can perform Bigrams for the sentence text in each case and check the occurrence of words that are in pair. This can give us more information about the case. 

Second: 

We can remove the words that are smaller than 3 characters in length. This will help us to identify if there are any single character words such as 'c', 'mr', 'dr', etc. 

I haven't used because sometimes we use 'br' for bankrupty which is important. So, after identify the case we will be more to implement it as well. 

Third: 

Also, due to time constraint and performing two project, unfortunately I was not able to identify more POS TAG features and so I have only removed the NNP for an instance which will at least help remove naming words such as 'george' or words such as 'applicant', 'respondent', etc. 

So, we can see more POS TAG features as well. 