# [LEGALST-190] Lab 4/10: Topic Models

This lab will cover parsing XML and attribute lookup, XPath, and web scraping.

*Estimated Time: 30 Minutes *

### Topics Covered:
- implement topic model
- create document-term matrix
- discover latent topics contributing to the docs

### Table of Contents
[The Data](#section data)<br>
1 - [XML Syntax](#section 1)<br>
2 - [Using XPath and ElementTree to parse XML](#section 2)<br>
3 - [Web Scraping](#section 3)<br>
4 - [Putting it all in a dataframe](#section 4)<br>

**Dependencies:**

In [258]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from string import punctuation
from gensim import corpora, models, similarities 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import numpy as np
import pandas as pd #Imported to work with the UN Data

----
## The Data<a id='section data'></a>
Blurb about old bailey again

---

## Section 1: LDA<a id='section 1'></a>

#### First with Gensim

In [223]:
import xml.etree.cElementTree as ET
from lxml import etree

###Extra functions for extracting text from scraped XML files.
def fetchtextclean(xml_file):
    thing = []
    for element in ET.ElementTree(file=xml_file).getiterator():
        if element.tag == "p":
            for i in list(element.itertext()):
                thing += [i]
    stringed = []
    for i in thing:
        clean = i.strip().replace('\n', '')
        if clean:
            stringed.append(clean.strip('\n'))
    return stringed

def unnester(lst):
    new = []
    for i in lst:
        if type(i) == list:
            new.extend(unnester(i))
        else:
            new.append(i)
    return new

###Found online. Will give credit in citations when I complete notebook.
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = [x for x in tokens if x not in punctuation 
                       and x not in more_stops] #word tokenizer cuts the possessives
    return filtered_tokens

def tokenize_and_stem(text):
    stems = [stemmer.stem(x) for x in tokenize_only(text)]
    return stems

In [67]:
stop = stopwords.words('english')
exclude = string.punctuation
lemma = WordNetLemmatizer()

In [217]:
###Individual words from each case
corp = []
for i in np.arange(1, 314):
    text = [i for i in clean(' '.join(fetchtextclean('old-bailey/case-'+str(i)+'.xml'))).split() if i not in stop]
    corp.append(text)

In [66]:
###This is code adapted from Chris Hench's notebook. I still had to load and join the text from the XML file.
extracted = [tokenize_only(x) for x in [' '.join(fetchtextclean('old-bailey/case-'+str(i)+'.xml')) for i in range(1, 314)]]
extracted = [[stemmer.stem(x) for x in i if x not in more_stops] for i in extracted]
extracted = [[x for x in i if x not in stopwords.words("english") and x not in punctuation] for i in extracted]
###it also has individual words split from each case in their respsective lists but the words are tokenized

In [219]:
###Full text from each case
corpus = []
for i in np.arange(1, 314):
    text = ' '.join([i for i in clean(' '.join(fetchtextclean('old-bailey/case-'+str(i)+'.xml'))).split() if i not in stop])
    corpus.append(text)

In [208]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.85,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

more_stops = ['q', 's', 'm', 'transportation', 'branding', 'came', 'said']
stemmer = SnowballStemmer("english")

tfidf_matrix = tfidf_vectorizer.fit_transform(corpus) #fit the vectorizer to the summaries

In [256]:
dictionary = corpora.Dictionary(extracted)

#dictionary.filter_extremes(no_below=50, no_above=.7)

new = [dictionary.doc2bow(i) for i in corp]

In [257]:
lda = models.LdaModel(new, num_topics=5,
                            id2word=dictionary, 
                            chunksize=25, 
                            update_every=5,
                            passes=10)

lda.show_topics()

[(0,
  '0.018*"child" + 0.017*"saw" + 0.012*"upon" + 0.012*"handkerchief" + 0.011*"went" + 0.010*"two" + 0.009*"woman" + 0.009*"time" + 0.008*"mr" + 0.008*"mother"'),
 (1,
  '0.021*"mr" + 0.014*"went" + 0.013*"upon" + 0.012*"man" + 0.012*"time" + 0.011*"two" + 0.011*"one" + 0.011*"year" + 0.010*"know" + 0.010*"money"'),
 (2,
  '0.025*"tankard" + 0.012*"prosecutor" + 0.012*"taylor" + 0.011*"fletcher" + 0.010*"never" + 0.010*"cloth" + 0.009*"court" + 0.008*"bibbey" + 0.008*"humberston" + 0.008*"jane"'),
 (3,
  '0.024*"went" + 0.015*"watch" + 0.014*"mr" + 0.014*"took" + 0.013*"saw" + 0.012*"man" + 0.011*"would" + 0.011*"go" + 0.011*"two" + 0.010*"door"'),
 (4,
  '0.043*"one" + 0.017*"good" + 0.014*"pair" + 0.014*"thing" + 0.014*"went" + 0.014*"see" + 0.013*"two" + 0.012*"took" + 0.012*"john" + 0.012*"val"')]

### Sci-kit learn LDA test

In [248]:
#taken from article on medium (https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730)
#just wanted to test it using sk and see if there were any differences
#it seems like the results are slightly better but I'm still not sure
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
no_features = 2000

In [249]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.9,
                                 min_df=.1, stop_words='english',
                                 use_idf=True)#, ngram_range=(1,5))
tf = tfidf_vectorizer.fit_transform(corpus)
tf_feature_names = tfidf_vectorizer.get_feature_names()

In [250]:
# Run LDA
lda = LatentDirichletAllocation(n_components=4, max_iter=3, 
                                learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)

no_top_words = 13
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
prisoner property john stealing handkerchief value money transportation thomas said summary guilty prosecutor
Topic 1:
john prisoner feb went pound stealing money thomas good house property said acquitted
Topic 2:
sheet linen value val certain mary widow lodging good let linnen stealing 12
Topic 3:
said prisoner went came watch house man mr took saw asked thing money


----
### UN Data

In [6]:
un = pd.read_csv('un-general-debates.csv')

In [252]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

more_stops = ['q', 's', 'm', 'transportation', 'branding']
stemmer = SnowballStemmer("english")

fitter = tfidf_vectorizer.fit_transform(un[un['session'] == 44]['text'].values) #fit the vectorizer to the summaries

In [None]:
un_t = [tokenize_only(x) for x in un[un['session'] == 44]['text'].values] #Taking all text from 44th session
un_t = [[stemmer.stem(x) for x in i if x not in more_stops] for i in un_t]
un_t = [[x for x in i if x not in stopwords.words("english") and x not in punctuation] for i in un_t]

In [254]:
#create a Gensim dictionary from the texts
un_dictionary = corpora.Dictionary(un_t)

un_dictionary.filter_extremes(no_below=10)

un_d_t = [un_dictionary.doc2bow(i) for i in un_t]

In [255]:
un_lda = models.LdaModel(un_d_t, num_topics=3,
                            id2word=un_dictionary, 
                            chunksize=25, 
                            update_every=10,
                            passes=5)

un_lda.show_topics()

[(0,
  '0.005*"cambodia" + 0.003*"cambodian" + 0.003*"iraq" + 0.003*"iran" + 0.003*"cyprus" + 0.003*"occup" + 0.003*"interfer" + 0.003*"1988" + 0.002*"viet" + 0.002*"angola"'),
 (1,
  '0.006*"europ" + 0.005*"democraci" + 0.004*"american" + 0.004*"european" + 0.003*"latin" + 0.003*"traffick" + 0.003*"individu" + 0.003*"terror" + 0.003*"violenc" + 0.002*"reform"'),
 (2,
  '0.003*"peace-keep" + 0.003*"traffick" + 0.003*"democraci" + 0.003*"small" + 0.003*"centuri" + 0.002*"oper" + 0.002*"report" + 0.002*"strong" + 0.002*"japan" + 0.002*"fee"')]