In [20]:
from gensim import corpora, models, similarities
from gensim import corpora
import gensim
import pandas as pd
import numpy as np
import os
from glob import glob
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
import sys
sys.path.append(os.getcwd())

### part 1 : Store the bigram info in sparse matrix

In [14]:
# Use scripts /processing/bigram_feature.py produce two files:
# 1. rawdata/judge2exist_bigram.pkl
# 2. rawdata/judge2count_bigram.pkl

judge2exist_bigram = pd.read_pickle('rawdata/judge2exist_bigram.pkl')
judge2count_bigram = pd.read_pickle('rawdata/judge2count_bigram.pkl')
word2id = pd.read_pickle('rawdata/word2id-2.pkl')

In [None]:
# Build a Dictionary Class in gensim package 
# Transform the original word2id 
judge_bigram_dict = corpora.Dictionary()
judge_bigram_dict.token2id.update(word2id) 

In [15]:
# Transform the bigram into sparse matrix
# This is not the final output
v = DictVectorizer(sparse=True)
judge2exist_bigram_matrix = \
    v.fit_transform(list(judge2exist_bigram.values()))
judge2count_bigram_matrix = \
    v.fit_transform(list(judge2count_bigram.values()))

In [16]:
# Transform the sparse matrix into corpus.
# This will save a lot of space and is very efficent to be transformed into sparse matrix
judge2exist_bigram_corpus = \
    gensim.matutils.Sparse2Corpus(judge2exist_bigram_matrix)
judge2count_bigram_corpus = \
    gensim.matutils.Sparse2Corpus(judge2count_bigram_matrix)

In [17]:
# Store all result
corpora.MmCorpus.serialize('rawdata/judge2count_bigram_corpus.mm', judge2count_bigram_corpus)
corpora.MmCorpus.serialize('rawdata/judge2exist_bigram_corpus.mm', judge2exist_bigram_corpus)
judge_bigram_dict.save('rawdata/judge_bigram_dict.dict')

### part 2 : Reduce dimension & LDA

In [23]:
## Transform corpus into tfidf sequence
tfidf = models.TfidfModel(judge2count_bigram_corpus)
judge2count_bigram_corpus_tfidf = tfidf[judge2count_bigram_corpus]

for i, j in enumerate(judge2count_bigram_corpus_tfidf):
    if i < 1:
        print(j)

[(81, 0.14379461029531085), (135, 0.24702670149793934), (224, 0.31341971548801578), (293, 0.18005769888600365), (314, 0.32298572486773902), (355, 0.17591398613276835), (396, 0.16863834016685586), (471, 0.20723680930709634), (528, 0.19212069331379419), (532, 0.20677254408067597), (547, 0.14846813320564564), (669, 0.18816902079619865), (716, 0.1372423967148651), (744, 0.14209453940340666), (832, 0.35717832548497586), (867, 0.088009951321526475), (921, 0.17440149164067467), (947, 0.12318291796659869), (1017, 0.12154964579983747), (1050, 0.19126321248965694), (1074, 0.15573675689583219), (1075, 0.11803053175591176), (1091, 0.21364711057572031), (1131, 0.13426834884096939), (1234, 0.16072621997209127), (1248, 0.18449835492756089), (1312, 0.10419237715512651)]


In [24]:
k = 50 # topic num, intutivly , it can be set as classes num of judge cases 
# LSI transformation 
lsi = models.LsiModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k) 
judge2count_bigram_corpus_lsi = lsi[judge2count_bigram_corpus_tfidf]

In [32]:
printitem(judge2count_bigram_corpus_lsi)
# need multi test ,so no need to save (just save the best)
# lsi.save('rawdata/judge2count_bigram_lsi.lsi')  

[(0, 0.22702029167858667), (1, 0.11576988922128108), (2, -0.12930069061453034), (3, -0.085109293139851785), (4, -0.0037917176948431665), (5, -0.017327180982774061), (6, -0.14393420748913677), (7, -0.12201518922598323), (8, -0.0063732500092567156), (9, -0.044250939409882888), (10, 0.072021048957832234), (11, -0.025669667193074154), (12, -0.040092587036761912), (13, 0.12781107061898239), (14, -0.0027268898335965136), (15, 0.036441291710643392), (16, -0.00018198259555313179), (17, 0.043685747382247619), (18, 0.01795400993104581), (19, 0.10392265500653677), (20, 0.014197782756762254), (21, -0.0084204061106709083), (22, -0.04093197912835974), (23, 0.035751124923679052), (24, 0.03493244883871429), (25, -0.014040355865578233), (26, 0.0020649800369672922), (27, 0.0203647931276521), (28, -0.026537192851468799), (29, -0.01485693018573068), (30, -0.03698325553836139), (31, -0.010912860194156106), (32, 0.047440556270716021), (33, 0.0090172906548815247), (34, -0.015265682288138765), (35, 0.01675074

In [33]:
# Random Projections, RP similiar to LDA
rp = models.RpModel(judge2count_bigram_corpus_tfidf, id2word= judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_RP = rp[judge2count_bigram_corpus_tfidf]

In [34]:
printitem(judge2count_bigram_corpus_RP)

[(0, 0.22702029167858667), (1, 0.11576988922128108), (2, -0.12930069061453034), (3, -0.085109293139851785), (4, -0.0037917176948431665), (5, -0.017327180982774061), (6, -0.14393420748913677), (7, -0.12201518922598323), (8, -0.0063732500092567156), (9, -0.044250939409882888), (10, 0.072021048957832234), (11, -0.025669667193074154), (12, -0.040092587036761912), (13, 0.12781107061898239), (14, -0.0027268898335965136), (15, 0.036441291710643392), (16, -0.00018198259555313179), (17, 0.043685747382247619), (18, 0.01795400993104581), (19, 0.10392265500653677), (20, 0.014197782756762254), (21, -0.0084204061106709083), (22, -0.04093197912835974), (23, 0.035751124923679052), (24, 0.03493244883871429), (25, -0.014040355865578233), (26, 0.0020649800369672922), (27, 0.0203647931276521), (28, -0.026537192851468799), (29, -0.01485693018573068), (30, -0.03698325553836139), (31, -0.010912860194156106), (32, 0.047440556270716021), (33, 0.0090172906548815247), (34, -0.015265682288138765), (35, 0.01675074

In [None]:
# LDA topic model Very slow, not cross validation
LDA = models.LdaModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_lda = LDA[judge2count_bigram_corpus_tfidf]

In [None]:
printitem(judge2count_bigram_corpus_lda)

### part 3. Train with new features

### Aux function

In [30]:
def printitem(corpus,num = 1):
    for i, j in enumerate(judge2count_bigram_corpus_lsi):
        if i < num:
            print(j)
            break 