In [97]:
from gensim import corpora, models, similarities
from gensim import corpora
import gensim
import pandas as pd
import numpy as np
import os
from glob import glob
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
import sys
sys.path.append(os.getcwd())

### part 1 : Store the bigram info in sparse matrix

In [2]:
# Use scripts /processing/bigram_feature.py produce two files:
# 1. rawdata/judge2exist_bigram.pkl
# 2. rawdata/judge2count_bigram.pkl

judge2exist_bigram = pd.read_pickle('../rawdata/judge2exist_bigram.pkl')
judge2count_bigram = pd.read_pickle('../rawdata/judge2count_bigram.pkl')
word2id = pd.read_pickle('../rawdata/word2id-2.pkl')

In [3]:
# Build a Dictionary Class in gensim package 
# Transform the original word2id 
judge_bigram_dict = corpora.Dictionary()
judge_bigram_dict.token2id.update(word2id) 

In [19]:
# Transform the bigram into sparse matrix
# This is not the final output
v = DictVectorizer(sparse=False)
judge2exist_bigram_matrix = \
    v.fit_transform(judge2exist_bigram.values())
judge2count_bigram_matrix = \
    v.fit_transform(judge2count_bigram.values())


In [20]:
# Transform the sparse matrix into corpus.
# This will save a lot of space and is very efficent to be transformed into sparse matrix
judge2exist_bigram_corpus = \
    gensim.matutils.Dense2Corpus(judge2exist_bigram_matrix.T)
judge2count_bigram_corpus = \
    gensim.matutils.Dense2Corpus(judge2count_bigram_matrix.T)

In [24]:
# Store all result
corpora.MmCorpus.serialize('../rawdata/judge2count_bigram_corpus.mm', judge2count_bigram_corpus)
corpora.MmCorpus.serialize('../rawdata/judge2exist_bigram_corpus.mm', judge2exist_bigram_corpus)
judge_bigram_dict.save('../rawdata/judge_bigram_dict.dict')

### part 2 : Reduce dimension & LDA

In [35]:
## Transform corpus into tfidf sequence
tfidf = models.TfidfModel(judge2count_bigram_corpus)
judge2count_bigram_corpus_tfidf = tfidf[judge2count_bigram_corpus]

printitem(judge2count_bigram_corpus_tfidf)

[(10, 0.0045885772370682911), (41, 0.0029312132672302416), (69, 0.018780266232666581), (105, 0.00056813914195320836), (157, 0.0012223489724504869), (183, 0.0029284121355802029), (186, 0.0039173579744159314), (210, 0.0032685596420711357), (223, 0.0013444403696192331), (228, 0.0021024314124816154), (247, 0.0014953240788362168), (289, 0.0041078976921541371), (299, 0.00088199282108117479), (305, 0.00021111391574777378), (306, 0.004233124999207802), (308, 0.00036051401299145935), (325, 0.0011395261589776298), (375, 0.0038312354255007619), (389, 0.0025424752762282839), (408, 0.0023898234377130942), (411, 0.0032210459133258365), (436, 0.0086170674975708308), (437, 0.0018305044152123018), (467, 0.011699722674548559), (469, 0.0054835904993650276), (476, 0.0023983309190832502), (505, 0.0017988899462724008), (507, 0.0047899587986131925), (513, 0.00081530506955443242), (538, 0.0026133889797732165), (540, 0.0013090363602654856), (541, 0.0037725766031991418), (551, 0.0036431940418002527), (569, 0.00

In [36]:
k = 15 # topic num, intutivly , it can be set as classes num of judge cases 
k

15

In [None]:
 
# LSI transformation 
lsi = models.LsiModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k) 
judge2count_bigram_corpus_lsi = lsi[judge2count_bigram_corpus_tfidf]

In [38]:
printitem(judge2count_bigram_corpus_lsi)
# need multi test ,so no need to save (just save the best)
# lsi.save('rawdata/judge2count_bigram_lsi.lsi')  

[(0, 0.22702089203507672), (1, 0.1161425599617855), (2, -0.13088461711903801), (3, -0.080007449247289961), (4, -0.0014506499677065256), (5, -0.020984207371261707), (6, -0.14781674019120772), (7, -0.10839487494378698), (8, -0.019035355035598323), (9, -0.034177055410645418), (10, 0.097467017195790956), (11, -0.020935647778118158), (12, 0.060459156032972534), (13, -0.08919619874615381), (14, -0.020535111708430996), (15, 0.043004962987314176), (16, -0.017602612425187732), (17, 0.031188001694523237), (18, 0.060788215540561651), (19, -0.077357602256327956), (20, 0.020582840615583919), (21, -0.0019986115974645405), (22, 0.10500346366267059), (23, -0.006180529809155893), (24, 0.032678694031886397), (25, -0.032487694140800202), (26, -0.002972225835906716), (27, -0.0095357552148094826), (28, -0.034521964320233821), (29, -0.02507920424659164), (30, -0.013328883356515761), (31, -0.025125381684979783), (32, -0.01202617164325373), (33, -0.02356698631466712), (34, 0.016840833341442877), (35, 0.037535

In [38]:
# Random Projections, RP similiar to LDA
rp = models.RpModel(judge2count_bigram_corpus_tfidf, id2word= judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_RP = rp[judge2count_bigram_corpus_tfidf]

In [39]:
printitem(judge2count_bigram_corpus_RP)

[(0, 0.018067317083477974), (1, 0.014952360652387142), (2, 0.14919982850551605), (3, -0.01983129419386387), (4, -0.2852846384048462), (5, 0.09467584639787674), (6, -0.2127978354692459), (7, -0.16650311648845673), (8, 0.029342180117964745), (9, 0.38679832220077515), (10, -0.12331011891365051), (11, 0.30526137351989746), (12, 0.39013203978538513), (13, -0.03866221383213997), (14, 0.016050368547439575)]
[(0, -0.22670982778072357), (1, -0.24100913107395172), (2, -0.10226882249116898), (3, -0.25399720668792725), (4, -0.18078108131885529), (5, -0.698803722858429), (6, -0.3115491569042206), (7, 0.0640162006020546), (8, 0.03255718946456909), (9, 0.1393597424030304), (10, -0.0928981676697731), (11, 0.20930634438991547), (12, -0.2824334502220154), (13, 0.011020183563232422), (14, -0.020897114649415016)]
[(0, 0.1887323260307312), (1, -0.10662124305963516), (2, -0.16487562656402588), (3, 0.19730235636234283), (4, 0.33693891763687134), (5, -0.20291918516159058), (6, 0.3520713150501251), (7, -0.0237

In [35]:
# LDA topic model Very slow, not cross validation
LDA = models.LdaModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_lda = LDA[judge2count_bigram_corpus_tfidf]

In [None]:
printitem(judge2count_bigram_corpus_lda)

### part 3. Train with new features

In [115]:
data = pd.read_csv('../Holger_train.csv',decimal=',',index_col=0)
test = pd.read_csv('../Holger_test.csv',decimal=',',index_col=0)
data.head()

Unnamed: 0,trial,newcit,nocounts,judgeid,malejudge,judge_yearsonbench,demean_logsenttot,year1,year2,year3,...,cr13,cr14,cr15,cr16,cr17,cr18,cr19,cr20,cr21,cr22
92793,0.0,0.0,1.0,"BARBOUR, WILLIAM H., JR.",1.0,21.0,-1.188110589981079,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89902,0.0,1.0,2.0,"BERRIGAN, HELEN G.",0.0,16.0,-1.0816633701324463,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213938,0.0,1.0,1.0,"MURGUIA, CARLOS",1.0,9.0,-0.8778231143951416,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119446,0.0,0.0,3.0,"JOHNSTONE, EDWARD H.",1.0,28.0,0.9541592597961426,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29200,0.0,0.0,3.0,"DAVIS, LEGROME D.",1.0,3.0,0.7817468643188477,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
judgeid2row ={k: i for i,k in enumerate(judge2count_bigram.keys()) }

In [47]:
#judge2count_bigram_corpus_RP
#corpus = corpora.MmCorpus('../rawdata/judge2count_bigram_corpus.mm')
count_bigram_corpus_matrix = np.zeros([1329, k])
for i, j in enumerate(judge2count_bigram_corpus_RP):
    item = [v[1] for v in j]
    count_bigram_corpus_matrix[i,:] = item


In [78]:
count_bigram_corpus_matrix[0,:]


array([ 0.01806732,  0.01495236,  0.14919983, -0.01983129, -0.28528464,
        0.09467585, -0.21279784, -0.16650312,  0.02934218,  0.38679832,
       -0.12331012,  0.30526137,  0.39013204, -0.03866221,  0.01605037])

In [120]:
topic_featuresDF = pd.DataFrame(count_bigram_corpus_matrix, columns=['topic'+str(i) for i in range(k)])
topic_featuresDF['judgeid'] = np.array(list(judge2count_bigram.keys()))
Train_data = pd.merge(data, topic_featuresDF, how='inner', on='judgeid')
#print(Train_data.describe())

Test_data = pd.merge(test, topic_featuresDF, how='inner', on='judgeid')
#print(Test_data.describe())

In [134]:
X_train = pd.concat([Train_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Train_data.loc[:,'year2':'topic14']], axis = 1,ignore_index=True)
X_test =  pd.concat([Test_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Test_data.loc[:,'year2':'topic14']], axis = 1,ignore_index=True)
y_train = Train_data.loc[:,'demean_logsenttot'].values.astype(np.float32)
y_test = Test_data.loc[:,'demean_logsenttot'].values.astype(np.float32)

In [135]:
regr_rf = RandomForestRegressor(max_depth=30)
regr_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

array([ 0.43592501,  0.74713755, -1.06815219, ..., -1.8982569 ,
       -0.34765935,  0.02703404], dtype=float32)

In [138]:
y_pred = regr_rf.predict(X_test)

error = np.sum(np.square(y_pred - y_test))/len(y_test)
print(error)

1.30242092899


### Aux function

In [12]:
def printitem(corpus,num = 1):
    for i, j in enumerate(corpus):
        print(j)
        if i > num:
            break 