In [23]:
from gensim import corpora, models, similarities
from gensim import corpora
import gensim
import pandas as pd
import numpy as np
import os
from glob import glob
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import sys
sys.path.append(os.getcwd())

### part 1 : Store the bigram info in sparse matrix

In [16]:
# Use scripts /processing/bigram_feature.py produce two files:
# 1. rawdata/judge2exist_bigram.pkl
# 2. rawdata/judge2count_bigram.pkl

judge2exist_bigram = pd.read_pickle('../rawdata/judge2exist_bigram.pkl')
judge2count_bigram = pd.read_pickle('../rawdata/judge2count_bigram.pkl')
word2id = pd.read_pickle('../rawdata/word2id-2.pkl')

In [3]:
# Build a Dictionary Class in gensim package 
# Transform the original word2id 
judge_bigram_dict = corpora.Dictionary()
judge_bigram_dict.token2id.update(word2id) 

In [19]:
# Transform the bigram into sparse matrix
# This is not the final output
v = DictVectorizer(sparse=False)
judge2exist_bigram_matrix = \
    v.fit_transform(judge2exist_bigram.values())
judge2count_bigram_matrix = \
    v.fit_transform(judge2count_bigram.values())


In [20]:
# Transform the sparse matrix into corpus.
# This will save a lot of space and is very efficent to be transformed into sparse matrix
judge2exist_bigram_corpus = \
    gensim.matutils.Dense2Corpus(judge2exist_bigram_matrix.T)
judge2count_bigram_corpus = \
    gensim.matutils.Dense2Corpus(judge2count_bigram_matrix.T)

In [24]:
# Store all result
corpora.MmCorpus.serialize('../rawdata/judge2count_bigram_corpus.mm', judge2count_bigram_corpus)
corpora.MmCorpus.serialize('../rawdata/judge2exist_bigram_corpus.mm', judge2exist_bigram_corpus)
judge_bigram_dict.save('../rawdata/judge_bigram_dict.dict')

### part 2 : Reduce dimension & LDA

In [10]:
## Transform corpus into tfidf sequence
judge2count_bigram_corpus = corpora.MmCorpus('../rawdata/judge2count_bigram_corpus.mm')
judge_bigram_dict = corpora.Dictionary.load('../rawdata/judge_bigram_dict.dict')

In [None]:
tfidf = models.TfidfModel(judge2count_bigram_corpus)
judge2count_bigram_corpus_tfidf = tfidf[judge2count_bigram_corpus]

printitem(judge2count_bigram_corpus_tfidf)

In [29]:
k = 5 # topic num, intutivly , it can be set as classes num of judge cases 
k

5

In [30]:
 
# LSI transformation 
lsi = models.LsiModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k) 
judge2count_bigram_corpus_lsi = lsi[judge2count_bigram_corpus_tfidf]

In [31]:
printitem(judge2count_bigram_corpus_lsi)
# need multi test ,so no need to save (just save the best)
# lsi.save('rawdata/judge2count_bigram_lsi.lsi')  

[(0, 0.22146715548926538), (1, 0.03865311045642781), (2, -0.046704539146942725), (3, -0.041426743172759184), (4, 0.032230612684484093)]
[(0, 0.13746201014796552), (1, -0.078181728096736181), (2, -0.027989818874599878), (3, -0.01490720464033355), (4, 0.012790288191600083)]
[(0, 0.40712544047145532), (1, 0.12003322051787807), (2, -0.083355146102800606), (3, -0.22069959828371735), (4, 0.049440065706590305)]


In [38]:
# Random Projections, RP similiar to LDA
rp = models.RpModel(judge2count_bigram_corpus_tfidf, id2word= judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_RP = rp[judge2count_bigram_corpus_tfidf]

In [1]:
judge2count_bigram_corpus

NameError: name 'judge2count_bigram_corpus' is not defined

In [35]:
# LDA topic model Very slow, not cross validation
LDA = models.LdaModel(judge2count_bigram_corpus_tfidf, id2word=judge_bigram_dict, num_topics=k)
judge2count_bigram_corpus_lda = LDA[judge2count_bigram_corpus_tfidf]

In [None]:
printitem(judge2count_bigram_corpus_lda)

### part 3. Train with new features

In [2]:
data = pd.read_csv('../Holger_train.csv',decimal=',',index_col=0)
test = pd.read_csv('../Holger_test.csv',decimal=',',index_col=0)
data.head()

Unnamed: 0,trial,monsex,newcit,nocounts,judgeid,malejudge,judge_yearsonbench,demean_logsenttot,year1,year2,...,cr13,cr14,cr15,cr16,cr17,cr18,cr19,cr20,cr21,cr22
199761,0.0,0.0,0.0,2.0,"MOLLOY, DONALD W.",1.0,15.0,1.2169876098632812,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199371,1.0,0.0,0.0,1.0,"SHANSTROM, JACK D.",1.0,20.0,-0.3454446792602539,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146029,0.0,0.0,0.0,2.0,"SCOTT, JEANNE E.",0.0,11.0,0.2828431129455566,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219420,0.0,0.0,0.0,3.0,"WHITE, RONALD A.",1.0,6.0,1.1730012893676758,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174779,0.0,0.0,0.0,1.0,"HOVLAND, DANIEL L.",1.0,7.0,0.1704235076904297,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [50]:
judgeid2row ={k: i for i,k in enumerate(judge2count_bigram.keys()) }

In [38]:
#judge2count_bigram_corpus_RP
#corpus = corpora.MmCorpus('../rawdata/judge2count_bigram_corpus.mm')
count_bigram_corpus_matrix = np.zeros([1329, k])
for i, j in enumerate(judge2count_bigram_corpus_lsi):
    item = [v[1] for v in j]
    count_bigram_corpus_matrix[i,:] = item


In [39]:
count_bigram_corpus_matrix[0,:]


array([ 0.22146716,  0.03865311, -0.04670454, -0.04142674,  0.03223061])

In [44]:
topic_featuresDF = pd.DataFrame(count_bigram_corpus_matrix, columns=['topic'+str(i) for i in range(k)])
topic_featuresDF['judgeid'] = np.array(list(judge2count_bigram.keys()))
Train_data = pd.merge(data, topic_featuresDF, how='inner', on='judgeid')
#print(Train_data.describe())

Test_data = pd.merge(test, topic_featuresDF, how='inner', on='judgeid')
#print(Test_data.describe())
Train_data.iloc[[1,3,1000]]

Unnamed: 0,trial,monsex,newcit,nocounts,judgeid,malejudge,judge_yearsonbench,demean_logsenttot,year1,year2,...,cr18,cr19,cr20,cr21,cr22,topic0,topic1,topic2,topic3,topic4
1,0.0,0.0,0.0,1.0,"MOLLOY, DONALD W.",1.0,12.0,0.4133563041687011,0,0,...,0.0,0.0,0.0,0.0,0.0,0.378476,0.051452,-0.081332,0.008431,0.03677
3,0.0,0.0,0.0,1.0,"MOLLOY, DONALD W.",1.0,13.0,1.4375112056732178,0,0,...,0.0,0.0,0.0,0.0,0.0,0.378476,0.051452,-0.081332,0.008431,0.03677
1000,0.0,0.0,1.0,1.0,"WHITE, RONALD A.",1.0,6.0,-0.4525229930877685,0,0,...,0.0,0.0,0.0,0.0,0.0,0.131729,-0.020603,-0.011237,-0.015328,-0.00584


In [46]:
X_train_ori = pd.concat([Train_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Train_data.loc[:,'year2':'cr21']], axis = 1,ignore_index=True)
X_train = pd.concat([Train_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Train_data.loc[:,'year2':'topic4']], axis = 1,ignore_index=True)
X_test =  pd.concat([Test_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Test_data.loc[:,'year2':'topic4']], axis = 1,ignore_index=True)
X_test_ori =  pd.concat([Test_data.loc[:, ['nocounts','malejudge','judge_yearsonbench']] ,
                    Test_data.loc[:,'year2':'cr21']], axis = 1,ignore_index=True)
y_train = Train_data.loc[:,'demean_logsenttot'].values.astype(np.float32)
y_test = Test_data.loc[:,'demean_logsenttot'].values.astype(np.float32)


In [47]:
regr_rf = RandomForestRegressor(max_depth=30)
regr_rf.fit(X_train, y_train)

lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

lr_ori = linear_model.LinearRegression()
lr_ori.fit(X_train_ori, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [48]:
y_pred = regr_rf.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_ori = lr_ori.predict(X_test_ori)
error = np.sum(np.square(y_pred - y_test))/len(y_test)
error_lr = np.sum(np.square(y_pred_lr - y_test))/len(y_test)
error_ori = np.sum(np.square(y_pred_ori - y_test))/len(y_test)
print(error, error_lr, error_ori)



1.23983248219 1.15705574256 1.15739065631


### Aux function

In [7]:
def printitem(corpus,num = 1):
    for i, j in enumerate(corpus):
        print(j)
        if i > num:
            break 