## Sample Verdicts Analysis
##### Version: 1.0
##### Date: 06/12/2019

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re
import torch
import difflib
import thulac
import time
import threading
import queue
import json
from copy import copy
from fuzzywuzzy import fuzz
import mysql.connector
from mysql.connector import Error



#### Load data from MySQL database

In [2]:
# Load the verdict rows from the local MySQL `Judgements` database into
# `dataframe`.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment or a config file.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(host ='localhost', database = 'Judgements', user = 'root', password = '123456')
    # modify your feeding dataset by modifying the sql query below
    sql_select_query = "SELECT * FROM `2015n` WHERE `案件类型`= 2 AND `审判程序`='一审' LIMIT 100000;"
    cursor = connection.cursor()
    cursor.execute(sql_select_query)
    records = cursor.fetchall()
    print("Total number of rows is ", cursor.rowcount)
    # fetchall() yields plain tuples, so label the columns with the table's
    # field names; the following cells select columns by name.
    dataframe = pd.DataFrame(records, columns=cursor.column_names)
except Error as e:
    print("Error reading data from MySQL table", e)
finally:
    # Both handles start as None so a failed connect() cannot raise a
    # NameError here; the cursor is closed before its connection.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection is closed.")

We take 判决书 (written judgments) as an example.

In [3]:
# Replace every non-string 正文 value with a placeholder so that the
# regex-based extraction always receives a str.  A boolean mask with .loc
# writes into the frame itself, whereas chained `df[col].iloc[i] = ...`
# may write to a temporary copy (SettingWithCopyWarning).
is_text = dataframe['正文'].map(lambda value: isinstance(value, str)).astype(bool)
dataframe.loc[~is_text, '正文'] = '原始数据缺乏'

# Document-type keywords; check_type returns an index into this list.
doc_type_list = [u'判决书', u'裁定书', u'调解书', u'决定书', u'通知书', u'令']


def check_type(input_title):
    """Return the index of the first doc_type_list keyword found in a title.

    Keywords are tested in list order, so a title containing several of
    them is classified by the earliest entry.

    Args:
        input_title: the case title; non-string values (e.g. a missing
            title) are treated as "no match".

    Returns:
        The int index into ``doc_type_list``, or None when no keyword
        occurs in the title.
    """
    if not isinstance(input_title, str):
        # A missing title cannot be searched; report "unknown" instead of
        # raising TypeError from the `in` test.
        return None
    for position, doc_type in enumerate(doc_type_list):
        if doc_type in input_title:
            return position
    return None

# Tag each row with its document-type index, keep only the rows whose index
# is 0 (the first doc_type_list entry, 判决书), then drop the columns that
# are not used afterwards.
dataframe[u'文书类型'] = dataframe[u'案件名称'].apply(check_type)
is_verdict = dataframe[u'文书类型'] == 0.0
df = dataframe[is_verdict]
for unused_column in (u'year', u'文书类型', u'公诉机关'):
    df = df.drop([unused_column], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


#### Load Courts Information

In [4]:
# Load the three court reference tables.  The path is handed to read_csv
# directly so that encoding='utf-8' is actually used for decoding; with a
# handle from open() (no encoding argument) the text was already decoded
# with the platform default.  read_csv returns a DataFrame, so no extra
# pd.DataFrame(...) wrap is needed.
primary = pd.read_csv('resource/primary_courts.csv', encoding = 'utf-8')
middle = pd.read_csv('resource/middle_courts.csv', encoding = 'utf-8')
supreme = pd.read_csv('resource/supreme_courts.csv', encoding = 'utf-8')

# 'Unnamed: 0' is the label pandas gives a header-less first column
# (presumably a saved index -- confirm against the CSV files).
supreme = supreme.drop('Unnamed: 0', axis = 1)
middle = middle.drop('Unnamed: 0', axis = 1)
primary = primary.drop('Unnamed: 0', axis = 1)

#### Load Region Code

In [5]:
# Load the region-code table.  Passing the path (not a handle opened
# without an encoding) lets read_csv decode the file as UTF-8 on every
# platform.
region_code = pd.read_csv('resource/region_code.csv', encoding='utf-8')
# 'Unnamed: 0' is the label pandas gives a header-less first column
# (presumably a saved index -- confirm against the CSV file).
region_code = region_code.drop('Unnamed: 0', axis=1)

<br/><br/>

#### Parties Extraction

In [6]:
def Parties_Extraction(x, party):
    """Extract the text describing one kind of party from a verdict body.

    Args:
        x: the verdict body text (str).
        party: one of '公诉人', '原告', '被告', '律师', '委托人',
            '审判人员'.

    Returns:
        The matched text (several matches are joined with a single space),
        a Chinese "not found" message when nothing matches, or None when
        *party* is not one of the supported values.
    """
    if party == '公诉人':
        # The result is the leading "...法院" text with the first 法院
        # replaced by 检察院.
        res_GSJG = re.search(r'.*法院', x)
        if res_GSJG is None:
            return '抱歉，无法查找'
        return res_GSJG.group().replace('法院','检察院',1)

    if party == '原告':
        res_YG = re.search(r'原告.*?。|自诉人.*?。|申请执行人.*?。|申请人.*?。|执行人.*?。|审原告.*。|起诉人.*?。|原告:.*?。|上诉人.*?。|执行人.*。', x)
        if res_YG is None:
            return '抱歉，无法识别原告信息或不存在原告'
        return res_YG.group()

    if party == '被告':
        res_BG=re.findall(r'被告人.*?。|被告.*?。|犯罪嫌疑人.*?。|罪犯.*?。|嫌疑人.*?。|被申请人.*?。|被执行人.*?。|审被告.*?。|被告:.*?。|被申请执行人.*?。|被拘留人.*?。|被上诉人.*?。|.*被执行人.*?。', x)
        if not res_BG:
            return '抱歉，无法识别被告信息'
        if len(res_BG) == 1:
            return res_BG[0]
        # Several sentences matched: decide whether the first two refer to
        # the same person by comparing the two characters that follow the
        # role keyword (the start of the name or form of address).
        role_pattern = r'被告人|被告|犯罪嫌疑人|罪犯|嫌疑人|被申请人|被执行人|审被告|被告:|被申请执行人|被拘留人|被上诉人|审被告人'
        t_0 = re.search(role_pattern, res_BG[0]).group()
        t_1 = re.search(role_pattern, res_BG[1]).group()
        tt_0 = res_BG[0][len(t_0):len(t_0) + 2]
        tt_1 = res_BG[1][len(t_1):len(t_1) + 2]
        if tt_0 == tt_1:
            return res_BG[0]
        return " ".join(str(i) for i in res_BG)

    if party == '律师':
        res_LS=re.search(r'辩护律师.*。|辩护人.*。|代理人.*。|诉讼代理人.*。|.*律师|代理律师.*|律师.*', x)
        if res_LS is None:
            return '抱歉，无法识别律师信息或案件未含有律师'
        return res_LS.group()

    if party == '委托人':
        res_WT=re.search(r'.*委托.*。|委托人.*|委托.*', x)
        if res_WT is None:
            return '抱歉，无法识别委托人信息或案件未含有委托人'
        return res_WT.group()

    if party == '审判人员':
        res_SP=re.findall(r'审判员.*|陪审员.*|书记员.*', x)
        # findall returns an empty list (never None) when nothing matches,
        # so test for emptiness to reach the "not found" message.
        if not res_SP:
            return '抱歉，无法查找'
        return " ".join(str(i) for i in res_SP)

    # Unsupported party name.
    return None


def MoreinExtraction(df):
    """Add the 原告, 被告, 律师 and 委托人 columns to *df* and return it.

    Each new column holds the result of Parties_Extraction applied to the
    row's 正文 text for that party.  The 公诉人 and 审判人员 parties are
    not applied here.
    """
    for role in ('原告', '被告', '律师', '委托人'):
        # Bind `role` as a default so each lambda keeps its own party name.
        df[role] = df['正文'].apply(lambda body, role=role: Parties_Extraction(body, role))

    return df

In [7]:
def divide_dataframe(dataframe, parts=4):
    """Split *dataframe* row-wise into *parts* nearly equal chunks.

    Args:
        dataframe: the object to split (anything np.array_split accepts).
        parts: number of chunks; defaults to 4, the number of worker
            threads used by the cells that call this helper.

    Returns:
        A list of *parts* chunks in the original order; sizes differ by at
        most one element.
    """
    return np.array_split(dataframe, parts)

In [8]:
class extract_parties_threads(threading.Thread):
    """Worker thread that runs MoreinExtraction on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        # All workers share the module-level lock, so only one of them runs
        # the extraction at a time.  The `with` block guarantees the lock is
        # released even if the extraction raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            print(threading.active_count())
            MoreinExtraction(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [9]:
# Run the party extraction on four chunks of df, one worker thread per
# chunk, then stitch the chunks back together.
input_dataframe_list = divide_dataframe(df)
try:
    workers = [
        extract_parties_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

6
8
7
6
Exiting Main Thread


<br/><br/>
#### Court Staff Extraction

In [10]:
# Load the THULAC model once at module level; the extraction helpers in the
# following cells call model1.cut(..., text=False) and read the tag stored
# at index 1 of each returned item.
model1 = thulac.thulac()

Model loaded succeed


##### Prime Judge (审判长)

In [11]:
def find_pj(input_string, prime_judge_list):
    """Append the list of '审判长…' line fragments of *input_string* to *prime_judge_list*."""
    prime_judge_list.append(re.findall(r'审判长.*|审 判 长.*', input_string))


def extract_prime_judge(list1):
    """Fill the '审判长' column of *list1* from its '正文' column.

    For each row the last of one or two '审判长…' fragments is used; rows
    with zero or more than two fragments get '无有效信息'.

    Args:
        list1: DataFrame with a '正文' column of str values.

    Returns:
        *list1* itself, with the '审判长' column written in place.
    """
    prime_judge_list = []
    for body in list1['正文']:
        find_pj(body, prime_judge_list)

    # Collect one value per row and assign the whole column at once; chained
    # `list1[col].iloc[i] = ...` may write to a temporary copy.
    results = []
    for matches in prime_judge_list:
        if len(matches) not in (1, 2):
            results.append('无有效信息')
            continue
        w_wo_blank = matches[-1].replace(' ', '').replace('，', '').replace('：','')
        mark = w_wo_blank.find('审判长') + 3
        # NOTE(review): the final character is dropped, as in the original
        # code -- presumably a trailing '\r'; confirm against the raw data.
        pre_seg = w_wo_blank[mark:len(w_wo_blank)-1]
        if len(pre_seg) > 4:
            # Longer remainders: keep the last token tagged 'np' by the
            # segmenter, if there is one.
            for token in model1.cut(pre_seg, text=False):
                if token[1] == 'np':
                    pre_seg = token[0]
        results.append(pre_seg)

    list1['审判长'] = pd.Series(results, index=list1.index, dtype=object)
    return list1

In [12]:
class extract_pj_threads(threading.Thread):
    """Worker thread that runs extract_prime_judge on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        print ("Starting " + self.name)
        # All workers share the module-level lock, so only one of them runs
        # the extraction at a time.  The `with` block guarantees the lock is
        # released even if the extraction raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            extract_prime_judge(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [13]:
# Run the prime-judge extraction on four chunks of df, one worker thread
# per chunk, then stitch the chunks back together.
df['审判长'] = 'default value'
input_dataframe_list = divide_dataframe(df)

try:
    workers = [
        extract_pj_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

Starting thread-1
Starting thread-2Starting thread-3

Starting thread-4
Exiting Main Thread


##### Judge(审判员)

In [14]:
def extract_judge_helper(input_string):
    """Return the text after '审判员' in *input_string*.

    When more than four characters remain, the last token tagged 'np' by
    the segmenter replaces the remainder (if such a token exists).
    """
    mark = input_string.find('审判员') + 3
    pre_seg = input_string[mark:]
    if len(pre_seg) > 4:
        for token in model1.cut(pre_seg, text=False):
            if token[1] == 'np':
                pre_seg = token[0]
    return pre_seg


def find_judge(input_string, judge_list):
    """Append the list of '审判员…' line fragments of *input_string* to *judge_list*."""
    judge_list.append(re.findall(r'审判员.*|审 判 员.*', input_string))


def _clean_judge_fragment(fragment):
    """Strip spaces, full-width comma/colon and backslashes from a fragment."""
    return fragment.replace(' ', '').replace('，', '').replace('：','').replace('\\', '')


def extract_judges(list1):
    """Fill the '审判员' column of *list1* from its '正文' column.

    Row values are a list of names, or '无有效信息' when the number of
    fragments is neither one nor two.

    Args:
        list1: DataFrame with a '正文' column of str values.

    Returns:
        *list1* itself, with the '审判员' column written in place.
    """
    judge_list = []
    for body in list1['正文']:
        find_judge(body, judge_list)

    # Collect one value per row and assign the column at once: chained
    # `list1[col].iloc[i] = ...` may write to a temporary copy, and a list
    # value cannot be stored reliably through a scalar .iloc position.
    results = []
    for matches in judge_list:
        if len(matches) == 1 or (len(matches) == 2 and len(matches[0]) > 15):
            # One fragment, or a long first fragment followed by a second
            # one: only the last fragment is used.
            judge_result = [extract_judge_helper(_clean_judge_fragment(matches[-1]))]
        elif len(matches) == 2:
            # Two short fragments: one name from each.
            judge_result = [extract_judge_helper(_clean_judge_fragment(m)) for m in matches]
        else:
            judge_result = '无有效信息'
        results.append(judge_result)

    list1['审判员'] = pd.Series(results, index=list1.index, dtype=object)
    return list1

In [15]:
class extract_judges_threads(threading.Thread):
    """Worker thread that runs extract_judges on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        print ("Starting " + self.name)
        # All workers share the module-level lock, so only one of them runs
        # the extraction at a time.  The `with` block guarantees the lock is
        # released even if the extraction raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            extract_judges(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [16]:
# Run the judge extraction on four chunks of df, one worker thread per
# chunk, then stitch the chunks back together.
df['审判员'] = 'default value'
input_dataframe_list = divide_dataframe(df)

try:
    workers = [
        extract_judges_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

Starting thread-1Starting thread-2

Starting thread-3
Starting thread-4
Exiting Main Thread


##### Clerk(书记员)

In [17]:
def extract_clerk_helper(input_string):
    """Return the text after '书记员' in *input_string*.

    When more than four characters remain, the last token tagged 'np' by
    the segmenter replaces the remainder (if such a token exists).
    """
    mark = input_string.find('书记员') + 3
    pre_seg = input_string[mark:]
    if len(pre_seg) > 4:
        for token in model1.cut(pre_seg, text=False):
            if token[1] == 'np':
                pre_seg = token[0]
    return pre_seg


def find_clerk(input_string, clerk_list):
    """Append the list of '书记员…' line fragments of *input_string* to *clerk_list*."""
    clerk_list.append(re.findall(r'书记员.*|书 记 员.*', input_string))


def _clean_clerk_fragment(fragment):
    """Strip spaces, full-width comma/colon and backslashes from a fragment."""
    return fragment.replace(' ', '').replace('，', '').replace('：','').replace('\\', '')


def extract_clerk(list1):
    """Fill the '书记员' column of *list1* from its '正文' column.

    Row values are a list of names, or '无有效信息' when the number of
    fragments is neither one nor two.

    Args:
        list1: DataFrame with a '正文' column of str values.

    Returns:
        *list1* itself, with the '书记员' column written in place.
    """
    clerk_list = []
    for body in list1['正文']:
        find_clerk(body, clerk_list)

    # A single `clerk_result` name is used in every branch.  The original
    # built `clerk_result` for two short fragments but wrote `judge_result`
    # to the row, i.e. a NameError or the previous row's value.
    results = []
    for matches in clerk_list:
        if len(matches) == 1 or (len(matches) == 2 and len(matches[0]) > 15):
            # One fragment, or a long first fragment followed by a second
            # one: only the last fragment is used.
            clerk_result = [extract_clerk_helper(_clean_clerk_fragment(matches[-1]))]
        elif len(matches) == 2:
            # Two short fragments: one name from each.
            clerk_result = [extract_clerk_helper(_clean_clerk_fragment(m)) for m in matches]
        else:
            clerk_result = '无有效信息'
        results.append(clerk_result)

    # Assign the whole column at once instead of chained .iloc writes,
    # which may land on a temporary copy.
    list1['书记员'] = pd.Series(results, index=list1.index, dtype=object)
    return list1

In [18]:
class extract_clerks_threads(threading.Thread):
    """Worker thread that runs extract_clerk on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        print ("Starting " + self.name)
        # All workers share the module-level lock, so only one of them runs
        # the extraction at a time.  The `with` block guarantees the lock is
        # released even if the extraction raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            extract_clerk(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [19]:
# Run the clerk extraction on four chunks of df, one worker thread per
# chunk, then stitch the chunks back together.
df['书记员'] = 'default value'
input_dataframe_list = divide_dataframe(df)

try:
    workers = [
        extract_clerks_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

Starting thread-1
Starting thread-2
Starting thread-3
Starting thread-4
Exiting Main Thread


##### Jury(陪审员)

In [20]:
def extract_jury_helper(input_string):
    """Return the text after '陪审员' in *input_string*.

    When more than four characters remain, the last token tagged 'np' by
    the segmenter replaces the remainder (if such a token exists).
    """
    mark = input_string.find('陪审员') + 3
    pre_seg = input_string[mark:]
    if len(pre_seg) > 4:
        for token in model1.cut(pre_seg, text=False):
            if token[1] == 'np':
                pre_seg = token[0]
    return pre_seg


def find_jury(input_string, jury_list):
    """Append the list of '陪审员…' line fragments of *input_string* to *jury_list*."""
    jury_list.append(re.findall(r'陪审员.*|陪 审 员.*', input_string))


def extract_jury(list1):
    """Fill the '陪审员' column of *list1* from its '正文' column.

    Row values are a list of names, or '无有效信息' when no fragment is
    found.  A row with a single long fragment in which the segmenter finds
    no 'np' token keeps its current value.

    Args:
        list1: DataFrame with '正文' (str) and an existing '陪审员' column.

    Returns:
        *list1* itself, with the '陪审员' column written in place.
    """
    jury_list = []
    for body in list1['正文']:
        find_jury(body, jury_list)

    # Start from the current column values so untouched rows are preserved,
    # then assign the whole column at once; chained
    # `list1[col].iloc[i] = ...` may write to a temporary copy.
    results = list1['陪审员'].tolist()
    for position, matches in enumerate(jury_list):
        if len(matches) == 0:
            results[position] = '无有效信息'
        elif len(matches) == 1 and len(matches[0]) > 15:
            # One long fragment: take the first token tagged 'np'.
            for token in model1.cut(matches[0], text=False):
                if token[1] == 'np':
                    results[position] = [token[0]]
                    break
        else:
            results[position] = [
                extract_jury_helper(m.replace(' ', '').replace('：', '').replace('\\', ''))
                for m in matches
            ]

    list1['陪审员'] = pd.Series(results, index=list1.index, dtype=object)
    return list1

In [21]:
class extract_juries_threads(threading.Thread):
    """Worker thread that runs extract_jury on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        print ("Starting " + self.name)
        # All workers share the module-level lock, so only one of them runs
        # the extraction at a time.  The `with` block guarantees the lock is
        # released even if the extraction raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            extract_jury(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [22]:
# Run the jury extraction on four chunks of df, one worker thread per
# chunk, then stitch the chunks back together.
df['陪审员'] = 'default value'
input_dataframe_list = divide_dataframe(df)

try:
    workers = [
        extract_juries_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

Starting thread-1
Starting thread-2
Starting thread-3
Starting thread-4
Exiting Main Thread


<br/><br/>
#### Courts Lists

In [23]:
# Stack the primary, middle and supreme court tables into one lookup table
# with a fresh 0..n-1 index.  pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
courts_list = pd.concat([primary, middle, supreme], ignore_index = True)

In [24]:
# Notebook cell expression: shows the number of rows in courts_list.
len(courts_list)

3530

In [25]:
def court_match(courts):
    """Fill '上诉法院' and '法院等级' in *courts* by fuzzy-matching court names.

    Each '审理法院' value is compared against courts_list['法院名称'] in
    order; the first entry whose fuzz.ratio exceeds 75 supplies the
    '上诉法院' and '级别' values.  Rows without such an entry keep their
    current values.

    Args:
        courts: DataFrame with '审理法院' and existing '上诉法院' and
            '法院等级' columns; modified in place.
    """
    known_names = courts_list['法院名称'].tolist()
    appeal_courts = courts_list['上诉法院'].tolist()
    court_levels = courts_list['级别'].tolist()

    # The scan result depends only on the court name, so remember it per
    # name instead of rescanning the whole reference list for every row.
    lookup_cache = {}

    def _lookup(court_name):
        """Return (appeal court, level) for the first match, or None."""
        if court_name not in lookup_cache:
            hit = None
            for position, known_name in enumerate(known_names):
                if fuzz.ratio(court_name, known_name) > 75:
                    hit = (appeal_courts[position], court_levels[position])
                    break
            lookup_cache[court_name] = hit
        return lookup_cache[court_name]

    # Start from the current values and assign whole columns at the end;
    # chained `courts[col].iloc[i] = ...` may write to a temporary copy.
    new_appeal = courts['上诉法院'].tolist()
    new_level = courts['法院等级'].tolist()
    for row, court_name in enumerate(courts['审理法院']):
        hit = _lookup(court_name)
        if hit is not None:
            new_appeal[row], new_level[row] = hit

    courts['上诉法院'] = pd.Series(new_appeal, index=courts.index, dtype=object)
    courts['法院等级'] = pd.Series(new_level, index=courts.index, dtype=object)

In [26]:
class court_match_threads(threading.Thread):
    """Worker thread that runs court_match on one DataFrame chunk."""

    def __init__(self, threadID, name, df):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.df = df

    def run(self):
        print ("Starting " + self.name)
        # All workers share the module-level lock, so only one of them runs
        # the matching at a time.  The `with` block guarantees the lock is
        # released even if the matching raises; otherwise the remaining
        # workers would wait forever and join() would never return.
        with threadLock:
            court_match(self.df)


# Lock shared by the worker threads and the list used to join them.
threadLock = threading.Lock()
threads = []

In [27]:
# Run the court matching on four chunks of df, one worker thread per chunk,
# then stitch the chunks back together.
df['上诉法院'] = 'default value'
df['法院等级'] = 'default value'
input_dataframe_list = divide_dataframe(df)

try:
    workers = [
        court_match_threads(number, "thread-%d" % number, chunk)
        for number, chunk in enumerate(input_dataframe_list, start=1)
    ]

    # Start new Threads
    for worker in workers:
        worker.start()

    # Add threads to thread list
    threads.extend(workers)

    # Wait for all threads to complete
    for t in threads:
        t.join()
    print ("Exiting Main Thread")
except Exception as exc:
    # Report the actual cause instead of hiding it behind a bare `except:`.
    print("Error: unable to start thread", exc)
finally:
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    df = pd.concat(input_dataframe_list)

Starting thread-1
Starting thread-2
Starting thread-3
Starting thread-4
Exiting Main Thread


Follow-up work: 匹配案号和地区代码 (match case numbers to region codes)