In [1]:
import tqdm, json, os, re, string, pprint, itertools, seaborn, collections, numpy as np, csv, requests
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

from sklearn.model_selection import GridSearchCV
import JudgmentsProcessor
import JudgmentsClassifier

In [2]:
def process_judgment(judgment):
    data_to_query = (",").join(judgment)
    r = requests.post(data=data_to_query.encode("utf-8"), url="http://localhost:9200")
    response_text = r.text
    splited_response = response_text.splitlines()
    splited_response = [" ".join(x.replace("\t", " ").replace("none", "")[1:].split(":")[:2][:1]).replace(" ", ":")
                        for x in splited_response if ":" in x]
    splited_response = [x.split(":")[0] for x in splited_response]
    return splited_response

judgments = []
signatures = []
judgment_succeeded = []

def read_file(file_path, is_transforming=False):
   
    print ("Processing file" + file_path)
    with open (file_path) as file:
        json_content = json.load(file)
        item_count = 0
        for item in tqdm.tqdm(json_content):
            judgment = []
            if item['courtType'] in ['COMMON', 'SUPREME']:
                courtCases = item['courtCases']
                signatures_for_judgment = []
                for courtCase in courtCases:
                    signature = courtCase['caseNumber']
                    signatures_for_judgment.append(signature)
                signatures.append(signatures_for_judgment)
                item_count += 1
                text_content = re.sub("<.*?>", "", item["textContent"])
                text_content = text_content.replace('-\n', '')
                word_content = text_content.split()
                topicSpecificPunctuation = '„”–§…«»'
                translator = str.maketrans('', '', string.punctuation+topicSpecificPunctuation)

                for word in word_content:
                    word = word.translate(translator).lower()
                    if len(word)>0:
                        judgment.append(word)
                if not is_transforming:
                    judgments.append(judgment)
                else:
                    processed = process_judgment(judgment)
                    judgments.append(processed)
                    judgment_succeeded.append(len(judgment) > 0)

In [3]:
def read_all_judgments_from_2018(is_transforming=False):
        for filename in os.listdir("../data_filtered_2/"):
            read_file("../data_filtered_2/" + filename, is_transforming)

In [4]:
read_all_judgments_from_2018()

100%|██████████| 9/9 [00:00<00:00, 228.57it/s]
 47%|████▋     | 47/100 [00:00<00:00, 436.68it/s]

Processing file../data_filtered_2/judgments-3163.json
Processing file../data_filtered_2/judgments-3168.json


100%|██████████| 100/100 [00:00<00:00, 354.72it/s]
 25%|██▌       | 25/100 [00:00<00:00, 244.52it/s]

Processing file../data_filtered_2/judgments-3164.json


100%|██████████| 100/100 [00:00<00:00, 278.24it/s]
 42%|████▏     | 42/100 [00:00<00:00, 397.03it/s]

Processing file../data_filtered_2/judgments-3171.json


100%|██████████| 100/100 [00:00<00:00, 346.76it/s]
 25%|██▌       | 25/100 [00:00<00:00, 247.14it/s]

Processing file../data_filtered_2/judgments-3165.json


100%|██████████| 100/100 [00:00<00:00, 254.97it/s]
 31%|███       | 31/100 [00:00<00:00, 301.98it/s]

Processing file../data_filtered_2/judgments-3167.json


100%|██████████| 100/100 [00:00<00:00, 363.80it/s]
 31%|███       | 31/100 [00:00<00:00, 297.00it/s]

Processing file../data_filtered_2/judgments-3169.json


100%|██████████| 100/100 [00:00<00:00, 288.34it/s]
 37%|███▋      | 30/81 [00:00<00:00, 297.83it/s]

Processing file../data_filtered_2/judgments-3173.json


100%|██████████| 81/81 [00:00<00:00, 323.96it/s]
 36%|███▌      | 36/100 [00:00<00:00, 355.72it/s]

Processing file../data_filtered_2/judgments-3172.json


100%|██████████| 100/100 [00:00<00:00, 388.11it/s]
 37%|███▋      | 37/100 [00:00<00:00, 362.83it/s]

Processing file../data_filtered_2/judgments-3166.json


100%|██████████| 100/100 [00:00<00:00, 333.45it/s]
 31%|███       | 31/100 [00:00<00:00, 285.67it/s]

Processing file../data_filtered_2/judgments-3170.json


100%|██████████| 100/100 [00:00<00:00, 293.81it/s]


In [5]:
lists = [False] * 775
lists.append(True)
lists.append(True)
judge_processor = JudgmentsProcessor.JudgmentsProcessor(judgments, signatures, 'filteredRank.csv', lists)

In [6]:
X, Y = judge_processor.process_judgments()

In [7]:
print (len(X))
print (len(Y))

777
777


In [8]:
judgment_classifier = JudgmentsClassifier.JudgmentsClassifier(X, Y)
result_dict, clf, hyper_params = judgment_classifier.transform_and_train_classifier()

  y = column_or_1d(y, warn=True)


{'accuracy_score': 0.93846153846153846, 'classification_report': '                                            precision    recall  f1-score   support\n\n                            sprawy cywilne       0.81      0.98      0.88        43\nsprawy z zakresu ubezpieczenia społecznego       1.00      0.99      0.99        68\n                              sprawy karne       1.00      0.97      0.99        34\n                        sprawy gospodarcze       0.93      0.67      0.78        21\n             sprawy w zakresie prawa pracy       1.00      0.87      0.93        15\n        sprawy w zakresie prawa rodzinnego       1.00      1.00      1.00         6\n                      sprawy o wykroczenia       0.89      1.00      0.94         8\n\n                               avg / total       0.95      0.94      0.94       195\n', 'micro_report': (0.93846153846153846, 0.93846153846153846, 0.93846153846153846, None), 'macro_report': (0.9471306471306471, 0.92370855318871736, 0.929914774673070

  .format(len(labels), len(target_names))


In [14]:
print (hyper_params)
print(result_dict['accuracy_score'])
print(result_dict['classification_report'])
print(result_dict['micro_report'])
print(result_dict['macro_report'])

{'C': 5000, 'gamma': 0.001, 'kernel': 'rbf'}
0.938461538462
                                            precision    recall  f1-score   support

                            sprawy cywilne       0.81      0.98      0.88        43
sprawy z zakresu ubezpieczenia społecznego       1.00      0.99      0.99        68
                              sprawy karne       1.00      0.97      0.99        34
                        sprawy gospodarcze       0.93      0.67      0.78        21
             sprawy w zakresie prawa pracy       1.00      0.87      0.93        15
        sprawy w zakresie prawa rodzinnego       1.00      1.00      1.00         6
                      sprawy o wykroczenia       0.89      1.00      0.94         8

                               avg / total       0.95      0.94      0.94       195

(0.93846153846153846, 0.93846153846153846, 0.93846153846153846, None)
(0.9471306471306471, 0.92370855318871736, 0.92991477467307082, None)


In [17]:
import pickle
with open('judgments.pickle', 'rb') as handle:
    judgments = pickle.load(handle)
with open('signatures.pickle', 'rb') as handle:
    signatures = pickle.load(handle)
    
judge_processor = JudgmentsProcessor.JudgmentsProcessor(judgments, signatures, 'filteredRank.csv', lists)

In [18]:
X, Y = judge_processor.process_judgments()

In [20]:
judgment_classifier = JudgmentsClassifier.JudgmentsClassifier(X, Y)
result_dict, clf, hyper_params = judgment_classifier.transform_and_train_classifier()

  y = column_or_1d(y, warn=True)
  .format(len(labels), len(target_names))


In [21]:
print (hyper_params)
print(result_dict['accuracy_score'])
print(result_dict['classification_report'])
print(result_dict['micro_report'])
print(result_dict['macro_report'])

{'C': 5000, 'gamma': 0.001, 'kernel': 'rbf'}
0.938775510204
                                            precision    recall  f1-score   support

                            sprawy cywilne       0.80      0.95      0.87        41
sprawy z zakresu ubezpieczenia społecznego       1.00      1.00      1.00        56
                              sprawy karne       1.00      0.97      0.99        35
                        sprawy gospodarcze       0.90      0.72      0.80        25
             sprawy w zakresie prawa pracy       1.00      0.89      0.94        19
        sprawy w zakresie prawa rodzinnego       1.00      1.00      1.00        11
                      sprawy o wykroczenia       1.00      1.00      1.00         9

                               avg / total       0.94      0.94      0.94       196

(0.93877551020408168, 0.93877551020408168, 0.93877551020408168, None)
(0.95655976676384846, 0.93391213224699376, 0.94237405106970318, None)


In [None]:
# TODO: Increase the range of hyper-parameters
# Remove from tagged dataset those where the tagger failed
# Test non-tagged and tagged on the very same dataset (including random_state!)