# Clean Attributes

In [1]:
#Imports
import numpy as np
import pandas as pd

import ast
import json

import functions 
# Helper object from the project-local `functions` module; later cells call
# its getUniqueArticles and articleVector methods.
clean = functions.clean()

In [2]:
# Load the pre-cleaned mapping of case id -> list of section headings.
path = "/Users/conorosully/Documents/Legal-Case-Prediction/data/preclean/id_form.json"
with open(path, 'r') as id_form_file:
    idForm = json.loads(id_form_file.read())

# Peek at the headings stored for one case
idForm["HUDOC-ECHR-1999-001-58225"]

['<h3>procedure</h3>',
 '<h3>the facts</h3>',
 '<p>i.the circumstances of the case</p>',
 '<p>ii.relevant domestic law and practice</p>',
 '<h3>the law</h3>',
 '<h3>for these reasons the court</h3>']

In [5]:
# Every case id that has an entry in the heading mapping
case_id = list(idForm)
case_id[0]

'HUDOC-ECHR-1999-001-58225'

In [6]:
# Read the '#'-delimited case attribute table.
# `sep` is passed by keyword: since pandas 2.0 every read_csv argument after
# the path is keyword-only, so the positional form raises a TypeError.
att = pd.read_csv(
    "/Users/conorosully/Documents/Legal-Case-Prediction/data/preclean/case_attributes.csv",
    sep='#',
)
# Keep only the cases that also have an entry in id_form.json
att = att[att['id'].isin(case_id)]

print(len(att))
print(att.columns)
att.head(1)


8703
Index(['id', 'type', 'url', 'doc-name', 'doc-id-label', 'doc-date', 'vl-type',
       'created-at', 'updated-at', 'app-no-parts', 'app-nos', 'articles',
       'conclusions', 'courts', 'ecli', 'judgement-date', 'judges',
       'last-modified-time', 'nonviolations', 'organisations', 'respondents',
       'separate-opinion', 'violations'],
      dtype='object')


Unnamed: 0,id,type,url,doc-name,doc-id-label,doc-date,vl-type,created-at,updated-at,app-no-parts,...,courts,ecli,judgement-date,judges,last-modified-time,nonviolations,organisations,respondents,separate-opinion,violations
10769,HUDOC-ECHR-1999-001-58225,eu-chrs,http://hudoc.echr.coe.int/app/conversion/docx/...,musial <br>v.<br> poland,24557/94,1999-03-25,eu_chr,2017-09-18T16:11:04.234Z,2018-10-26T22:51:32.107Z,"['24557', '94']",...,['Supreme Court'],ECLI:CE:ECHR:1999:0325JUD002455794,1999-03-25,"['Luzius Wildhaber', 'Nicolas Bratza']",2018-10-17,,['ECHR'],['POL'],True,"['5', '5-4']"


In [7]:
# Collect the distinct article codes across all cases.
# Each 'articles' cell holds a stringified Python list, so it is parsed with
# ast.literal_eval before being flattened into one long list.
articles = []
for raw_list in att['articles']:
    articles.extend(ast.literal_eval(raw_list))

# Series.unique() keeps codes in order of first appearance
unique_articles = list(pd.Series(articles).unique())
unique_articles

['5',
 '5-4',
 '41',
 '6',
 '6-1',
 '6-3-a',
 '6-3-b',
 '6-1+6-3-a',
 '6-1+6-3-b',
 'P1-1',
 'P1-1-1',
 '13',
 '8',
 '35',
 '35-1',
 '5-3',
 '1',
 '2',
 '2-1',
 '2-2',
 '10',
 '10-1',
 '10-2',
 '11',
 '11-1',
 '11-2',
 '14+10',
 '14',
 '14+11',
 '37',
 '37-1',
 '37-1-b',
 '39',
 '9',
 '7',
 '3',
 '6-2',
 '7-1',
 '5-1',
 '34',
 '38',
 '18',
 '14+2',
 'P1-1-2',
 '6-3-d',
 '35-3',
 '8-1',
 '8-2',
 '14+8',
 '9-1',
 '9-2',
 '5-1-a',
 '19',
 '6-3-c',
 '6-3',
 '6-1+6-3-c',
 '14+P1-3',
 'P1-3',
 '56',
 '56-1',
 '56-3',
 'P1-4',
 '5-5',
 '14+P1-1',
 '35-3-a',
 '14+5-3',
 '37-1-c',
 '5-1-e',
 'P4-2',
 'P4-2-1',
 'P4-2-3',
 '37-1-a',
 '14+9',
 '5-2',
 '5-1-c',
 '57',
 '14+6',
 '38-1-a',
 '6+6-3-a',
 '6+6-3-b',
 '8+P1-1',
 '36',
 '36-1',
 '36-2',
 'P1-2',
 '6+6-3-d',
 '5-1-f',
 '14+7',
 '30',
 'P4-3',
 'P7-4',
 '43',
 '6-3-c+6-1',
 '6-1+6-3-d',
 'P4-4',
 '13+3',
 '13+P4-4',
 '13+6-1',
 '14+6-1',
 '5-1-d',
 '14+5-1',
 '29',
 '29-3',
 '12',
 'P7-2',
 '6+6-3-e',
 '6-3-e',
 '35-4',
 '5-1-b',
 'P4-2-2'

In [8]:
# Inspect one case: show its raw article fields, then the unique article
# sets that clean.getUniqueArticles derives from each of them.
n = 13
case_row = att.iloc[n]
articles = case_row['articles']
violations = case_row['violations']
nonviolations = case_row['nonviolations']

for raw_field in (articles, violations, nonviolations):
    print(raw_field)

for raw_field in (articles, violations, nonviolations):
    print(clean.getUniqueArticles(raw_field))

['1', '2', '2-1', '13', '34', '35', '35-1', '38', '41', '14+2', '14']
['2', '13']
['2', '14+2', '14']
{'2', '35', '1', '38', '41', '14', '34', '13'}
{'2', '13'}
{'2', '14'}


In [9]:
# Article codes used as the target columns (order defines the column order).
LIST = [
    # numbered articles
    '2', '3', '5', '6', '7', '8', '9', '10', '11', '13', '14',
    '18', '19', '34', '35', '37', '41', '46',
    # P-prefixed codes (presumably Convention protocols -- TODO confirm)
    'P1', 'P4', 'P12', 'P7',
]

In [10]:
# Build the target table: one row per case, holding its id, its date and the
# per-article vector returned by clean.articleVector (one entry per LIST code).
columns = ['id', 'date'] + LIST

# Iterate the needed columns in lockstep and collect all rows first, then
# construct the frame once. Growing the frame with `target.loc[n] = row` and
# calling `att.iloc[n]` five times per case repeats work on every iteration.
case_fields = zip(
    att['id'],
    att['doc-date'],
    att['articles'],
    att['violations'],
    att['nonviolations'],
)
rows = [
    [ID, date] + clean.articleVector(articles, violations, nonviolations)
    for ID, date, articles, violations, nonviolations in case_fields
]
target = pd.DataFrame(rows, columns=columns)

target.head()

Unnamed: 0,id,date,2,3,5,6,7,8,9,10,...,19,34,35,37,41,46,P1,P4,P12,P7
0,HUDOC-ECHR-1999-001-58225,1999-03-25,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
1,HUDOC-ECHR-1999-001-58226,1999-03-25,-1,-1,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
2,HUDOC-ECHR-1999-001-58227,1999-03-25,-1,-1,-1,2,-1,2,-1,-1,...,-1,-1,2,-1,2,-1,2,-1,-1,-1
3,HUDOC-ECHR-1999-001-58239,1999-04-29,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1
4,HUDOC-ECHR-1999-001-58251,1999-05-20,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1


In [11]:
# Write the target table to disk without the positional index column
target_path = '/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/target.csv'
target.to_csv(target_path, index=False)


In [12]:
# Sanity check: read the saved file back, then confirm row count and layout
target_path = '/Users/conorosully/Documents/Legal-Case-Prediction/data/clean/target.csv'
target = pd.read_csv(target_path)
print(len(target))
target.head()

8703


Unnamed: 0,id,date,2,3,5,6,7,8,9,10,...,19,34,35,37,41,46,P1,P4,P12,P7
0,HUDOC-ECHR-1999-001-58225,1999-03-25,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
1,HUDOC-ECHR-1999-001-58226,1999-03-25,-1,-1,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,2,-1,-1,-1,-1,-1
2,HUDOC-ECHR-1999-001-58227,1999-03-25,-1,-1,-1,2,-1,2,-1,-1,...,-1,-1,2,-1,2,-1,2,-1,-1,-1
3,HUDOC-ECHR-1999-001-58239,1999-04-29,-1,-1,0,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1
4,HUDOC-ECHR-1999-001-58251,1999-05-20,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,2,-1,2,-1,-1,-1,-1,-1


In [1]:
# Number of cases whose '6' column equals 0 (requires `target` from the cells above)
target['6'].isin([0]).sum()

NameError: name 'target' is not defined