# Second attempt at data-cleaning

In [135]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
import re

> warning, note I changed the name of the file, code wont work if the names do not match

In [136]:
df = pd.read_csv("Preliminary_database.csv")
df.dropna(subset=["Text"],inplace=True)
df.shape

(510, 14)

In [137]:
df.columns.tolist()

['Unnamed: 0',
 'Title',
 'CELEX number',
 'ECLI identifier',
 'Transposed legal act(s)',
 'Type of procedure',
 'National court',
 'Country or organisation from which the request originates',
 'Date of document',
 'Author',
 'Document year',
 'Applicant/Appellant',
 'Defendant/Other parties to the proceedings',
 'Text']

In [138]:
df['date'] = pd.to_datetime(df['Date of document'])
df['year'] = pd.DatetimeIndex(df['date']).year
print(type(df['date'][8]),
      type(df['year'][8]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'> <class 'numpy.int64'>


In [139]:
df.drop(['Date of document','Document year'], axis=1, inplace=True)

## Renaming long names as a quality of life fix

In [140]:
df.rename({'Country or organisation from which the request originates':'origin',
           'Defendant/Other parties to the proceedings':'parties'},
          axis=1,inplace=True)

In [141]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

## Dropping useless columns as a quality of life fix

In [142]:
# this is pure nan, just delete
df['Transposed legal act(s)'].head()
df.drop('Transposed legal act(s)', axis=1, inplace=True)

In [143]:
# This is not useful info. I suggest drop or ignore
df['National court'].value_counts()
df.drop('National court', axis=1, inplace=True)

## This is going to be a nightmare to standarize ->

In [144]:
df['Type of procedure'].value_counts()


Reference for a preliminary ruling                                                                              131
Action for annulment - unfounded                                                                                 62
Action for annulment - successful                                                                                21
Action for annulment - successful, Action for annulment - unfounded                                              11
Action for annulment, Appeal - unfounded                                                                         11
                                                                                                               ... 
Appeal - successful, Action for damages, Staff cases                                                              1
Appeal against penalty, Appeal - unfounded, Action for annulment, Appeal - inadmissible                           1
Action for damages - unfounded, Action for damages - successful, Action 

In [145]:
df['origin'].value_counts()

Third countries                                             64
Germany, Germany                                            54
Italy, Italy                                                38
France, France                                              35
Spain, Spain                                                33
                                                            ..
Luxembourg, Italy, Italy, Luxembourg                         1
Belgium, Italy, Italy, Belgium                               1
United Kingdom, Netherlands, Netherlands, United Kingdom     1
Luxembourg, Third countries, Luxembourg                      1
Luxembourg                                                   1
Name: origin, Length: 74, dtype: int64

------

## Condensing author info -- this part works

In [146]:
# The details are useless, but we can use the general Court and CST vs domestic referrals
df['Author'].value_counts()

General Court                               216
Civil Service Tribunal                       31
Bot, Court of Justice                        21
Court of Justice, Bot                        21
Court of Justice, Kokott                     20
Wathelet, Court of Justice                   16
Sharpston, Court of Justice                  16
Court of Justice, Mengozzi                   16
Cruz Villalón, Court of Justice              15
Court of Justice, Wahl                       13
Court of Justice, Jääskinen                  12
Jääskinen, Court of Justice                  12
Kokott, Court of Justice                     12
Court of Justice, Sharpston                  11
Court of Justice, Wathelet                    8
Mengozzi, Court of Justice                    7
Wahl, Court of Justice                        7
Court of Justice, Cruz Villalón               7
Court of Justice, Szpunar                     6
Court of Justice, Tanchev                     6
Szpunar, Court of Justice               

In [147]:
def author_standarizer(a_string):
    if a_string == "General Court":
        return a_string
    elif a_string == "Civil Service Tribunal":
        return a_string
    else:
        a_string = "domestic referral"
        return a_string
    

In [148]:
df['author2'] = df.Author.apply(author_standarizer)
df.author2.value_counts()

domestic referral         263
General Court             216
Civil Service Tribunal     31
Name: author2, dtype: int64

----
## Approaching the mess on types of procedures

In [149]:
## as we can see, this is comma separated, so the first thing we want to do is to split into a list
df['Type of procedure'].value_counts()


Reference for a preliminary ruling                                                                              131
Action for annulment - unfounded                                                                                 62
Action for annulment - successful                                                                                21
Action for annulment - successful, Action for annulment - unfounded                                              11
Action for annulment, Appeal - unfounded                                                                         11
                                                                                                               ... 
Appeal - successful, Action for damages, Staff cases                                                              1
Appeal against penalty, Appeal - unfounded, Action for annulment, Appeal - inadmissible                           1
Action for damages - unfounded, Action for damages - successful, Action 

In [150]:
# since it is a simple function that we will only use once, a lambda looks nice
df['Type of procedure'] = df['Type of procedure'].apply(lambda x: x.split(","))

In [151]:
df3 = df.explode('Type of procedure')
## now we can see what is going on, it is a horrible mess
df3['Type of procedure'].value_counts()

Reference for a preliminary ruling                  137
Action for annulment - unfounded                     95
 Action for annulment - unfounded                    54
 Appeal - unfounded                                  53
 Action for annulment                                51
                                                   ... 
Action for failure to act - successful                1
Staff cases - decision unnecessary                    1
 Action for failure to act - inadmissible             1
Appeal                                                1
 Application for measures of inquiry - unfounded      1
Name: Type of procedure, Length: 61, dtype: int64

>a strategy here is to focus on the regularities, for example unfounded, succesful etc, here you can follow the logic of the trimming

In [154]:
def type_standarizer1(a_string):
    if "unfounded" in a_string:
        return "unfounded"
    elif "successful" in a_string:
        return "succesful"
    elif "inadmissible" in a_string:
        return "inadmissible"
    else:
        return a_string

In [155]:
df3['outcome1'] = df3['Type of procedure'].apply(type_standarizer1)
df3.outcome1.value_counts()

unfounded                                                     323
succesful                                                     172
Reference for a preliminary ruling                            137
inadmissible                                                   94
 Action for annulment                                          51
Action for annulment                                           40
 Appeal against penalty                                        24
 Action for damages                                            15
 Reference for a preliminary ruling                            10
Preliminary reference - urgent procedure                       10
Appeal against penalty                                          9
 Reference for a preliminary ruling - decision unnecessary      6
 Staff cases                                                    6
Staff cases                                                     6
 Appeal                                                         5
Judgment d

> so lets keep trimming...

In [156]:
def type_standarizer2(a_string):
    if "unfounded" in a_string:
        return "unfounded"
    elif "successful" in a_string:
        return "succesful"
    elif "inadmissible" in a_string:
        return "inadmissible"
    elif "unnecessary" in a_string:
        return "unnecessary"
    elif "Staff cases" in a_string:
        return "staff cases"
    elif "Action for annulment" in a_string:
        return "action for annulment"
    elif "Action for damages" in a_string:
        return "action for damages"
    else:
        return a_string    
    

In [157]:
df3['outcome2'] = df3['Type of procedure'].apply(type_standarizer2)
df3.outcome2.value_counts()

unfounded                                   323
succesful                                   172
Reference for a preliminary ruling          137
inadmissible                                 94
action for annulment                         91
 Appeal against penalty                      24
action for damages                           20
unnecessary                                  15
staff cases                                  12
Preliminary reference - urgent procedure     10
 Reference for a preliminary ruling          10
Appeal against penalty                        9
Judgment delivered after back reference       5
 Appeal                                       5
 Judgment delivered after back reference      2
Appeal                                        1
Arbitration clause                            1
Name: outcome2, dtype: int64

> We might want to drop what we dont need


> we will need an opposite operation to explode to re-collapse outcome2 into a list, looks tricky https://stackoverflow.com/questions/64235312/how-to-implodereverse-of-pandas-explode-based-on-a-column