In [146]:
import pandas as pd
import numpy as np

In [246]:
df = pd.read_excel('criminal_df_v1.xlsx')

# 2700 criminal case judgements in South Africa between 2008 and 2018

In [248]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2723 entries, 0 to 2722
Data columns (total 12 columns):
Applicant         2722 non-null object
Defendant         2720 non-null object
Case No           2723 non-null object
Judges            2723 non-null object
Summary           2723 non-null object
Date Heard        2072 non-null object
Date Judgement    2318 non-null object
Court             2723 non-null object
Case Category     2723 non-null object
Province          2723 non-null object
Related Crimes    1470 non-null object
Charge            2723 non-null object
dtypes: object(12)
memory usage: 255.4+ KB


In [249]:
df.drop_duplicates().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2723 entries, 0 to 2722
Data columns (total 12 columns):
Applicant         2722 non-null object
Defendant         2720 non-null object
Case No           2723 non-null object
Judges            2723 non-null object
Summary           2723 non-null object
Date Heard        2072 non-null object
Date Judgement    2318 non-null object
Court             2723 non-null object
Case Category     2723 non-null object
Province          2723 non-null object
Related Crimes    1470 non-null object
Charge            2723 non-null object
dtypes: object(12)
memory usage: 276.6+ KB


In [149]:
# to replace '\\'96'
# to add bail, compensation amounts

In [150]:
df['Charge'] = df['Charge'].replace('theft', 'robbery')
df['Charge'] = df['Charge'].replace('prescribed sentences', 'prescribed sentence')
df['Charge'] = df['Charge'].replace('prescribed minimum sentences', 'prescribed sentence')
df['Charge'] = df['Charge'].replace('housebreaking with intent to steal and theft', 'housebreaking')

In [151]:
df['Charge'].value_counts()[:10]

evidence               396
rape                   275
arrest                 196
murder                 177
appeal                 171
review                 127
prescribed sentence    118
robbery                110
bail                    91
damages                 56
Name: Charge, dtype: int64

In [152]:
# lack of information

# remit
# dismiss
# inadmissible, common law position restored
# set aside
# appeal granted, appeal upheld

# life sentence (25 years) and amount
# prison sentence and amount
# bail and amount

In [167]:
from textblob import TextBlob
import nltk
lemma = nltk.wordnet.WordNetLemmatizer()

def lemma_words(text):
    
    # returns clean text with infinitive verbs

    blob = TextBlob(text)
    blob_words = list(blob.words)
    
    blob_lst = []

    for word in blob_words:
        
        new_word = lemma.lemmatize(word, 'v')

        blob_lst.append(new_word)
        
    return ' '.join(blob_lst)

In [154]:
def get_sentence_years(blob):

    sentence_lst = []

    # extract ngrams that include keywords like sentence, imprisonment and not old and age

    for lst in blob.ngrams(3):
        if 'years' in lst and ('old' not in lst and 'age' not in lst):
            sentence_lst.append(list(lst))

    concatted = []

    for item in sentence_lst:     
        concatted.append(' '.join(item))

    concatted = ' '.join(concatted)
    sentence_phrase = list(dict.fromkeys(concatted.split()))

    # append ints that are mentioned in sentence_phrase 

    amount = []

    for word in sentence_phrase:
        try:
            amount.append(int(word))
        except ValueError:
            pass
    return amount

In [155]:
def get_verdict(text):
    
    text = text.replace('life sentence', '25 years sentence')
    text = text.replace('sentenced to life', '25 years sentence')
    text = text.replace('life imprisonment', '25 years sentence')
    text = text.replace('life in prison', '25 years sentence')
    
    text = lemma_words(text)
    
    blob = TextBlob(text)
    
    blob_lst = blob.ngrams(4)
    
    try:
        
        amount = get_sentence_years(blob)
        
        for item in blob_lst:

            if 'remit' in item:
                verdict = 'Remitted'

            elif 'inadmissible' in item or ('law' in item and 'position' in item and 'restored' in item):
                verdict = 'Inadmissible'

            elif 'set' in item and 'aside' in item:
                verdict = 'Set Aside'

            elif 'uphold' in item and 'grant' in item:
                verdict = 'Appeal Granted'

            elif 'uphold' in item and 'appeal' in item:
                verdict = 'Appeal Upheld'

            elif 'appeal' in item and 'dismiss' in item:
                verdict = 'Appeal Dismissed'

            elif 'award' in item or 'compensate' in item or 'compensation' in item:
                verdict = 'Compensation'

            elif 'suspend' in item and ('court' in item or 'sentence' in item):
                verdict = 'Suspended'
                
            elif 'sentence' in item or 'imprisonment' in item:
                verdict = 'Sentenced'
                
        return verdict, amount
    
    except UnboundLocalError:
        return np.nan, amount

In [156]:
get_verdict(df['Summary'][50])

('Compensation', [])

In [158]:
text_test

'criminal law. arrest. arrest without a warrant. criminal procedure act 51 of 1977, s 40(1)(b). assault complaint (domestic abuse). no investigation whether of such nature that amounting to sch 1 offence. arrest and detention unlawful. plaintiff later pleading guilty to assault and no malice in arrest. awarded damages of r72 000.'

In [164]:
# return space after result
# return ints after result and space

In [165]:
df['Verdict'] = df['Summary'].apply(get_verdict)

In [183]:
# df.to_csv('case_verdicts.csv')

In [185]:
df['Summary'][38]

'criminal law. murder. sentence. premeditated murder and 3 counts of assault with intent to do grievous bodily harm. deceased shot multiple times by husband on baseless suspicion of adultery. members of community assaulted on various occasions. first offender, 43 years old and 9 children within and outside of marriage. prone to committing violent crimes. substantial and compelling circumstances present, however, on conspectus of all personal circumstances. effective sentence of 23 years imposed.'

In [242]:
# double check

In [244]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2723 entries, 0 to 2722
Data columns (total 13 columns):
Applicant         2722 non-null object
Defendant         2720 non-null object
Case No           2723 non-null object
Judges            2723 non-null object
Summary           2723 non-null object
Date Heard        2072 non-null object
Date Judgement    2318 non-null object
Court             2723 non-null object
Case Category     2723 non-null object
Province          2723 non-null object
Related Crimes    1470 non-null object
Charge            2723 non-null object
Verdict           2723 non-null object
dtypes: object(13)
memory usage: 276.6+ KB


In [245]:
df.drop_duplicates()

TypeError: unhashable type: 'list'