# STAT186: Project data exploration

In [503]:
import sys
import os
import pandas as pd
import numpy as np

In [632]:
#os.listdir('..')

In [505]:
data = pd.read_csv('../stat186-projdata-2.csv', index_col = 0, low_memory=False)

In [506]:
data.shape

(171409, 341)

In [507]:
data.head(2)

Unnamed: 0,masterid,id_1,coder,coderid,casen_1,juris,first_ct,agency,agency_r,arbitrat,...,brthyr,race,gender,pajid,jpid,agedec,EWI,select,retentn,cj
0,95.010009765625,102,TEB,1130,1.0,2.0,1.0,0.0,88.0,0.0,...,1929.0,1.0,1.0,47.23,1.0,66.0,-1.454818,5.0,5.0,0.0
1,95.01001739501952,103,TEB,1130,2.0,2.0,2.0,0.0,88.0,0.0,...,1930.0,1.0,1.0,26.2,1.0,65.0,-1.454818,5.0,5.0,0.0


In [529]:
data[['decs_day', 'decs_mo', 'decs_yr']] = data[['decs_day', 'decs_mo', 'decs_yr']].fillna(0)

In [538]:
data['date'] = data['decs_day'].astype(int).astype(str) + '/' + data['decs_mo'].astype(int).astype(str) + '/' + data['decs_yr'].astype(int).astype(str)

In [539]:
data['date'].head()

0    1/12/1995
1    1/12/1995
2    29/9/1995
3    15/9/1995
4    17/3/1995
Name: date, dtype: object

In [548]:
data['date'] = pd.to_datetime(data['date'], format = '%d/%m/%Y', errors = 'coerce')

In [554]:
data.drop(['decs_day', 'decs_mo', 'decs_yr'], axis = 1, inplace = True)

In [558]:
judge_vars=['judgecode', 'brthyr','race', 'gender','pajid','jpid', 'agedec', 'EWI', 'select', 'retentn', 'cj']

## Judge data

In [562]:
jdata = data[judge_vars].groupby('judgecode').first()

In [578]:
jdata.agg({'race': 'sum', 'agedec': 'mean', 'gender': 'mean'})

gender      1.193617
agedec     58.530837
race      517.000000
dtype: float64

## State

In [581]:
data['state_2'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,  0])

In [589]:
s = pd.read_csv('../states-codes-capitals.csv', sep = ';')

In [591]:
s.head()

Unnamed: 0,STATE,CODE,CAPITAL
0,Alabama,AL,Montgomery
1,Alaska,AK,Juneau
2,Arizona,AZ,Phoenix
3,Arkansas,AR,Little Rock
4,California,CA,Sacramento


## Weather in SF from 1995 to 1998

In [601]:
w = pd.read_csv('../ca-sf.csv', low_memory = False)

In [602]:
w.shape

(38095, 92)

In [614]:
w = w.groupby(['DATE']).mean().reset_index()

In [616]:
prcp = w[['DATE', 'PRCP']]

In [619]:
sum(prcp['PRCP'] == 0)/len(prcp)

0.49623545516769335

In [621]:
prcp['DATE'] = pd.to_datetime(prcp['DATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [624]:
prcp['PRCP'] = 1*(prcp['PRCP'] > 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## Merge weather for CA

In [625]:
ca = data[data['state_2']==5]

In [628]:
ca = pd.merge(ca, prcp, left_on = 'date', right_on = 'DATE', how = 'inner')

In [631]:
ca.drop('DATE', axis = 1, inplace = True)

## Missing values

In [508]:
data.isnull().sum(axis = 0)

masterid             0
id_1                 0
coder                0
coderid              0
casen_1             21
juris               26
first_ct            24
agency              24
agency_r          2086
arbitrat          1021
dec1_yr             26
dec1_mo             26
dec1_day            26
rev_ct              26
crossapp             5
title_p1             8
title_p2           234
type_p1            346
type_p2            657
o_plain             65
o_defend          6053
state_2              0
docket            4427
docket_n          1039
reporter            16
rep_vol              0
rep_page             7
decs_yr              0
decs_mo             21
decs_day            29
                 ...  
ms_usc          168732
ms_uscty        168732
ms_uscdc        168732
ms_stc          168732
ms_stcty        168732
ms_stcdc        168732
enbanc              47
judgename        16079
judgecode           27
judgevote            0
judgeopinion         0
_jposition        2796
_jadhoc    

# Feature description

masterid<br>
id_1<br>
coder<br>
coderid<br>
casen_1<br>
juris: manner in which state supreme court takes jurisdiction<br>
first_ct: court that heard this case for the first time<br>
agency: Agency Action prior to litigation (binary)<br>
agency_r: If yes to agency, did the SSC reverse the final agency action? (binary)<br>
arbitrat: Did this case involve arbitration or mediation? (binary)<br>
dec1_yr, dec1_mo, dec1_day: What was the year/month/day of the first court decision in this case?<br>
rev_ct: Court being reviewed by the state supreme court<br>
crossapp: Are both Litigants classified as appellants? (binary)
title_p1: Case title first party listed (note: this corresponds to the first line on the template)<br>
title_p2: Case title second party listed<br>
type_p1: Is the first party listed in the title the petitioner or respondent?<br>
type_p2: Is the second party listed in the title the petitioner or respondent?<br>
o_plain: Which party was the original plaintiff (Petitioner / Respondent / Other)<br>
o_defend: Which party was the original defendant (Petitioner / Respondent / Other)<br>
state_2: state in USA (encoded by number from 1 to 50)<br>
docket: docket number (don't know what this is)<br>
docket_n: Number of docket numbers in this opinion.<br>
reporter: Regional Reporter<br>
rep_vol: volume number of reporter<br>
rep_page<br>
decs_yr, decs_mo, decs_day: Date of supreme court decision<br>
classact: Is this case a class action suit? (binary) <br>
amicus: Were amicus curiae briefs filed in the supreme court? (binary)<br>
multi_p: Are there multiple petitioners/appellants in this case? (binary)<br>
multi_r: Are there multiple respondents/appellees in this case? (binary)<br>

## Petitioners and appellants
#### First petitioner/appellant: 
- p1_persn: Natural person first appellant encoded by a number, 46 categories (eg: child, student, wife, public official)
- p1_busi: private business first petitioner encoded by number for type of business (eg: agriculture, bank, etc)<br>
- p1_org: Private organization or association first appellant encoded by a number for type of organization (eg: Business or trade association, Charitable or philanthropic organization, etc)
- p1_lgov: Local or county government first appellant encoded by a number for type of government org (eg: Legislative, School board, board of trustees for college or junior college)
- p1_sgov: State government first appellant encoded by number for type (eg: Executive/Administrative, Police)
- p1_edu: Educational institution categories first appellant encoded by number (eg: Private, Non-Religious - Pre-K)
- p1_other: Other first appellant (binary)
- p1_otext: Description of other litigant type

#### Second petitioner/appellant:
- p2_persn, p2_busi, p2_org, p2_lgov, p2_sgov, p2_edu, p2_other, p2_otext: idem

#### First respondent
- r1_persn, r1_busi, r1_org, r1_lgov, r1_sgov, r1_edu, r1_other, r1_otext: idem

#### Second respondent
- r2_persn, r2_busi, r2_org, r2_lgov, r2_sgov, r2_edu, r2_other, r2_otext: idem

## Public Defender

pubdef: For criminal cases only, was the criminal defendant represented before the supreme court by a public defender or a state appointed counsel? (binary)

## Case Classification
genissue: general issue classification (Criminal Cases, Civil Government, Civil Private, Juvenile, Non-adversarial cases)
### Criminal Cases
##### Issue Confidence Scale
ca_conf: On a scale of one to ten, how confident are you with the classification of this case as a criminal appeal (where ten indicates most confident and one indicates least confident)?<br>
### Binary variables for crime types
I did not explain the variable when the explanation is just the name of the variable.

assaulta: Assault - aggravated<br>
assaulto: Assault - other<br>
arson: <br>
burglary<br>
disorder: disorderly conduct<br>
dui: Driving under the influence<br>
drugabus: drug abuse violations (including possession)<br>
drugsell: Drug trafficking<br>
drunk: drunkenness<br>
embezzle: embezzlement<br>
forge: Forgery and counterfeiting<br>
fraud: <br>
gamble<br>
kidnap<br>
theft: Larceny/Theft<br>
liquor: liquor laws<br>
mans_neg: Manslaughter - negligent<br>
mans_non: Manslaughter - non-negligent<br>
autothft: Motor vehicle theft<br>
murder<br>
fam_kids: Offenses against family and children<br>
prostitu: Prostitution and commercialized vice<br>
rape: Rape/Sexual assault<br>
robbery<br>
sex_gen: Sex offenses (except rape and prostitution)<br>
stolen: Stolen property - buying, receiving, possessing<br>
traffic: Traffic offenses (other than DUI)<br>
vagrancy<br>
vandal: Vandalism<br>
weapons: Weapons - carrying, possessing, concealing<br>
con_oth: Other conviction reviewed?<br>
ca_tot_c: Total number of convictions reviewed?<br>

##### Identification confidence scale
con_conf: On a scale of one to ten, how confident are you with your identification of the convictions reviewed in this case (where ten indicates most confident and one indicates least confident)?<br>

#### Identification of Death Penalty Cases
death_c: Was the death penalty considered but not imposed? (binary) <br>
death_im: Was the death penalty imposed by the trial court? (binary) <br>

#### Pre- and Post-Conviction Criminal Cases
bail: Bail (binary) <br>
parole: Probation and parole (binary)<br>

#### Legal Issues in Criminal Cases
Binary variables for legal issues in criminal cases.

ca_atty: councel issues (binary)<br>
ca_cruel: Cruel and unusual punishment (binary)<br>
ca_disc: Discovery<br>
ca_doubj: Double jeopardy<br>
ca_entra: Entrapment<br>
ca_ev_m: Evidence - material<br>
ca_ev_w: Evidence - expert or witness testimony<br>
ca_gjury: Grand jury indictment<br>
ca_insan: Insanity defense or arguments of mental incompetence<br>
ca_jr_in: Jury instruction<br>
ca_jr_sl: Jury selection<br>
ca_plea: Plea bargaining or guilty plea<br>
ca_prej: Prejudicial conduct by prosecutor<br>
ca_race: Racial discrimination against defendant<br>
ca_recus: Recusal by judge or other official issues<br>
ca_serch: Search and seizure<br>
ca_self: Self-incrimination<br>
ca_sent: Sentencing (including proportionality)<br>
ca_sevr: Severance of defendants for trial<br>
ca_speed: Speedy trial<br>
ca_suff: Sufficiency of evidence<br>
ca_trial: Trial by jury<br>
ca_venue: venue<br>
ca_othis: ???<br>
ca_tot_i: Total number of legal issues addressed by the Supreme Court (not binary)<br>
##### Issue Confidence Scale
ca_lconf: On a scale of one to ten, how confident are you with your classification of the legal issues raised in this case (where ten indicates most confident and one indicates least confident)?<br>

### Civil private cases
##### Confidence Scale
cp_conf: scale conf for classification<br>

### Binary variables for civil private cases types
##### Domestic
gen_dom: Domestic relations<br>
adopt: adoption<br>
civilcom: civil commitment<br>
divorce: separation and divorce<br>
childsup: child support an custody<br>
domvioln: domestic violence - restraining orders<br>
invlster: involuntary sterilization<br>
paternty: Paternity<br>
dom_oth: other domestic relations issues<br>
##### Estates
gen_est: Estates<br>
will: probate, wills, intestate estates<br>
trustee: guardianship, conservatorship, trusteeship<br>
est_oth: other estate issues<br>
##### Contracts
gen_cont: contracts general category<br>
cp_enfor: enforcement (breech, specific performance)<br>
relation: debtor-creditor relations<br>
insure: Insurance<br>
tenant: tenant-landlord<br>
ownershp: Ownership of real property<br>
cont_oth: other contract issues<br>
##### Torts
gen_ptor: torts general category - civil private<br>
malprc_m: medical malpractice<br>
malprc_p: Professional malpractice<br>
automobl: automobiles<br>
products: product liability<br>
toxic: toxic substance<br>
cp_prem: Premises liability<br>
cp_libel: libel/slander/defamation<br>
cp_workr: Employee injury and workers’ compensation<br>
cp_disc: Employment discrimination<br>
cp_oth_l: other labor disputes<br>
ptor_oth: other torts<br>
Description of other<br>
cp_i_cnf: On a scale of one to ten, how confident are you with your identification of the issues reviewed in this case (where ten indicates most confident and one indicates least confident)?<br>

### Civil government cases
cg_conf: On a scale of one to ten, how confident are you with the classification of this case as civil government (where ten indicates most confident and one indicates least confident)?<br>
##### Elections
gen_elec: Elections<br>
redist: Apportionment and redistricting<br>
elect: Contested elections<br>
ballot: Ballot access<br>
campaign: campaign spending<br>
elec_oth: other election issues<br>

##### First amendment
gen_frst: first amendment issues <br>
aid2paro: aid to parochial schools <br>
comspch: commercial speech <br>
religion: free exercise of religion <br>
cg_libel: libel/slander/defamation <br>
oath: loyalty oath <br>
obscene: obscenity <br>
protest: protest/marches/picketing <br>
frst_oth: other first amendment issues<br>

##### Government regulation
gen_greg: government regulation <br>
consumer: consumer protection <br>
emdomain: eminent domain <br>
environm: environmental protection <br>
welfare: government benefits/welfare/Medicaid<br>
license: licensing and permits <br>
taxes: taxation <br>
transpt: transportation <br>
utility: utilities regulation <br>
zoning: zoning and planning <br>
greg_oth: other governmental regulation<br>

##### Law practice
gen_law: practice of law <br>
baradmit: bar admission <br>
disp_att: disciplinary proceedings against attorneys<br>
disp_jud: disciplinary proceedings against judge <br>
rules: promulgation of rules of practice <br>
law_oth: other practice of law issues<br>

##### Public contracts
gen_pubc: public contracts <br>
affirmac: affirmative action/minority set asides<br>
cg_enfor: contract enforcement (breech, specific performance)<br>
cg_disc: employment discrimination<br>
pubc_oth: other public contract issues<br>

##### Privacy issues
gen_priv: privacy issues <br>
abortion: abortion <br>
foia: access to information <br>
gayright: homosexual rights <br>
drugtest: mandatory drug testing <br>
steriliz:  mandatory sterilization <br>
dieright: right to die <br>
priv_oth: other privacy issues <br>

##### Torts
gen_gtor: torts <br>
cg_workr: employee injury and workers’ compensation<br>
cg_prem: premises liability <br>
gtor_oth: other torts <br>

cg_i_cnf: On a scale of one to ten, how confident are you with your identification of the issues reviewed in this case (where ten indicates most confident and one indicates least confident)?<br>

### Legal Issues in Civil Cases

##### Abuse of discretion by trial judge
cv_abuse: abuse of discretion by trial judge<br>
cv_arbit: arbitrary or capricious standard<br>
cv_erro: clearly erroneous standard	<br>
cv_atty: counsel (including attorney’s fees)	<br>
cv_disc: discovery <br>
cv_dscm: discrimination	<br>
cv_ev_m: evidence - material 	<br>
cv_ev_w: evidence - expert or witness testimony<br>
cv_claim: failure to state a claim <br>
cv_immun: government immunity <br>
cv_jr_in: jury instructions <br>
cv_jr_sl: jury selection <br>
cv_moot: mootness	<br>
cv_ripe: ripeness/failure to exhaust administrative remedies	<br>
cv_recus: recusal by judge or other official<br>
cv_stand: standing to sue <br>
cv_suff: sufficiency of evidence <br>
cv_sumju: summary judgement <br>
cv_injun: validity of injunction <br>
cv_venue: venue <br>
cv_othis: other legal issues <br>
cv_ot_i: total number of legal issues addressed by the supreme court	<br>

cv_lconf: on a scale of one to ten, how confident are you with your classification of the legal issues raised in this case (where ten indicates most confident and one indicates least confident)?<br>

### Decisions in Criminal Cases

##### Features of the Opinion
ca_majfm: form of the court majority<br>
ca_opnfm: form of the court’s opinion <br>
ca_disp: supreme Court’s disposition of the lower court	<br>
ca_winp: winning party in the supreme court	<br>
ca_conv: did the supreme court overturn any convictions in this case?	<br>
ca_sento: did the supreme court overturn a sentence in this case?<br>
ca_capo: did the supreme court overturn a death sentence in this case?	<br>

##### Rulings on Constitutionality
ca_usc: is there a constitutional challenge to a law passed by another branch of government under the U.S. Constitution? <br>
ca_uscty: if yes, what type of law was challenged?<br>
ca_uscdc: if yes, was the law declared unconstitutional?<br>

##### Constitutionality: State Constitution
ca_stc: is there a constitutional challenge to a law passed by another branch of government under the state constitution?	<br>
ca_stcty: if yes, what type of law was challenged?<br>
ca_stdc: if yes, was the law declared unconstitutional?	<br>


### Decisions in Civil Private Cases

##### Form of the court’s opinion
cp_majfm: form of the court majority <br>
cp_opnfm: form of the court’s opinion <br>
cp_disp: supreme court’s disposition of lower court<br>
cp_winp: winning party in the supreme court <br>
cp_verdo: did the supreme court reduce liability or overturn a verdict?<br>
cp_awrdo: did the supreme court reduce or overturn a monetary award?<br>
cp_injo: did the supreme court overturn an injunction or other order?	<br>

##### Rulings on US Constitutionality
cp_usc: is there a constitutional challenge to a law passed by another branch of government under the US Constitution?	<br>
cp_uscty: if yes, what type of law was challenged?<br>
cp_uscdc: if yes, was the law declared unconstitutional? <br>

##### Rulings on State Constitutionality
cp_stc: is there a constitutional challenge to a law passed by another branch of government under the state <br>
cp_stcty: if yes, what type of law was challenged?<br>	
cp_stcdc: if yes, was the law declared unconstitutional?<br>

### Decisions in Civil Government Cases

##### Form of the court majority
cg_majfm: form of the court majority<br>
cg_opnfm: form of the court’s opinion<br>
cg_disp: supreme court’s disposition of lower court	<br>
cg_winp: winning party in the supreme court	<br>
cg_verdo: did the supreme court reduce liability or overturn a verdict?<br>
cg_awrdo: did the supreme court reduce or overturn a monetary award?<br>
cg_injo: did the supreme court overturn an injunction or other order?<br>

##### Rulings on US Constitutionality
cg_usc: is there a constitutional challenge to a law passed by another branch of government under the U.S. Constitution? <br>
cg_uscty: if yes, what type of law was challenged? <br>
cg_uscdc: if yes, was the law declared unconstitutional?<br>

##### Rulings on State Constitutionality
cg_stc: is there a constitutional challenge to a law passed by another branch of government under the state constitution?<br>
cg_stcty: if yes, what type of law was challenged?<br>
cg_stcdc: if yes, was the law declared unconstitutional?<br>

### Juvenile Cases

jv_conf: on a scale of one to ten, how confident are you with the classification of this case as a juvenile case (where ten indicates most confident and one indicates least confident)?	<br>

##### Characteristics of cases
gen_del: delinquency <br>
del_viol: delinquency - violent offenses<br>
del_prop: delinquency - property offenses<br>
del_drug: delinquency - drug offenses<br>
del_ord: delinquency - publicorder offenses	<br>
del_oth: delinquency - other <br>
truant: truancy <br>
curfew: curfew <br>
runaway: runaway <br>
gangrel: were any of the juvenile’s activities gang related? <br>

#####  Decisions in Juvenile Cases
jv_majfm: form of the court majority<br>
jv_opnfm: form of the court’s opinion<br>
jv_disp: supreme court’s disposition of the lower court <br>
jv_winp: winning party in the supreme court <br>
jv_delo: did the supreme court overturn a finding of delinquency? <br>
jv_probo: did the supreme court overturn a probation/confinement sentence? <br>

##### Rulings on US Constitutionality
jv_usc: is there a constitutional challenge to a law passed by another branch of government under the US Constitution? <br>
jv_uscty: if yes, what type of law was challenged? <br>
jv_uscdc: if yes, was the law declared unconstitutional?<br>

##### Rulings on State Constitutionality
jv_stc: is there a constitutional challenge to a law passed by another branch of government under the state constitution? <br>
jv_stcty: if yes, what type of law was challenged? <br>
jv_stcdc: if yes, was the law declared unconstitutional? <br>

### Miscellaneous Cases

##### Non-Adversarial Cases
ms_conf: on a scale of one to ten, how confident are you with the classification of this case as nonadversarial (where ten indicates most confident and one indicates least confident)?<br>

##### Types of Non-Adversarial Cases
certific: certification <br>
cert_ct: if certification, which court is certifying this case<br>
advisory: advisory opinion <br>
non_bar: non-adversarial matters regarding the bar <br>
non_oth: other non-adversarial cases <br>

##### Decisions in Non-Adversarial Cases
ms_majfm: form of the court majority <br>
ms_opnfm: form of the court’s opinion <br>

##### Rulings on US Constitutionality
ms_usc: is there a constitutional challenge to a law passed by another branch of government under the US Constitution?<br>
ms_uscty: if yes, what type of law was challenged? <br>
ms_uscdc: if yes, was the law declared unconstitutional? <br>

##### Rulings on State Constitutionality
ms_stc: is there a constitutional challenge to a law passed by another branch of government under the state constitution? <br>
ms_stcty: if yes, what type of law was challenged? <br>
ms_stcdc: if yes, was the law declared unconstitutional? <br>

### Sitting justice involvement and behavior
This section contains information for justices 1-9. justice 1 is used as an example below: <br>

enbanc: did the court sit en banc? <br>
j1_name: justice name - 1 <br>
j1_vote: justice 1 vote <br>
j1_opin: justice 1 opinion behavior <br>

### Substitute / ad hoc justice involvement and behavior
This section contains information regarding substitute / ad hoc justices 1-3. Substitute / ad hoc justice 1 is used as an example below: <br>

s1_name: ad hoc justice 1 vote  <br>
s1_text: name of ad hoc justice 1 that does not appear on list  <br>
s1_type: is ad hoc justice 1 retired, a substitute, or other? <br>
s1_vote: ad hoc justice 1 vote  <br>
s1_opin: ad hoc justice 1 opinion behavior   <br>

### To do

opassign: <br>
disordr: <br>
votordr: <br>
belitid: <br>
bmassid: <br>
brthyr: year of birth of the justice<br>
race: race of justice<br>
gender: gender of justice - 1: male, 0: female<br>
pajid: <br>
jpid: <br>
agedec: age at the time of the decision<br>
EWI: <br>
select: <br>
retentn: <br>
cj: <br>

## Criminal cases

This should only select criminal cases:

In [511]:
data['id_1'].nunique()

322

In [510]:
data['_jadhoc'].unique()

array([0., 1.])

In [509]:
data['_jposition'].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  9., nan])

In [512]:
# code of judge 1
data['judgecode'].nunique()

472

In [239]:
dna = data[data['genissue']==1]

In [212]:
dna = dna.replace(88, np.nan)

In [213]:
dna.shape

(55601, 341)

In [214]:
dna = dna.dropna(how = 'all', axis = 1)

In [215]:
dna.shape

(55601, 311)

In [65]:
#[c for c in data.columns if c not in dna.columns]

In [66]:
dna[dna['cp_conf']==10]['ca_conf']

23156     10.0
50888     10.0
78705     10.0
105922    10.0
131773    10.0
149299    10.0
163430    10.0
Name: ca_conf, dtype: float64

In [82]:
dna[dna['elect']==1]['arson']

13013     0.0
40857     0.0
68638     0.0
96058     0.0
122406    0.0
143580    0.0
158281    0.0
166802    0.0
Name: arson, dtype: float64

In [240]:
cat = pd.read_csv('../crime_cat.csv', header=None, names = ['cat'])

In [241]:
dna[[c for c in cat['cat']]].sum(axis = 0)

assaulta     3201.0
assaulto     2155.0
arson         620.0
burglary     3476.0
disorder      381.0
dui          2700.0
drugabus     4682.0
drugsell     2567.0
drunk          37.0
embezzle       56.0
forge         275.0
fraud         378.0
gamble         14.0
kidnap       2212.0
theft        2433.0
liquor         66.0
mans_neg      679.0
mans_non      563.0
autothft      312.0
murder      20097.0
fam_kids     2258.0
prostitu       60.0
rape         5012.0
robbery      6065.0
sex_gen      1636.0
stolen        437.0
traffic       643.0
vagrancy       12.0
vandal         44.0
weapons      4337.0
con_oth      8239.0
dtype: float64

In [242]:
idx = dna[[c for c in cat['cat']]].sum(axis = 1) == 0 # cases with no category....
dna[idx].shape

(7651, 341)

In [245]:
dna[dna['ca_tot_c']==0].shape

(9007, 341)

In [286]:
data['decs_yr'].unique()

array([1995., 1996., 1997., 1998.,   nan])

In [246]:
zero_con = dna[dna['ca_tot_c']==0]

In [253]:
zero_con[['decs_day', 'decs_mo', 'decs_yr', 'dec1_day']]

Unnamed: 0,decs_day,decs_mo,decs_yr,dec1_day
184,22.0,9.0,1995.0,88.0
189,28.0,7.0,1995.0,88.0
334,7.0,2.0,1995.0,99.0
353,20.0,4.0,1995.0,99.0
402,2.0,10.0,1995.0,99.0
415,27.0,2.0,1995.0,99.0
421,27.0,2.0,1995.0,99.0
492,6.0,3.0,1995.0,6.0
495,27.0,3.0,1995.0,99.0
496,18.0,9.0,1995.0,99.0


In [224]:
dna[idx].groupby(['ca_tot_c']).count()

Unnamed: 0_level_0,masterid,id_1,coder,coderid,casen_1,juris,first_ct,agency,agency_r,arbitrat,...,ms_stc,enbanc,judgename,judgecode,judgevote,judgeopinion,_jposition,_jadhoc,adhoctype,adhoctext
ca_tot_c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,7578,7578,7578,7578,7515,7570,7573,7485,216,7540,...,0,7578,6896,7471,7564,7578,7490,7578,88,88
1.0,60,60,60,60,60,60,60,53,0,60,...,0,60,54,59,60,60,60,60,0,0


In [144]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 1
dna[idx].shape

(29924, 311)

In [145]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 2
dna[idx].shape

(11145, 311)

In [146]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 3
dna[idx].shape

(4426, 311)

In [147]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 4
dna[idx].shape

(1567, 311)

In [148]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 6
dna[idx].shape

(85, 311)

In [153]:
idx = dna[[c for c in cat['cat'] if c in dna.columns]].sum(axis = 1) == 7
dna[idx].shape

(37, 311)

In [156]:
dna[idx]['ca_tot_c']

1760       6.0
1863       7.0
7795      14.0
18350      8.0
19160      7.0
19275     11.0
29544      6.0
29650      7.0
35669     14.0
46095      8.0
46902      7.0
47017     11.0
57277      6.0
57383      7.0
63429     14.0
73910      8.0
74719      7.0
74834     11.0
85051      6.0
85149      7.0
90987     14.0
101259     8.0
102053     7.0
102168    11.0
112086     6.0
112187     7.0
117606    14.0
128177     7.0
128290    11.0
137161     6.0
137266     7.0
140698    14.0
147277     7.0
147359    11.0
152974     6.0
153078     7.0
155934    14.0
Name: ca_tot_c, dtype: float64

In [None]:
[1., 0., 2., 5., 3., 4., 6.]

In [203]:
for c in data.columns:
    if 'judge' in c:
        print(c)
#data[['judgename','judgecode']]

judgename
judgecode
judgevote
judgeopinion


In [206]:
data['judgeopinion'].dropna().shape

(171346,)

In [207]:
data['judgename'].dropna().shape

(155275,)

In [208]:
data['judgevote'].dropna().shape

(171346,)

In [209]:
data['judgecode'].dropna().shape

(171319,)

In [210]:
data.shape

(342749, 341)

In [228]:
data['id_1'].nunique()

320

In [501]:
data[['parole', 'masterid']].groupby(['parole']).nunique()

Unnamed: 0_level_0,parole,masterid
parole,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1,27824
1.0,1,282
88.0,1,32


In [328]:
judge_data = data[171346:342749]

In [331]:
judge_data.dropna(how = 'all', axis = 1).drop_duplicates().head()

Unnamed: 0,opassign,disordr,votordr,belitid,bmassid,brthyr,race,gender,pajid,jpid,agedec,EWI,select,retentn,cj
171346,6.0,3.0,10.0,35.325,33.43,1929.0,1.0,1.0,47.23,1.0,66.0,-1.454818,5.0,5.0,0.0
171347,6.0,3.0,10.0,35.325,33.43,1930.0,1.0,1.0,26.2,1.0,65.0,-1.454818,5.0,5.0,0.0
171348,6.0,3.0,10.0,35.325,33.43,1937.0,1.0,1.0,30.69,1.0,58.0,-1.454818,5.0,5.0,0.0
171349,6.0,3.0,10.0,35.325,33.43,1936.0,1.0,1.0,45.16,1.0,59.0,-1.454818,5.0,5.0,1.0
171353,6.0,3.0,10.0,35.325,33.43,1952.0,1.0,1.0,45.16,1.0,43.0,-1.454818,5.0,5.0,0.0


In [384]:
judge_data = judge_data.dropna(how = 'all', axis = 1).reset_index(drop=True)

In [329]:
case_data = data[0:171346]

In [385]:
case_data = case_data.dropna(how = 'all', axis = 1).reset_index(drop=True) #.drop_duplicates().head()

In [428]:
judge_data.shape, case_data.shape

((171403, 15), (171346, 326))

In [430]:
# case_data.iloc[-50:-25]['judgename']

In [426]:
judge_data['opassign'].unique()

array([ 6.,  9., 10.,  3.,  7.,  8.,  2.,  4.,  5.,  1.])

In [427]:
judge_data.shape

(171403, 15)

In [433]:
case_data['id_1'].isnull().sum()

0

In [476]:
case_data[case_data['_jposition']==6].shape

(16016, 326)

Cases in Alabama.

In [488]:
alabama = case_data[case_data['state_2'] == 1.0]

In [489]:
alabama = alabama[alabama['_jposition']==1]

In [490]:
alabama.shape

(797, 326)

In [492]:
alabama = alabama[alabama['genissue']==1]

In [494]:
alabama.shape

(98, 326)

In [499]:
alabama[alabama['parole']==1]

Unnamed: 0,masterid,id_1,coder,coderid,casen_1,juris,first_ct,agency,agency_r,arbitrat,...,ms_stcdc,enbanc,judgename,judgecode,judgevote,judgeopinion,_jposition,_jadhoc,adhoctype,adhoctext
4,95.010048,106.0,TEB,1130.0,5.0,2.0,2.0,0.0,88.0,0.0,...,,0.0,109.0,109.0,1.0,1.0,1.0,0.0,,
189,95.011902,291.0,TEB,1130.0,154.0,2.0,2.0,0.0,88.0,0.0,...,,0.0,106.0,106.0,1.0,1.0,1.0,0.0,,


In [470]:
case_data['masterid'].nunique()

28142

In [502]:
judge_data.shape

(171403, 15)

In [487]:
judge_data.isnull().sum(axis = 1)[judge_data.isnull().sum(axis = 1) > 8]

43409     9
124824    9
124831    9
124836    9
124842    9
168677    9
168678    9
168679    9
168680    9
168681    9
168682    9
168683    9
168684    9
168685    9
168686    9
168687    9
168688    9
168689    9
168690    9
168691    9
168692    9
168693    9
168694    9
168695    9
168697    9
168698    9
168699    9
168700    9
169810    9
169811    9
         ..
169813    9
169814    9
169815    9
169816    9
169817    9
169818    9
169819    9
169820    9
169821    9
169822    9
169823    9
169824    9
169825    9
170291    9
170292    9
170293    9
170294    9
170295    9
170708    9
170709    9
170710    9
170711    9
171025    9
171026    9
171027    9
171028    9
171029    9
171031    9
171187    9
171362    9
Length: 61, dtype: int64