In [12]:
# Importing required libraries
import pandas as pd
import os
import glob

In [9]:
# Resolve the data directory relative to the kernel's working directory.
# The data folder is expected two levels above the directory the notebook
# is run from, so walk up twice from os.getcwd().
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)

# Define the path to the data files.
# Path components are passed separately so os.path.join inserts the
# platform's own separator instead of relying on a hard-coded "/".
data_dir = os.path.join(grandparent_dir, 'data', 'educations')
print(data_dir)

/Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/educations


In [8]:
# Function to load and concatenate the csv files in our data directory defined above
def load_data(data_dir):
    '''
    Load every csv file directly inside ``data_dir`` into a single DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible (``glob`` returns matches in arbitrary order).
    A file that fails to load is reported on stdout and skipped instead of
    aborting the whole load.

    This function is not memory efficient (all files are held in memory before
    concatenation), but for <100 csv files it should work fine.
    For example: this approach will have to be modified for docket/PACER data.

    Parameters
    ----------
    data_dir : str
        Directory containing the csv files. Subdirectories are not searched.

    Returns
    -------
    pandas.DataFrame
        Rows of all successfully loaded files stacked vertically, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no csv files, or none of them could be loaded.
    '''
    # glob.escape stops characters such as "[" or "*" in the directory name
    # from being interpreted as wildcard syntax.
    pattern = os.path.join(glob.escape(data_dir), "*.csv")
    all_files = sorted(glob.glob(pattern))
    print(f"Found {len(all_files)} files")

    if not all_files:
        # Fail with an actionable message instead of pandas'
        # "No objects to concatenate".
        raise ValueError(f"No CSV files found in {data_dir}")

    frames = []
    for filename in all_files:
        try:
            frames.append(pd.read_csv(filename, index_col=None, header=0))
        except Exception as e:
            # Deliberate best effort: one unreadable file should not discard
            # the files that did load.
            print(f"Error loading {filename}: {e}")

    if not frames:
        raise ValueError(
            f"None of the {len(all_files)} CSV files in {data_dir} could be loaded"
        )

    return pd.concat(frames, axis=0, ignore_index=True)

In [13]:
# Define the location of csv files.
# Our data directory doesn't have any subdirectories; this is where all csv files lie.
path_name_csv = data_dir

# Load the data: every csv in the directory is combined into one DataFrame
df_educations = load_data(path_name_csv)

# Examine the data obtained (column names, dtypes and non-null counts)
df_educations.info()

Found 7 files
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11520 entries, 0 to 11519
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   education_id   11520 non-null  int64  
 1   school_id      11520 non-null  int64  
 2   school_name    11520 non-null  object 
 3   person_url     11489 non-null  object 
 4   degree_level   11326 non-null  object 
 5   degree_detail  3309 non-null   object 
 6   degree_year    6697 non-null   float64
dtypes: float64(1), int64(2), object(4)
memory usage: 630.1+ KB


In [14]:
# Per-column summary: number of distinct values and number of missing values
for col in df_educations.columns:
    column_values = df_educations[col]
    n_unique = column_values.nunique()        # distinct non-null values
    n_missing = column_values.isnull().sum()  # count of NaN/None entries
    print(f"Column: {col} has:")
    print(f"{n_unique} unique values")
    print(f"{n_missing} null values")
    print("=====================================")

Column: education_id has:
11520 unique values
0 null values
Column: school_id has:
887 unique values
0 null values
Column: school_name has:
886 unique values
0 null values
Column: person_url has:
6537 unique values
31 null values
Column: degree_level has:
11 unique values
194 null values
Column: degree_detail has:
510 unique values
8211 null values
Column: degree_year has:
213 unique values
4823 null values


In [20]:
# Derive person_id from person_url: the id is the last path segment of the URL.
def _person_id_from_url(url):
    """Return the final '/'-separated segment of ``url``, or None if it is null."""
    if pd.isnull(url):
        # Rows without a person_url get no id
        return None
    # Strip leading/trailing slashes first so a trailing "/" does not
    # produce an empty last segment.
    segments = url.strip('/').split('/')
    return segments[-1]


df_educations['person_id'] = df_educations['person_url'].map(_person_id_from_url)
df_educations.head()

Unnamed: 0,education_id,school_id,school_name,person_url,degree_level,degree_detail,degree_year,person_id
0,12867,3681,West Virginia University,https://www.courtlistener.com/api/rest/v4/peop...,jd,,,9009
1,12866,4581,Lewis & Clark College,https://www.courtlistener.com/api/rest/v4/peop...,jd,,,16222
2,12865,5415,University of Washington-Seattle Campus,https://www.courtlistener.com/api/rest/v4/peop...,jd,,,16221
3,12864,3556,University of Minnesota-Twin Cities,https://www.courtlistener.com/api/rest/v4/peop...,jd,,,16220
4,12863,3899,CUNY Bernard M Baruch College,https://www.courtlistener.com/api/rest/v4/peop...,ba,,,16219


In [21]:
# Saving the data for manual inspection of columns
# df_educations.to_csv("sample.csv", index=False)

### Questions to have in mind before analysis:
1. How many unique people do we have data for?
2. What percentage of these people are judges? (Answering this requires looking them up in the positions dataframe.)
3. Which universities are the biggest feeders into the US judicial system?