# Immigration Lawyers

## Data Sources

- [Justia](https://www.justia.com)

In [None]:
import re
import json
import pandas as pd
from pathlib import Path
from datetime import datetime

### File Locations

- review folder structure and provide filenames

In [None]:
# Input and output locations, relative to the current working directory.
today = datetime.today()
TARGET_FOLDER = 'data/raw'
DESTINATION_FOLDER = 'data/processed'
FILENAME = 'all-states-justia-lawyers.csv'
in_file = Path(TARGET_FOLDER) / FILENAME
# Build the dated output name from the stem of FILENAME so the result is
# 'all-states-justia-lawyers-<Mon-DD-YYYY>.csv' instead of carrying the
# input's '.csv' in the middle of the name.
summary_file = Path(DESTINATION_FOLDER) / f'{Path(FILENAME).stem}-{today:%b-%d-%Y}.csv'

In [None]:
# load the raw CSV at in_file into a DataFrame
df = pd.read_csv(in_file)

## Column Cleanup

- rename columns

In [None]:
# old column name -> new column name
cols_to_rename = {
    'lawyer': 'Full Name',
    'Website': 'non-website',
    'Website-href': 'Website',
    'Profile Picture-src': 'Profile Picture',
    'Office-info': 'Office',
}
df = df.rename(columns=cols_to_rename)

In [None]:
def clean_description(s: str):
    """Return the biography text that follows the 'Biography' heading.

    Args:
        s: Raw description text. Non-string values (for example a NaN
            cell) are treated as missing.

    Returns:
        Everything after the first occurrence of 'Biography', with
        surrounding whitespace stripped, or None when ``s`` is not a
        string or does not contain 'Biography'.
    """
    if not isinstance(s, str):
        return None
    # partition cuts at the first occurrence only, so a biography that
    # itself mentions the word "Biography" is kept whole.
    _, heading, biography = s.partition('Biography')
    return biography.strip() if heading else None
        

In [None]:
# keep only the biography part of each description (None where there is none)
df['Description'] = df['Description'].apply(clean_description)

## Get phone numbers from office info

- get toll-free number
- get telephone number
- get fax number

In [None]:
# remove non-numeric characters
# https://stackoverflow.com/questions/17336943/removing-non-numeric-characters-from-a-string
def remove_non_numeric_chars(s: str):
    """Keep only the ASCII digits 0-9 of ``s``; return None for non-strings."""
    if type(s) is not str:
        return None
    return ''.join(ch for ch in s if '0' <= ch <= '9')


In [None]:
# get telephone number if exists
def get_tel_num(s):
    """Return the digits of the first office's telephone number, if any.

    Args:
        s: JSON text holding a list of office objects, each with an
            'Office-info' string. Anything that is not a string (such as
            the NaN pandas stores for an empty cell) counts as no data.

    Returns:
        The digits found between the 'Telephone:' label and the next colon
        in the first office's info, or None when there is no office entry
        or no 'Telephone:' label.

    Raises:
        json.JSONDecodeError: If ``s`` is a string that is not valid JSON.
        KeyError: If the first office object has no 'Office-info' key.
    """
    if not isinstance(s, str):
        return None
    offices = json.loads(s)
    if not offices:
        return None
    info = offices[0]['Office-info']
    if not isinstance(info, str) or 'Telephone:' not in info:
        return None
    # The value runs up to the next "Label:"; the letters of that trailing
    # label are discarded when the non-digits are stripped.
    value = info.split('Telephone:')[1].split(':')[0]
    return remove_non_numeric_chars(value)


In [None]:
# get fax number if exists
def get_fax_num(s):
    """Return the digits of the first office's fax number, if any.

    Args:
        s: JSON text holding a list of office objects, each with an
            'Office-info' string. Anything that is not a string (such as
            the NaN pandas stores for an empty cell) counts as no data.

    Returns:
        The digits found between the 'Fax:' label and the next colon in
        the first office's info, or None when there is no office entry or
        no 'Fax:' label.

    Raises:
        json.JSONDecodeError: If ``s`` is a string that is not valid JSON.
        KeyError: If the first office object has no 'Office-info' key.
    """
    if not isinstance(s, str):
        return None
    offices = json.loads(s)
    if not offices:
        return None
    info = offices[0]['Office-info']
    if not isinstance(info, str) or 'Fax:' not in info:
        return None
    # The value runs up to the next "Label:"; the letters of that trailing
    # label are discarded when the non-digits are stripped.
    value = info.split('Fax:')[1].split(':')[0]
    return remove_non_numeric_chars(value)


In [None]:
# get toll-free number if exists
def get_toll_free_num(s):
    """Return the digits of the first office's toll-free number, if any.

    Args:
        s: JSON text holding a list of office objects, each with an
            'Office-info' string. Anything that is not a string (such as
            the NaN pandas stores for an empty cell) counts as no data.

    Returns:
        The digits found between the 'Toll-Free:' label and the next colon
        in the first office's info, or None when there is no office entry
        or no 'Toll-Free:' label.

    Raises:
        json.JSONDecodeError: If ``s`` is a string that is not valid JSON.
        KeyError: If the first office object has no 'Office-info' key.
    """
    if not isinstance(s, str):
        return None
    offices = json.loads(s)
    if not offices:
        return None
    info = offices[0]['Office-info']
    if not isinstance(info, str) or 'Toll-Free:' not in info:
        return None
    # The value runs up to the next "Label:"; the letters of that trailing
    # label are discarded when the non-digits are stripped.
    value = info.split('Toll-Free:')[1].split(':')[0]
    return remove_non_numeric_chars(value)


In [None]:
# derive the contact-number columns from the 'Office' column
df = df.assign(**{
    'Phone Number': df['Office'].apply(get_tel_num),
    'Fax': df['Office'].apply(get_fax_num),
    # the toll-free number is stored under 'Mobile Number'
    'Mobile Number': df['Office'].apply(get_toll_free_num),
})

## More column cleanup

- drop columns that are not needed in the final df
- drop rows with null values based on criteria
- add default columns
- re-arrange columns

In [None]:
# remove the columns that are not carried into the final output
cols_to_drop = [
    'web-scraper-order',
    'web-scraper-start-url',
    'lawyer-href',
    'non-website',
    'Office',
    'Office-info-class',
]
df = df.drop(columns=cols_to_drop)


In [None]:
# delete rows with missing values in all of the selected columns
# (how='all': a row is dropped only when 'Phone Number', 'Website' and
# 'Address' are all missing; a row with at least one of them is kept)
row_missing_values = ['Phone Number', 'Website', 'Address']
df = df.dropna(how='all', subset=row_missing_values)


In [None]:
# every row gets the same country and an empty e-mail placeholder
df = df.assign(Country='USA', Email='')

In [None]:
# final column order; columns not listed here are left out of the output
rearrange_cols = [
    'Name of Business',
    'Address',
    'Country',
    'State',
    'City',
    'Description',
    'Email',
    'Phone Number',
    'Website',
    'Fax',
    'Full Name',
    'Mobile Number',
    'Profile Picture',
]
df = df[rearrange_cols]


In [None]:
# save cleaned data
# Create the destination folder first: to_csv does not create missing
# parent directories and fails if the folder is absent.
summary_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(summary_file)
# df.info()