# Community-Based Immigration Organizations

## Data Sources

- [Great Nonprofits](https://greatnonprofits.org)



In [None]:
import re
import pandas as pd
from pathlib import Path
from datetime import datetime


### File Locations

In [None]:
# Timestamp used to date-stamp the processed output file name.
today = datetime.today()
TARGET_FOLDER = 'data/raw'
DESTINATION_FOLDER = 'data/processed'
FILENAME = 'all-states-non-profits.csv'
# Raw export that this notebook reads.
in_file = Path(TARGET_FOLDER) / FILENAME
# Date-stamped output path. It is built from the stem of FILENAME so the
# result ends in a single ".csv"; using FILENAME directly produced
# "all-states-non-profits.csv-<date>.csv".
summary_file = Path(DESTINATION_FOLDER) / f'{Path(FILENAME).stem}-{today:%b-%d-%Y}.csv'

In [None]:
# Load the raw export into a DataFrame.
# NOTE(review): column dtypes are inferred by pandas here; identifier-like
# columns (phone numbers, zip codes) may be parsed as numbers rather than
# text - confirm against the raw file.
df = pd.read_csv(filepath_or_buffer=in_file)

## Column Cleanup

- Rename columns for consistency
- Get first and last names from the full-name column
- Remove non-numeric characters and spaces from phone number columns
- Delete unnecessary columns
- Re-arrange columns

In [None]:
# Map the scraped column headers to the title-cased names used by the
# rest of this notebook.
cols_to_rename = {
    'website': 'Website',
    'profile picture-src': 'Profile Picture',
    'street address': 'Address',
    'city': 'City',
    'state': 'State',
    'zip': 'Zip',
    'country': 'Country',
}
# Rename in place so `df` keeps referring to the same frame.
df.rename(inplace=True, columns=cols_to_rename)


# Clean description

- Keep only the description text and strip whitespace from the start and end of the string

In [None]:
def get_description(s: str) -> str:
    """Return the descriptive part of an overview string.

    When the text contains ``'Causes:'`` together with a ``' Mission:'``
    marker, everything from the first ``Mission:`` onwards is kept. Otherwise,
    when it contains ``'Causes:'`` and a ``' Programs:'`` marker, everything
    from the first ``Programs:`` onwards is kept. Text matching neither case
    is kept whole. Any ``'Causes:'`` label is then removed, newlines are
    replaced by spaces, and surrounding whitespace is stripped.

    Args:
        s: Overview text. Non-string values (for example NaN) are accepted.

    Returns:
        The cleaned description, or ``None`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return None
    if 'Causes:' in s:
        # ' Mission:' takes precedence over ' Programs:'.
        for marker in (' Mission:', ' Programs:'):
            # partition() cuts at the first marker only, so a later repeat of
            # the marker stays in the description instead of being dropped.
            _, found, tail = s.partition(marker)
            if found:
                s = marker.lstrip() + tail
                break
    return s.replace('Causes:', '').replace('\n', ' ').strip()


In [None]:
def get_services(s: str) -> str:
    """Return the leading "Causes" part of an overview string.

    When the text contains ``'Causes:'`` together with a ``' Mission:'``
    marker, only the text before the first ``' Mission:'`` is kept. Otherwise,
    when it contains ``'Causes:'`` and a ``' Programs:'`` marker, only the
    text before the first ``' Programs:'`` is kept. Text matching neither case
    is kept whole. The ``'Causes:'`` label is then removed, newlines are
    replaced by spaces, and surrounding whitespace is stripped.

    Args:
        s: Overview text. Non-string values (for example NaN) are accepted.

    Returns:
        The cleaned text, or ``None`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return None
    head = s
    if 'Causes:' in s:
        if ' Mission:' in s:
            head = s.partition(' Mission:')[0]
        elif ' Programs:' in s:
            head = s.partition(' Programs:')[0]
    without_label = head.replace('Causes:', '')
    return without_label.replace('\n', ' ').strip()


In [None]:
# Derive two text columns from the raw 'overview' column: 'Description'
# via get_description and 'Services' via get_services.
for new_col, extractor in (('Description', get_description), ('Services', get_services)):
    df[new_col] = df['overview'].apply(extractor)

# Get phone numbers from office info

- get toll-free number
- get telephone number
- get fax number

In [None]:
# remove non-numeric characters
# https://stackoverflow.com/questions/17336943/removing-non-numeric-characters-from-a-string
def remove_non_numeric_chars(s: str):
    """Strip every character other than the ASCII digits 0-9 from ``s``.

    Args:
        s: Text to clean. Non-string values (for example NaN or numbers) are
            accepted.

    Returns:
        The digits of ``s`` in their original order (possibly an empty
        string), or ``None`` when ``s`` is not a string.
    """
    # isinstance, unlike an exact type comparison, also accepts str
    # subclasses; it matches the checks used by the overview helpers.
    if not isinstance(s, str):
        return None
    return re.sub(r'[^0-9]', '', s)


In [None]:
# format phone numbers
def clean_phone_num(s):
    """Return ``s`` reduced to its digits.

    Delegates to ``remove_non_numeric_chars`` and returns its result
    unchanged.
    """
    digits_only = remove_non_numeric_chars(s)
    return digits_only

In [None]:
# Replace the 'Phone' column with its cleaned values. Only 'Phone' is
# processed here; no fax column is touched.
cleaned_phones = df['Phone'].apply(clean_phone_num)
df['Phone'] = cleaned_phones


# More cleaning

- drop unnecessary columns
- drop rows with missing data based on criteria provided
- add missing columns with default values
- re-arrange columns to desired order

In [None]:
# Remove scraper bookkeeping columns, along with the raw 'overview' column
# that was already split into 'Description' and 'Services'.
cols_to_drop = [
    'web-scraper-order',
    'web-scraper-start-url',
    'link',
    'link-href',
    'website-href',
    'pagination',
    'overview',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Drop a row only when Phone, Website and Address are ALL missing; a row
# with at least one of them present is kept.
row_missing_values = [
    'Phone',
    'Website',
    'Address',
]
df = df.dropna(subset=row_missing_values, how='all')


In [None]:
# Add an 'Email' column, filled with an empty string for every row,
# so the column exists when the columns are re-arranged below.
df['Email'] = ''


In [None]:
# Select the output columns in their final order; any column not listed
# here is dropped from `df`.
# NOTE(review): 'Zip' is produced by the earlier rename step but is not
# listed below, so it is discarded - confirm that is intended.
rearrange_cols = [
    'Organization Name',
    'EIN Number',
    'Address',
    'Country',
    'State',
    'City',
    'Description',
    'Email',
    'Phone',
    'Website',
    'Services',
    'Profile Picture',
]
df = df[rearrange_cols]

# Save cleaned data

- export df as csv

In [None]:
# The CSV export is currently disabled; uncomment the next line to write the
# cleaned frame to `summary_file`.
# NOTE(review): nothing in this notebook creates the destination folder -
# confirm it exists before enabling the export.
# df.to_csv(summary_file)
# Bare expression: displays the cleaned frame as the notebook cell output.
df