In [8]:
# load dependencies
import pandas as pd

In [9]:
# relative path of the raw (not yet cleaned) 2021 donor CSV
filepath = "Resources/donors2021_unclean.csv"

In [10]:
# Load the raw donor file into a DataFrame.
# The encoding is passed explicitly instead of relying on the pandas default.
donors_2021 = pd.read_csv(
    filepath,
    encoding="ISO-8859-1",
)

In [11]:
# Quick look at the first five rows.
# Memo_CD is empty in every previewed row, so it is probably not a useful column.
donors_2021.iloc[:5]

Unnamed: 0,Name,Employer,City,State,Zip,Amount,Memo_CD
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500,
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250,
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250,
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000,
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250,


In [12]:
# Remove the extraneous Memo_CD column.
# drop(..., errors="ignore") returns a new frame and does nothing when the
# column is already gone, so this cell can be re-run safely; the previous
# `del donors_2021["Memo_CD"]` raised KeyError on a second run.
donors_2021 = donors_2021.drop(columns=["Memo_CD"], errors="ignore")

donors_2021.head()

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250


In [13]:
# Count the non-missing values in each column; any column whose total is
# below the row count contains missing entries.
donors_2021.notna().sum()

Name        2000
Employer    1820
City        1999
State       1999
Zip         1996
Amount      2000
dtype: int64

In [14]:
# Keep only fully populated rows: a row is discarded as soon as any one of
# its fields is missing.
donors_2021 = donors_2021.dropna(axis="index", how="any")

In [15]:
# Verify the drop. The assertion fails loudly if any missing value survived,
# instead of relying on someone eyeballing the counts below.
assert not donors_2021.isna().any().any(), "missing values remain after dropna"

# Every column should now report the same non-null count.
donors_2021.count()

Name        1818
Employer    1818
City        1818
State       1818
Zip         1818
Amount      1818
dtype: int64

In [16]:
# Inspect the dtype of every column.
# Zip is parsed as float64, which is the wrong data type for a ZIP code:
# it is an identifier rather than a quantity, so it should be a string (object).
donors_2021.dtypes

Name         object
Employer     object
City         object
State        object
Zip         float64
Amount        int64
dtype: object

In [17]:
# Convert the Zip column from float64 to string (object).
# Casting a float straight to str keeps the float repr (14221.0 -> "14221.0"),
# so go through int64 first to get rid of the ".0" suffix.
zip_digits = donors_2021["Zip"].astype("int64").astype(str)

# Parsing Zip as a number also stripped any leading zeros. Restore them by
# padding back to the original width.
# NOTE(review): assumes every value is a US 5-digit ZIP or a 9-digit ZIP+4
# (at most 5 digits -> pad to 5, otherwise pad to 9) -- confirm against the raw file.
is_zip5 = zip_digits.str.len() <= 5
donors_2021 = donors_2021.assign(
    Zip=zip_digits.str.zfill(9).where(~is_zip5, zip_digits.str.zfill(5))
)

In [18]:
# Confirm the conversion: Zip should now be reported with the object dtype.
donors_2021.dtypes["Zip"]

dtype('O')

In [19]:
# Display an overview of the Employer column: the number of rows per distinct
# employer value, most frequent first. Useful for spotting different spellings
# of the same category (e.g. SELF vs SELF-EMPLOYED).
donors_2021['Employer'].value_counts()

NOT EMPLOYED                        609
NONE                                321
SELF-EMPLOYED                       132
SELF                                 33
RETIRED                              32
                                   ... 
NOKIA CORP                            1
FH MINE SUPPLY INC.                   1
DREYER INTERNATIONAL ACADEMY LLC      1
RAY GRAHAM ASSOCIATION                1
5T WEALTH, LLC                        1
Name: Employer, Length: 519, dtype: int64

In [21]:
# Collapse the alternative spellings of self-employment into the single
# label 'SELF-EMPLOYED'. Only exact whole-value matches are replaced.
self_employed_aliases = {
    'SELF': 'SELF-EMPLOYED',
    'SELF EMPLOYED': 'SELF-EMPLOYED',
}
donors_2021 = donors_2021.assign(
    Employer=donors_2021['Employer'].replace(self_employed_aliases)
)

In [22]:
# Verify the clean-up: 'SELF' and 'SELF EMPLOYED' should no longer appear as
# separate values, and the 'SELF-EMPLOYED' count should have grown accordingly.
donors_2021['Employer'].value_counts()

NOT EMPLOYED                            609
NONE                                    321
SELF-EMPLOYED                           180
RETIRED                                  32
INGRAM BARGE COMPANY                     30
                                       ... 
GOOGLE LLC                                1
BP INDUSTRIES INC                         1
HOT SPRINGS COUNTY DISTRICT HOSPITAL      1
INVEST AMERICA REALTY                     1
5T WEALTH, LLC                            1
Name: Employer, Length: 517, dtype: int64

In [23]:
# Relabel 'NOT EMPLOYED' as 'UNEMPLOYED' so that both spellings are counted
# under one category, then re-display the counts to check the result.
unemployed_aliases = {'NOT EMPLOYED': 'UNEMPLOYED'}
donors_2021 = donors_2021.assign(
    Employer=donors_2021['Employer'].replace(unemployed_aliases)
)
donors_2021['Employer'].value_counts()

UNEMPLOYED                        611
NONE                              321
SELF-EMPLOYED                     180
RETIRED                            32
INGRAM BARGE COMPANY               30
                                 ... 
JEROME'S COLLISION CENTER           1
LINDQUIST MORTIARIES                1
GAINESVILLE SKIN CANCER CENTER      1
RYAN SPECIALTYGROUP                 1
5T WEALTH, LLC                      1
Name: Employer, Length: 516, dtype: int64

In [24]:
# Display a statistical overview of the numeric columns (only Amount remains
# numeric once Zip has been converted to a string).
# 'max' and 'min' are simply the largest and smallest single amounts recorded
# in this data set.
# NOTE(review): 'max' is not necessarily a legal contribution limit, and the
# negative minimum presumably reflects refunds -- confirm before drawing
# conclusions from these values.
donors_2021.describe()

Unnamed: 0,Amount
count,1818.0
mean,752.127613
std,11601.791128
min,-1000.0
25%,25.0
50%,50.0
75%,200.0
max,400000.0


In [25]:
# Write the cleaned frame to disk without the index column, using the same
# encoding the raw file was read with.
donors_2021.to_csv(
    "donors2021.csv",
    index=False,
    encoding="ISO-8859-1",
)