# Dealing with unclean data

We're going to look at data that may require some cleansing.

In [1]:
import pandas as pd

## Read the admissions data that is not so clean

In [2]:
# Local copy of the dataset; switch to the URL below when the file is not available locally.
data_location = "../data/admission-data-dirty.csv"
#data_location = 'https://github.com/elephantscale/datasets/raw/master/college-admissions/admission-data-dirty.csv'

# Echo the source so the recorded output documents where the data came from
# (the saved output of this cell already contains this line).
print ("data_location:", data_location)
admissions = pd.read_csv(data_location)

print ("admissions shape : ", admissions.shape)
admissions

data_location: ../data/admission-data-dirty.csv
admissions shape :  (20, 4)


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
5,0.0,,3.35,
6,1.0,520.0,,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,,600.0,2.82,4


## Get Summary
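`info()` lists each column's dtype and its non-null count; null values are not counted, so columns with missing data show fewer entries than the total number of rows.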

In [3]:
# Print per-column non-null counts and dtypes for the raw frame.
admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   18 non-null     float64
 1   gre     19 non-null     float64
 2   gpa     19 non-null     float64
 3   rank    19 non-null     object 
dtypes: float64(3), object(1)
memory usage: 768.0+ bytes


### Count Nulls per column

In [4]:
# Number of missing values in each column (isna is the same check as isnull).
admissions.isna().sum()

admit    2
gre      1
gpa      1
rank     1
dtype: int64

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
admissions.describe(include="all")

Unnamed: 0,admit,gre,gpa,rank
count,18.0,19.0,19.0,19.0
unique,,,,5.0
top,,,,2.0
freq,,,,10.0
mean,0.5,594.736842,3.499474,
std,0.514496,109.309368,0.353467,
min,0.0,400.0,2.82,
25%,0.0,510.0,3.25,
50%,0.5,600.0,3.56,
75%,1.0,690.0,3.715,


In [6]:
# Summary statistics restricted to the two score columns.
score_columns = ['gre', 'gpa']
admissions[score_columns].describe()

Unnamed: 0,gre,gpa
count,19.0,19.0
mean,594.736842,3.499474
std,109.309368,0.353467
min,400.0,2.82
25%,510.0,3.25
50%,600.0,3.56
75%,690.0,3.715
max,800.0,4.0


## Drop all null values

In [7]:
# Remove every row that has at least one missing value, then compare sizes.
dropped_na = admissions.dropna()

print("raw data shape : ", admissions.shape)
print()
print("after drop shape : ", dropped_na.shape)
dropped_na


raw data shape :  (20, 4)

after drop shape :  (16, 4)


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
10,1.0,500.0,3.6,3
11,0.0,500.0,3.95,4
13,1.0,560.0,3.59,2


In [8]:
# Drop a row only when 'admit' or 'gre' is missing; gaps in other columns are kept.
required_columns = ['admit', 'gre']
dropped2 = admissions.dropna(subset=required_columns)

print("raw data shape : ", admissions.shape)
print()
print("after drop shape : ", dropped2.shape)
dropped2

raw data shape :  (20, 4)

after drop shape :  (17, 4)


Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
6,1.0,520.0,,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
10,1.0,500.0,3.6,3
11,0.0,500.0,3.95,4


## Fill in the values

In [9]:
# Replace each missing value, in every column, with 0.
zero_fill = admissions.fillna(value=0)
zero_fill

Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
5,0.0,0.0,3.35,0
6,1.0,520.0,0.0,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,0.0,600.0,2.82,4


In [10]:
# Per-column replacement values: a sentinel for 'admit', the column mean for
# each score column, and a placeholder value for 'rank'.
column_defaults = {
    'admit': -1,
    'gre': admissions['gre'].mean(),
    'gpa': admissions['gpa'].mean(),
    'rank': 10,
}
fill2 = admissions.fillna(value=column_defaults)
fill2

Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
5,0.0,594.736842,3.35,10
6,1.0,520.0,3.499474,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,-1.0,600.0,2.82,4


## Replace values

In [11]:
# Show the original frame first so the before/after comparison is visible.
print (admissions)

# Work on a deep copy so `admissions` itself is never modified.
admissions2 = admissions.copy(deep=True)
# Assign the replaced column back rather than calling replace(..., inplace=True)
# on the selected column: that form is chained assignment, which warns on
# recent pandas and does not update the parent frame under Copy-on-Write.
admissions2['gre'] = admissions2['gre'].replace(800, 1000)

print()
print (admissions2)

    admit    gre   gpa rank
0     1.0  400.0  3.23    4
1     1.0  700.0  3.56    1
2     1.0  800.0  4.00    2
3     0.0  500.0  3.53    4
4     0.0  560.0  3.78    2
5     0.0    NaN  3.35  NaN
6     1.0  520.0   NaN    3
7     0.0  440.0  3.17    2
8     1.0  760.0  3.00    2
9     NaN  600.0  2.82    4
10    1.0  500.0  3.60    3
11    0.0  500.0  3.95    4
12    NaN  680.0  3.27    2
13    1.0  560.0  3.59    2
14    0.0  700.0  3.65    2
15    0.0  520.0  2.98    2
16    0.0  700.0  3.92    2
17    1.0  620.0  4.00    x
18    0.0  640.0  3.51    2
19    1.0  600.0  3.58    1

    admit     gre   gpa rank
0     1.0   400.0  3.23    4
1     1.0   700.0  3.56    1
2     1.0  1000.0  4.00    2
3     0.0   500.0  3.53    4
4     0.0   560.0  3.78    2
5     0.0     NaN  3.35  NaN
6     1.0   520.0   NaN    3
7     0.0   440.0  3.17    2
8     1.0   760.0  3.00    2
9     NaN   600.0  2.82    4
10    1.0   500.0  3.60    3
11    0.0   500.0  3.95    4
12    NaN   680.0  3.27    2
13   

## Clean out RANK column

In [12]:

# Keep only the rows whose rank is one of the expected labels; rows with any
# other rank value do not match and are left out.
valid_ranks = ['1', '2', '3', '4']
has_valid_rank = admissions['rank'].isin(valid_ranks)
admissions3 = admissions[has_valid_rank]
admissions3

Unnamed: 0,admit,gre,gpa,rank
0,1.0,400.0,3.23,4
1,1.0,700.0,3.56,1
2,1.0,800.0,4.0,2
3,0.0,500.0,3.53,4
4,0.0,560.0,3.78,2
6,1.0,520.0,,3
7,0.0,440.0,3.17,2
8,1.0,760.0,3.0,2
9,,600.0,2.82,4
10,1.0,500.0,3.6,3


## Converting Types

The `rank` column still has dtype `object`. Let's convert it to an integer type.

In [13]:
# Check the column dtypes of the filtered frame before converting `rank`.
admissions3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 0 to 19
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   16 non-null     float64
 1   gre     18 non-null     float64
 2   gpa     17 non-null     float64
 3   rank    18 non-null     object 
dtypes: float64(3), object(1)
memory usage: 720.0+ bytes
