In [1]:
import pandas as pd

In [2]:
# Load the raw survey records from CSV into a DataFrame
surveys_df = pd.read_csv('../data/raw/surveys.csv')

In [3]:
# Inspect the dtype pandas inferred for each column
surveys_df.dtypes

record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object

In [4]:
# Count how many weight measurements are missing
surveys_df['weight'].isna().sum()

3266

In [5]:
# Work on a copy so the original surveys_df keeps its missing values
surveys_copy = surveys_df.copy()

In [6]:
# Replace every missing weight in the copy with 0
surveys_copy = surveys_copy.assign(weight=surveys_copy['weight'].fillna(0))

In [7]:
# Mean weight of the original data, where missing weights are still NaN
# (pandas leaves NaN out of the mean by default)
surveys_df['weight'].mean()

42.672428212991356

In [8]:
# Preview the first rows of the zero-filled copy
surveys_copy.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,0.0
1,2,7,16,1977,3,NL,M,33.0,0.0
2,3,7,16,1977,2,DM,F,37.0,0.0
3,4,7,16,1977,7,DM,M,36.0,0.0
4,5,7,16,1977,3,DM,M,35.0,0.0


In [9]:
# Mean weight after zero-filling: the inserted zeros pull the average down
surveys_copy['weight'].mean()

38.751976145601844

In [10]:
# Alternative imputation: replace missing weights with the column mean.
# The mean is taken from the untouched original, so NaN values do not bias it.
mean_weight = surveys_df['weight'].mean()
surveys_copy = surveys_df.assign(weight=surveys_df['weight'].fillna(mean_weight))

In [11]:
# Preview the first rows of the mean-filled copy
surveys_copy.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,42.672428
1,2,7,16,1977,3,NL,M,33.0,42.672428
2,3,7,16,1977,2,DM,F,37.0,42.672428
3,4,7,16,1977,7,DM,M,36.0,42.672428
4,5,7,16,1977,3,DM,M,35.0,42.672428


In [12]:
# Filling with the mean leaves the column mean unchanged
# (apart from floating-point rounding in the last digits)
surveys_copy['weight'].mean()

42.67242821299182

In [13]:
# Keep only complete records: a row is dropped if any of its columns is missing
surveys_nona = surveys_df.dropna(how='any')
surveys_nona.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
62,63,8,19,1977,3,DM,M,35.0,40.0
63,64,8,19,1977,7,DM,M,37.0,48.0
64,65,8,19,1977,4,DM,F,34.0,29.0
65,66,8,19,1977,4,DM,F,35.0,46.0
66,67,8,19,1977,7,DM,M,35.0,36.0


In [14]:
# We lost about 4,900 rows (35549 - 30676 = 4873)
surveys_nona.shape

(30676, 9)

In [15]:
# Shape of the original data, for comparison with surveys_nona
surveys_df.shape

(35549, 9)

In [None]:
# Exercise: count the number of missing values per column of surveys_df

# Hint: .count() gives the number of non-NA observations per column

In [17]:
# Using .count(): total rows minus the non-missing count per column
surveys_df.shape[0] - surveys_df.count()

record_id             0
month                 0
day                   0
year                  0
plot_id               0
species_id          763
sex                2511
hindfoot_length    4111
weight             3266
dtype: int64

In [19]:
# Same result via a boolean mask: flag missing cells, then sum each column
null_counts = surveys_df.isna().sum()
null_counts

record_id             0
month                 0
day                   0
year                  0
plot_id               0
species_id          763
sex                2511
hindfoot_length    4111
weight             3266
dtype: int64

In [20]:
# What type is null_counts?
type(null_counts)

pandas.core.series.Series

In [21]:
# Select a single entry by position; position 6 is the 'sex' entry
# in the counts listed above
null_counts.iloc[6]

2511

In [22]:
import os

# List the files and folders found directly inside the data directory
os.listdir('../data')

['output', 'raw']

In [24]:
# Create a new directory.
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the directory is still there from an earlier run.
os.makedirs('../data/output2', exist_ok=True)

In [25]:
# Which folders do we have now? List the data directory again to check
os.listdir('../data')

['output', 'output2', 'raw']

In [26]:
# We will save our data: write the rows without missing values to CSV.
# index=False leaves the DataFrame's row index out of the file.
surveys_nona.to_csv('../data/output/surveys_complete.csv', index=False)

In [27]:
# Command to remove a directory (os.rmdir only succeeds if it is empty)
os.rmdir('../data/output2')