# How to deal with missing data 

In [36]:
%autosave 0

Autosave disabled


In [37]:
# print all the outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Load the data

In [38]:
import pandas as pd
df = pd.read_csv("winemag-data-130k.csv", index_col=0)

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 20)

In [40]:
df.shape

(129971, 13)

In [41]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [42]:
df.memory_usage(index=True)

Index                    1039768
country                  1039768
description              1039768
designation              1039768
points                   1039768
price                    1039768
province                 1039768
region_1                 1039768
region_2                 1039768
taster_name              1039768
taster_twitter_handle    1039768
title                    1039768
variety                  1039768
winery                   1039768
dtype: int64

In [43]:
df.memory_usage()
# default index=True

Index                    1039768
country                  1039768
description              1039768
designation              1039768
points                   1039768
price                    1039768
province                 1039768
region_1                 1039768
region_2                 1039768
taster_name              1039768
taster_twitter_handle    1039768
title                    1039768
variety                  1039768
winery                   1039768
dtype: int64

In [44]:
df.memory_usage(index=True).sum()

14556752

In [45]:
ls -als wine*


103344 -rw-r--r--@ 1 zhujunlan  staff  52908706 Oct 25 00:44 winemag-data-130k.csv


## Missing data

Entries missing values are given the value **NaN**, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.

*pandas* provides some methods specific to missing data. To select NaN entreis you can use **isna()**( or **isnull()**) (or its companion notna()(notnull()).

<b>isna()</b>: Return a boolean same-sized object indicating if the values are NA

In [46]:
df.isna()[0:10]
# nan 不是string，或者int, 是special type -float64 dtype

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,False,False,False,False,True,False,False,True,False,False,False,False,False
1,False,False,False,False,False,False,True,True,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,True,False,False,False,False,False
6,False,False,False,False,False,False,False,True,False,False,False,False,False
7,False,False,True,False,False,False,False,True,False,False,False,False,False
8,False,False,False,False,False,False,True,True,False,True,False,False,False
9,False,False,False,False,False,False,False,True,False,False,False,False,False


Use **.any()** to return whether any element is True over requested axis

In [47]:
df.isna().any()

country                   True
description              False
designation               True
points                   False
price                     True
province                  True
region_1                  True
region_2                  True
taster_name               True
taster_twitter_handle     True
title                    False
variety                   True
winery                   False
dtype: bool

use **.sum()** to get the sum of the Nan values for the requested axis

In [48]:
df.isna().sum()
# because when we do sum the values, if there is no nan, the sum()should be 0, so it's better to delete 63 nan data in
# country
# region_2 有效的信息少于一半，所以删除，
# 更好的操作方法是看公司的guideline, 知道哪些重要，不可以删除

country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

use **.sum().sum()** to get the total Nan in dataframe

In [49]:
df.isna().sum().sum()
# total na in this dataframe 

204752

## How to include Nan in .groupby ?

Include Nan value, find all the *taster_twitter_handle* and sort them ascending. 

In [50]:
df.taster_twitter_handle.isna().sum()

31213

In [51]:
len(df.taster_twitter_handle.unique())

16

In [52]:
df.groupby('taster_twitter_handle')['taster_twitter_handle'].count().sort_values()
# NAN group自动被删除了用groupby 的时候
# 需要先把NAN转化成string

taster_twitter_handle
@winewchristina         6
@bkfiona               27
@worldwineguys       1005
@suskostrzewa        1085
@laurbuzz            1835
@AnneInVino          3685
@gordone_cellars     4177
@wawinereport        4966
@JoeCz               5147
@mattkettmann        6332
@paulgwine           9532
@vboone              9537
@kerinokeefe        10776
@wineschach         15134
@vossroger          25514
Name: taster_twitter_handle, dtype: int64

Nan groups in GroupBy are automatically excluded ... if need to keep Nan as a group, use .astype(str)...

In [53]:
df.astype(str).groupby('taster_twitter_handle')\
    ['taster_twitter_handle'].count().sort_values()
# df.astype(str)-将所有的values都转成string 

taster_twitter_handle
@winewchristina         6
@bkfiona               27
@worldwineguys       1005
@suskostrzewa        1085
@laurbuzz            1835
@AnneInVino          3685
@gordone_cellars     4177
@wawinereport        4966
@JoeCz               5147
@mattkettmann        6332
@paulgwine           9532
@vboone              9537
@kerinokeefe        10776
@wineschach         15134
@vossroger          25514
nan                 31213
Name: taster_twitter_handle, dtype: int64

## How to deal with Nan?

## fillna()

Replacing missing values is a common operation.  *pandas* provides a really handy method for this problem: **fillna()**. fillna provides a few different strategies for mitigating such data. 

### Example 1, replace region_1 each NaN with an  "Unknown":

replace NaN in region_1 with "Unknown"

In [54]:
df.region_1.fillna("Unknown",inplace=True)

In [55]:
df.region_1[:5]

0                   Etna
1                Unknown
2      Willamette Valley
3    Lake Michigan Shore
4      Willamette Valley
Name: region_1, dtype: object

### Example 2, replace the NaN in 'price' with price's average:

In [56]:
df.price.mean()

35.363389129985535

In [57]:
df.price.fillna(value=35, inplace=True)
# value = ... 可以不写， 是default，可以是int, float, string ..不能是list

In [58]:
df.price.isna().any()

False

## dropna()

In [59]:
df2=df.copy()

In [60]:
df.shape

(129971, 13)

In [61]:
df2 = df2.dropna()
# drop 整个row 只要有一个nan 在这一行
# 不要用这个，除非nan特别少的时候

In [62]:
df2.shape

(22524, 13)

In [63]:
df2.isna().any()

country                  False
description              False
designation              False
points                   False
price                    False
province                 False
region_1                 False
region_2                 False
taster_name              False
taster_twitter_handle    False
title                    False
variety                  False
winery                   False
dtype: bool

The above operations dropped 83% of data, not a good idea ..

### Example 3, drop the rows with country = NaN :

In [64]:
# 更好的方法是 specify 哪个column如果有nan 再drop
df.shape

(129971, 13)

In [65]:
df.country.isna().sum()

63

In [66]:
# verify before and after drop any 

Let's figure out where is the first country with country with NaN

In [67]:
df[df.country.isna()==True].head(1)
# find the first find nan 
#！！！！  still a view 所以不能用.iloc[915:925],或者any index  !!!

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
913,,"Amber in color, this wine has aromas of peach ...",Asureti Valley,87,30.0,,Unknown,,Mike DeSimone,@worldwineguys,Gotsa Family Wines 2014 Asureti Valley Chinuri,Chinuri,Gotsa Family Wines


In [68]:
df.dropna(how='any',subset=['country'])[912:915]
# default axis=0 是drop row, 也可以drop column， axis=1
# how=‘any' - 只要有nan 这个一行就drop 
# how='all' - 所有都是nan才drop
# subset -  array like(所以就算只有一个也用[]括起来！！！)！！！
# 913 被drop掉了

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
912,US,"This light, refreshing rosé mixes fresh strawb...",Three Otters Pinot Noir,87,18.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Fullerton 2015 Three Otters Pinot Noir Rosé (W...,Rosé,Fullerton
914,Argentina,Smashed berry aromas are backed by earth and t...,Grand Reserve,87,20.0,Other,San Juan,,Michael Schachner,@wineschach,Graffigna 2012 Grand Reserve Malbec (San Juan),Malbec,Graffigna
915,US,"This is smooth and accessible, putting a light...",Atração,87,25.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Coelho 2014 Atração Pinot Noir (Willamette Val...,Pinot Noir,Coelho


In [69]:
df.dropna(how='any',subset=['country']，inplace=True)

SyntaxError: invalid character in identifier (<ipython-input-69-e00010115eb7>, line 1)

In [70]:
df.shape

(129971, 13)

In [71]:
df.country.isna().sum()

63

### Example 4, drop based on threshold(number of non-NaN)

Drop **column(s)** has more than 50% NaN. (*require at least 65,000 non-NaN*)

In [72]:
df.shape # 没有（）

(129971, 13)

In [73]:
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                        0
province                    63
region_1                     0
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [74]:
df.dropna(thresh=65000, axis=1, inplace=True)
# axis=1 drop column, 保证所有row 不变

In [75]:
df.shape

(129971, 12)

In [76]:
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                        0
province                    63
region_1                     0
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

Above operations cause region_2 got dropped.

In [77]:
df.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,35.0,Sicily & Sardinia,Etna,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia


## backfill/ffill 

Or we could fill each NaN with the first non-NaN value that appears sometime after/before the given record in the database. This is known as the backfill/ffill strategy:

Fill the NaN in 'taster_name' with the first non-null value that appears after the given record.

In [78]:
df.taster_name.isna().sum()

26244

In [79]:
df[df.taster_name.isna()==True].head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
31,Italy,Merlot and Nero d'Avola form the base for this...,Calanìca Nero d'Avola-Merlot,86,35.0,Sicily & Sardinia,Sicilia,,,Duca di Salaparuta 2010 Calanìca Nero d'Avola-...,Red Blend,Duca di Salaparuta
32,Italy,"Part of the extended Calanìca series, this Gri...",Calanìca Grillo-Viognier,86,35.0,Sicily & Sardinia,Sicilia,,,Duca di Salaparuta 2011 Calanìca Grillo-Viogni...,White Blend,Duca di Salaparuta
33,US,"Rustic and dry, this has flavors of berries, c...",Puma Springs Vineyard,86,50.0,California,Dry Creek Valley,,,Envolve 2010 Puma Springs Vineyard Red (Dry Cr...,Red Blend,Envolve
34,US,"This shows a tart, green gooseberry flavor tha...",,86,20.0,California,Sonoma Valley,,,Envolve 2011 Sauvignon Blanc (Sonoma Valley),Sauvignon Blanc,Envolve
37,Italy,This concentrated Cabernet offers aromas of cu...,Missoni,86,21.0,Sicily & Sardinia,Sicilia,,,Feudi del Pisciotto 2010 Missoni Cabernet Sauv...,Cabernet Sauvignon,Feudi del Pisciotto


In [80]:
df.taster_name.iloc[30:36]

30      Roger Voss
31             NaN
32             NaN
33             NaN
34             NaN
35    Paul Gregutt
Name: taster_name, dtype: object

### method='backfill'

In [81]:
df.taster_name.fillna(method='backfill').iloc[30:36]

30      Roger Voss
31    Paul Gregutt
32    Paul Gregutt
33    Paul Gregutt
34    Paul Gregutt
35    Paul Gregutt
Name: taster_name, dtype: object

### method='ffill'

In [82]:
df.taster_name.fillna(method='ffill').iloc[30:36]

30      Roger Voss
31      Roger Voss
32      Roger Voss
33      Roger Voss
34      Roger Voss
35    Paul Gregutt
Name: taster_name, dtype: object

In [83]:
df.taster_name.fillna(method='backfill',inplace=True)

In [84]:
df.taster_name.isna().any()

False

In [85]:
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                        0
province                    63
region_1                     0
taster_name                  0
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

## Problems:

We want to clean up the rest of this data set based on following guidelines:

1, change all the NaN in 'taster_twitter_handle' to "@anonymous". 

2, change all the NaN in 'designation' to 'Unknown'.

3, drop the row with 'variety' = NaN

3, since this dataset was published, reviewer Kerin O'Keefe has changed her Twitter handle from @kerinokeefe to @kerino. 

In [86]:
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                        0
province                    63
region_1                     0
taster_name                  0
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [87]:
df.designation.fillna(value='Unknown',inplace=True)

verify the 'Unknown' count equal to previous 'designation' count

In [88]:
(df.designation=='Unknown').sum()

37465

In [89]:
df.taster_twitter_handle.fillna(value='@anonymous',inplace=True)

verify the '@anonymous' count equal to previous 'taster_twitter_handle' count

In [90]:
(df.taster_twitter_handle=='@anonymous').sum()

31213

In [91]:
df.dropna(how='any',subset=['variety'],inplace=True)

verify no NaN in 'variety'

In [92]:
df.variety.isna().any()

False

In [93]:
# replace the value !!!
df.taster_twitter_handle.replace(to_replace='@kerinokeefe',\
                                 value='@kerino',inplace=True)
# to_replace 是被换的
# value - 是新的

In [94]:
df.isna().sum()

country                  63
description               0
designation               0
points                    0
price                     0
province                 63
region_1                  0
taster_name               0
taster_twitter_handle     0
title                     0
variety                   0
winery                    0
dtype: int64

Final DataFrame shape

In [95]:
df.shape

(129970, 12)

# Summary functions and maps reference


### pandas provides many simple "summary functions" which restructure the data in some useful way. 

## .describe()

In [96]:
df.describe()

Unnamed: 0,points,price
count,129970.0,129970.0
mean,88.447142,35.338378
std,3.039742,39.577293
min,80.0,4.0
25%,86.0,18.0
50%,88.0,28.0
75%,91.0,40.0
max,100.0,3300.0


In [97]:
df.points.describe()

count    129970.000000
mean         88.447142
std           3.039742
min          80.000000
25%          86.000000
50%          88.000000
75%          91.000000
max         100.000000
Name: points, dtype: float64

This method generates a high-level summary of the attributes of the given column. It is type-aware, meaning that its output changes based on the dtype of the input. The output above only makes sense for numerical data; for string data here's what we get:

In [98]:
df.taster_name.describe()
# 只是numeric data才起作用能够算出mean，Std等等

count         129970
unique            19
top       Roger Voss
freq           32816
Name: taster_name, dtype: object

To see a list of unique values we can use the **unique** function:

In [99]:
df.taster_name.unique()

array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

To see a list of unique values and how often they occur in the dataset, we can use the **value_counts** method:

In [100]:
df.taster_name.value_counts()
# 也可以用groupby

Roger Voss            32816
Michael Schachner     21575
Paul Gregutt          14124
Kerin O’Keefe         11475
Virginie Boone        10866
Joe Czerwinski         7858
Matt Kettmann          6444
Anna Lee C. Iijima     5305
Sean P. Sullivan       5285
Jim Gordon             4244
Anne Krebiehl MW       3844
Lauren Buzzeo          2604
Susan Kostrzewa        1859
Mike DeSimone           546
Jeff Jenssen            520
Alexander Peartree      419
Carrie Dykes            146
Fiona Adams              27
Christina Pickard        13
Name: taster_name, dtype: int64

In [101]:
df.taster_name.count()

129970

In [102]:
(df.taster_name=='Roger Voss').sum()
#！！！！不用len(df[df.taster_name == 'Roger Voss'])

32816

In [103]:
df.groupby('taster_name')['taster_name'].count().nlargest(20)

taster_name
Roger Voss            32816
Michael Schachner     21575
Paul Gregutt          14124
Kerin O’Keefe         11475
Virginie Boone        10866
Joe Czerwinski         7858
Matt Kettmann          6444
Anna Lee C. Iijima     5305
Sean P. Sullivan       5285
Jim Gordon             4244
Anne Krebiehl MW       3844
Lauren Buzzeo          2604
Susan Kostrzewa        1859
Mike DeSimone           546
Jeff Jenssen            520
Alexander Peartree      419
Carrie Dykes            146
Fiona Adams              27
Christina Pickard        13
Name: taster_name, dtype: int64

## Note: How to read Microsoft Excel format file

Find the top 3 correlations based on all the data in this excel file.

In [104]:
xl =pd.ExcelFile('Cancer_Cardio.xlsx')

In [105]:
type(xl)
xl

pandas.io.excel.ExcelFile

<pandas.io.excel.ExcelFile at 0x1299b8eb8>

In [106]:
xl.sheet_names

['Cancer', 'Cardio', 'Smoking']

In [107]:
df1 = xl.parse("Cancer")
df1
# use parse to create dataframe

Unnamed: 0,city,Geocode,cancer
0,Detroit,505,44
1,Ft. Wayne,731,37
2,Pittsburgh,600,50
3,Detroit,507,46
4,Pittsburgh,621,54
5,Ft. Wayne,728,41


In [108]:
df2 = xl.parse("Cardio")
df2

Unnamed: 0,Geocode,cardiovascular
0,505,21
1,731,19
2,600,28
3,507,25
4,621,25
5,728,30


In [109]:
df3 = xl.parse("Smoking")
df3

Unnamed: 0,Geocode,Smoking rate
0,505,25
1,731,31
2,600,33
3,507,27
4,621,34
5,728,36


In [110]:
df=df1.merge(df2,on='Geocode')

In [111]:
df

Unnamed: 0,city,Geocode,cancer,cardiovascular
0,Detroit,505,44,21
1,Ft. Wayne,731,37,19
2,Pittsburgh,600,50,28
3,Detroit,507,46,25
4,Pittsburgh,621,54,25
5,Ft. Wayne,728,41,30


In [112]:
df = df.merge(df3,on='Geocode')
# ==  left_on= , right_on= 

In [113]:
df

Unnamed: 0,city,Geocode,cancer,cardiovascular,Smoking rate
0,Detroit,505,44,21,25
1,Ft. Wayne,731,37,19,31
2,Pittsburgh,600,50,28,33
3,Detroit,507,46,25,27
4,Pittsburgh,621,54,25,34
5,Ft. Wayne,728,41,30,36


In [114]:
df.groupby('city')['cancer'].sum().sort_values(ascending=False)

city
Pittsburgh    104
Detroit        90
Ft. Wayne      78
Name: cancer, dtype: int64

In [115]:
df.corr()
#!!! correlation 

Unnamed: 0,Geocode,cancer,cardiovascular,Smoking rate
Geocode,1.0,-0.456196,0.127813,0.773534
cancer,-0.456196,1.0,0.377006,0.146327
cardiovascular,0.127813,0.377006,1.0,0.627599
Smoking rate,0.773534,0.146327,0.627599,1.0


In [116]:
cor=df.corr()

In [117]:
cor[cor<1].stack()

Geocode         cancer           -0.456196
                cardiovascular    0.127813
                Smoking rate      0.773534
cancer          Geocode          -0.456196
                cardiovascular    0.377006
                Smoking rate      0.146327
cardiovascular  Geocode           0.127813
                cancer            0.377006
                Smoking rate      0.627599
Smoking rate    Geocode           0.773534
                cancer            0.146327
                cardiovascular    0.627599
dtype: float64

In [118]:
cor[cor<1].stack().nlargest(6)

Geocode         Smoking rate      0.773534
Smoking rate    Geocode           0.773534
cardiovascular  Smoking rate      0.627599
Smoking rate    cardiovascular    0.627599
cancer          cardiovascular    0.377006
cardiovascular  cancer            0.377006
dtype: float64

In [119]:
cor[cor<1].stack().nlargest(6)[::2]
# defualt step is 1

Geocode         Smoking rate      0.773534
cardiovascular  Smoking rate      0.627599
cancer          cardiovascular    0.377006
dtype: float64