# Handling Missing Data

We would look at a few convenient methods to deal with Missing Data in pandas:

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({'X':[19,26,np.nan],
                  'Y':[53,np.nan,np.nan],
                  'Z':[11,20,36]})

In [3]:
df

Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,,20
2,,,36


In [None]:
df.shape

In [4]:
# The dropna method removes rows or columns containing np.nan
df.dropna(axis = 0)

Unnamed: 0,X,Y,Z
0,19.0,53.0,11


In [None]:
df

In [5]:
# axis is set to 0 by default
df.dropna(axis=1)

Unnamed: 0,Z
0,11
1,20
2,36


In [7]:
#the threshold argument specifies the number of NON 'NAN' values a row or column must contain to remain unremoved 
df.dropna(axis = 1, thresh=2)

Unnamed: 0,X,Z
0,19.0,11
1,26.0,20
2,,36


In [9]:
df.dropna(thresh=2, axis = 1)

Unnamed: 0,X,Z
0,19.0,11
1,26.0,20
2,,36


In [None]:
df.fillna?

In [10]:
# we can use the fillna methos to replace np.nan with another value
df.fillna(value= 60)

Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,60.0,20
2,60.0,60.0,36


In [13]:
df.fillna(value= {"X":30,"Y":45,"Y":50})

Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,50.0,20
2,30.0,50.0,36


In [14]:
df[['X']].mean()

X    22.5
dtype: float64

In [15]:
df[['X']].fillna(df[['X']].mean())

Unnamed: 0,X
0,19.0
1,26.0
2,22.5


In [None]:
df['X'] =df['X'].fillna(value=df['X'].mean())
df

In [18]:
# fillna can also take argument 'method' by which we specify how the nan value should be replaced. ffill and bfill are possible options
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,0.0,20
2,0.0,0.0,36


In [19]:
df.fillna(method='bfill')

  df.fillna(method='bfill')


Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,0.0,20
2,0.0,0.0,36


In [17]:
df.fillna(0, inplace=True)
df

Unnamed: 0,X,Y,Z
0,19.0,53.0,11
1,26.0,0.0,20
2,0.0,0.0,36


# Aggregating Data with Groupby

The groupby method allows you to group rows of data together and call aggregate functions

In [20]:
# Create dataframe
import pandas as pd
data = {'Company':['GOOGLE','GOOGLE','MICROSOFT','FACEBOOK','FACEBOOK','AMAZON','AMAZON'],
       'Person':['Sammy','Charlie','Amy','Riley','Bobby','Danny','Freddy'],
       'Sales':[200,200,340,124,243,350,267]}
comp_df= pd.DataFrame(data)

In [21]:
comp_df

Unnamed: 0,Company,Person,Sales
0,GOOGLE,Sammy,200
1,GOOGLE,Charlie,200
2,MICROSOFT,Amy,340
3,FACEBOOK,Riley,124
4,FACEBOOK,Bobby,243
5,AMAZON,Danny,350
6,AMAZON,Freddy,267


In [23]:
comp_df.head(3)

Unnamed: 0,Company,Person,Sales
0,GOOGLE,Sammy,200
1,GOOGLE,Charlie,200
2,MICROSOFT,Amy,340


In [24]:
comp_df.tail(2)

Unnamed: 0,Company,Person,Sales
5,AMAZON,Danny,350
6,AMAZON,Freddy,267


In [22]:
comp_df.describe()

Unnamed: 0,Sales
count,7.0
mean,246.285714
std,80.847004
min,124.0
25%,200.0
50%,243.0
75%,303.5
max,350.0


** Now you can use the .groupby() method to group rows together based on column name. For example let's group the data based on Company. This will create a DataFrameGroupBy object:**

In [25]:
list(comp_df.groupby('Company'))

[('AMAZON',
    Company  Person  Sales
  5  AMAZON   Danny    350
  6  AMAZON  Freddy    267),
 ('FACEBOOK',
      Company Person  Sales
  3  FACEBOOK  Riley    124
  4  FACEBOOK  Bobby    243),
 ('GOOGLE',
    Company   Person  Sales
  0  GOOGLE    Sammy    200
  1  GOOGLE  Charlie    200),
 ('MICROSOFT',
       Company Person  Sales
  2  MICROSOFT    Amy    340)]

In [26]:
by_comp_name = comp_df.groupby("Company")

In [27]:
by_comp_name.groups

{'AMAZON': [5, 6], 'FACEBOOK': [3, 4], 'GOOGLE': [0, 1], 'MICROSOFT': [2]}

In [28]:
by_comp_name.get_group('FACEBOOK')

Unnamed: 0,Company,Person,Sales
3,FACEBOOK,Riley,124
4,FACEBOOK,Bobby,243


In [29]:
by_comp_name

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D3DC3C6A50>

In [30]:
by_comp_name.mean(numeric_only = True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
AMAZON,308.5
FACEBOOK,183.5
GOOGLE,200.0
MICROSOFT,340.0


In [31]:
by_comp_name.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
AMAZON,2,2
FACEBOOK,2,2
GOOGLE,2,2
MICROSOFT,1,1


In [32]:
by_comp_name['Person'].count()

Company
AMAZON       2
FACEBOOK     2
GOOGLE       2
MICROSOFT    1
Name: Person, dtype: int64

In [33]:
comp_df.groupby('Company').mean()

TypeError: agg function failed [how->mean,dtype->object]

In [34]:
by_comp_name.sum()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
AMAZON,DannyFreddy,617
FACEBOOK,RileyBobby,367
GOOGLE,SammyCharlie,400
MICROSOFT,Amy,340


In [35]:
by_comp_name.get_group('AMAZON')['Sales'].sum()

617

In [None]:
list(by_comp_name)

In [None]:
by_comp_name.head(2)

In [None]:
by_comp_name

In [None]:
type(comp_df)

In [49]:
comp_series=comp_df.groupby('Company')[['Sales']].min

In [50]:
comp_series

<bound method GroupBy.min of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D3E02835F0>>

In [37]:
by_comp_name.max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
AMAZON,Freddy,350
FACEBOOK,Riley,243
GOOGLE,Sammy,200
MICROSOFT,Amy,340


In [None]:
# the describe method gives us summary statistics of all the numeric columns in our dataset
by_comp_name.describe()

# Merging, Joining, and Concatenating

There are 3 main ways of combining DataFrames together: Merging, Joining and Concatenating. We will discuss these 3 methods with examples.


In [52]:
import pandas as pd

In [53]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [54]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[3, 5, 6, 7]) 

In [58]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[3, 9, 10, 11])

In [55]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [56]:
df2

Unnamed: 0,A,B,C,D
3,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [59]:
df3

Unnamed: 0,A,B,C,D
3,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


## Concatenation

Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use **pd.concat** and pass in a list of DataFrames to concatenate together:

In [60]:
#here we are just glueing 3 separate dataframes together
pd.concat([df1,df2,df3], axis =0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
3,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
3,A8,B8,C8,D8
9,A9,B9,C9,D9


In [61]:
#remember that axis is set to 0 by default
pd.concat([df1,df2,df3],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,A4,B4,C4,D4,A8,B8,C8,D8
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
9,,,,,,,,,A9,B9,C9,D9
10,,,,,,,,,A10,B10,C10,D10
11,,,,,,,,,A11,B11,C11,D11


_____
## Merging DataFrame

In [62]:
#in merging we are combining dataframes based on a key or an index

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})

In [63]:
df1

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [64]:
df2

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K4,C3,D3


In [None]:
pd.merge?

In [65]:
# the on argument specifies the key while the how specifies the method for merging
pd.merge(left= df1,right = df2,how='right',on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K4,,,C3,D3


In [None]:
left_df = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right_df = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

In [None]:
#we can also merge dataframes on multiple keys
pd.merge(left_df, right_df, on=['key1', 'key2'], how = 'inner')

In [None]:
pd.merge(left_df, right_df, how='outer', on=['key1', 'key2'])

In [None]:
pd.merge(left_df, right_df, how='right', on=['key1', 'key2'])

In [None]:
pd.merge(left_df, right_df, how='left', on=['key1', 'key2'])

## Joining
Joining is a convenient method for combining the columns of two potentially differently-indexed DataFrames into a single result DataFrame.

In [None]:
#the join method cobines columns based on a key

left_df = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right_df = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [None]:
left_df

In [None]:
right_df

In [None]:
left_df.join?

In [None]:
left_df.join(right_df)

In [None]:
left_df.join(right_df, how='right')

There are other data analysis operations you can perform with but dont fall under any particular category

In [None]:
comp_df

In [None]:
# here we are returning all the unique values in sales column
comp_df['Company'].unique()

In [None]:
# we can also return the number of unique values in a column using nunique
comp_df['Company'].nunique()

In [None]:
# we use the value_counts method to know the count of occurence of each unique value in a column
comp_df['Company'].value_counts()

In [None]:
spe_comp_df = comp_df[(comp_df['Sales']>300) ]
spe_comp_df

In [None]:
#Select from DataFrame using criteria from multiple columns
spe_comp_df = comp_df[(comp_df['Sales']>300) & (comp_df['Company']=='AMAZON')] 

In [None]:
spe_comp_df

In [None]:
# we can use the sort_values method to sort alphabetically or to sort in ascending and descending orders
comp_df.sort_values(by='Company')

In [None]:
comp_df.sort_values(by='Company', ascending = False)

In [None]:
comp_df.sort_values(by=[ 'Company', 'Sales'], ascending = False)

In [None]:
comp_df.sort_values(by=[ 'Sales', 'Company'], ascending = False)

# Data Input and Output

We will now consider the code for getting input and output data with pandas. pandas can read a variety of file types using its pd.read_ methods. Let's take a look at the most common data types:

In [None]:
input_df= pd.read_csv('Lemonade.csv')

In [None]:
input_df.head(10)

In [None]:
# here we are saving the csv format of the specified dataframe to path
input_df.to_csv('lemonade_output1.csv',index=False)

In [None]:
new_input_df= pd.read_excel('Lemonade.xlsx')
new_input_df.head()

In [None]:
new_input_df.dropna(axis=1).head()

In [None]:
# pandas read method can also be used to read a table from a website with known URL
html_df = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')

In [None]:
html_df[0].head()