#     Missing Data

https://www.tutorialspoint.com/python_pandas/python_pandas_missing_data.htm

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])
print(df,'\n')
# Using reindexing,  create  a DataFrame with missing values
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)

        one       two     three
a -0.917498 -0.756065  0.046795
c -2.055468 -0.227046  0.047230
e  2.841160  0.776425  1.386045
f -1.401719  0.185826 -0.635055
h -1.004385 -1.651094  0.293613 

        one       two     three
a -0.917498 -0.756065  0.046795
b       NaN       NaN       NaN
c -2.055468 -0.227046  0.047230
d       NaN       NaN       NaN
e  2.841160  0.776425  1.386045
f -1.401719  0.185826 -0.635055
g       NaN       NaN       NaN
h -1.004385 -1.651094  0.293613


In [3]:
print (df['one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [4]:
 print (df.loc['a']>0)

one      False
two      False
three     True
Name: a, dtype: bool


In [5]:
print (df['one'].notnull())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [6]:

print (df['one'].sum())

-2.5379108416548286


In [7]:
 #   When summing data, NA will be treated as Zero
 #   If the data are all NA, then the result will be NA
print (df.loc['b'].sum )

<bound method Series.sum of one     NaN
two     NaN
three   NaN
Name: b, dtype: float64>


# Cleaning / Filling Missing Data

Pandas provides various methods for cleaning the missing values. 

The fillna function can “fill in” NA values with non-null data in a couple of ways. 

Fill NA Forward and Backward

Using the concepts of filling discussed in the ReIndexing Chapter we will fill the missing values.

 
1. 	 pad/fill: Fill methods Forward
2. 	 bfill/backfill: Fill methods Backward

In [8]:
 df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one',
'two', 'three'])
print (df,'\n')
df = df.reindex(['a', 'b', 'c'])
print (df,'\n')
print ("NaN replaced with '0':")
print (df.fillna(0))

        one       two     three
a  0.211518 -1.125271  1.124714
c  0.198795  0.751502 -1.003588
e -0.075590  0.646696  0.160152 

        one       two     three
a  0.211518 -1.125271  1.124714
b       NaN       NaN       NaN
c  0.198795  0.751502 -1.003588 

NaN replaced with '0':
        one       two     three
a  0.211518 -1.125271  1.124714
b  0.000000  0.000000  0.000000
c  0.198795  0.751502 -1.003588


In [9]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])
print (df,'\n')
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print (df,'\n')
print (df.fillna(method='pad') )

        one       two     three
a -0.836439  1.035451 -1.445191
c -0.532154 -0.157440 -0.348806
e  2.113150 -0.805061  1.633832
f -0.054621 -0.132664 -0.766857
h -1.927972 -1.100825  0.283997 

        one       two     three
a -0.836439  1.035451 -1.445191
b       NaN       NaN       NaN
c -0.532154 -0.157440 -0.348806
d       NaN       NaN       NaN
e  2.113150 -0.805061  1.633832
f -0.054621 -0.132664 -0.766857
g       NaN       NaN       NaN
h -1.927972 -1.100825  0.283997 

        one       two     three
a -0.836439  1.035451 -1.445191
b -0.836439  1.035451 -1.445191
c -0.532154 -0.157440 -0.348806
d -0.532154 -0.157440 -0.348806
e  2.113150 -0.805061  1.633832
f -0.054621 -0.132664 -0.766857
g -0.054621 -0.132664 -0.766857
h -1.927972 -1.100825  0.283997


In [10]:
 df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])
print (df )
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print (df.fillna(method='backfill'))

        one       two     three
a  1.731050  0.294265 -1.675840
c  0.146366 -0.003691  0.450147
e -1.531766 -1.503317  0.714540
f -0.638858  0.147345  0.295422
h  1.180059 -0.062120  0.250216
        one       two     three
a  1.731050  0.294265 -1.675840
b  0.146366 -0.003691  0.450147
c  0.146366 -0.003691  0.450147
d -1.531766 -1.503317  0.714540
e -1.531766 -1.503317  0.714540
f -0.638858  0.147345  0.295422
g  1.180059 -0.062120  0.250216
h  1.180059 -0.062120  0.250216


# Drop Missing Values

If you want to simply exclude the missing values, then use the dropna function along with the axis argument. By default, axis=0, i.e., along row, which means that if any value within a row is NA then the whole row is excluded.

In [11]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])
print(df)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)
print (df.dropna())

        one       two     three
a -0.972392  1.002977 -0.606287
c  0.458623  0.897217 -0.405844
e -0.785076 -1.286895  1.208165
f -1.502353  0.618487 -0.771985
h -0.102073  1.056875  1.451390
        one       two     three
a -0.972392  1.002977 -0.606287
b       NaN       NaN       NaN
c  0.458623  0.897217 -0.405844
d       NaN       NaN       NaN
e -0.785076 -1.286895  1.208165
f -1.502353  0.618487 -0.771985
g       NaN       NaN       NaN
h -0.102073  1.056875  1.451390
        one       two     three
a -0.972392  1.002977 -0.606287
c  0.458623  0.897217 -0.405844
e -0.785076 -1.286895  1.208165
f -1.502353  0.618487 -0.771985
h -0.102073  1.056875  1.451390


In [12]:
 df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)
print (df.dropna(axis=1))

        one       two     three
a -0.407357  1.069346  0.841569
b       NaN       NaN       NaN
c  0.791218 -0.930454 -0.247777
d       NaN       NaN       NaN
e -0.413518  0.509508 -0.015421
f  0.227857 -0.262540 -0.604816
g       NaN       NaN       NaN
h -0.839743 -0.791146 -0.846235
Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


In [13]:
#replace a generic value with some specific value
df = pd.DataFrame({'one':[10,20,30,40,50,2000], 'two':[1000,0,30,40,50,60]})
print(df)
print (df.replace({1000:10,2000:60}) )

    one   two
0    10  1000
1    20     0
2    30    30
3    40    40
4    50    50
5  2000    60
   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60


# GroupBy
https://www.tutorialspoint.com/python_pandas/python_pandas_groupby.htm

Any groupby operation involves one of the following operations on the original object. 

   1  Splitting the Object

   2  Applying a function

   3  Combining the results

In many situations, we split the data into sets and we apply some functionality on each subset.
In the apply functionality, we can perform the following operations −

    1 Aggregation − computing a summary statistic

    2 Transformation − perform some group-specific operation

    3 Filtration − discarding the data with some condition

In [14]:
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
   'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
   'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
   'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
   'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
print (df )

      Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
2   Devils     2  2014     863
3   Devils     3  2015     673
4    Kings     3  2014     741
5    kings     4  2015     812
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016     694
9   Royals     4  2014     701
10  Royals     1  2015     804
11  Riders     2  2017     690


In [15]:
print (df.groupby('Team'),'\n')
print (df.groupby('Team').groups,'\n')
print (df.groupby(['Team','Year']).groups,'\n')#Group by with multiple columns
#print (df.groupby(['Team','Year']).groups('Royals', 2014),'\n')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000289F69A02B0> 

{'Devils': Int64Index([2, 3], dtype='int64'), 'Kings': Int64Index([4, 6, 7], dtype='int64'), 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'), 'Royals': Int64Index([9, 10], dtype='int64'), 'kings': Int64Index([5], dtype='int64')} 

{('Devils', 2014): Int64Index([2], dtype='int64'), ('Devils', 2015): Int64Index([3], dtype='int64'), ('Kings', 2014): Int64Index([4], dtype='int64'), ('Kings', 2016): Int64Index([6], dtype='int64'), ('Kings', 2017): Int64Index([7], dtype='int64'), ('Riders', 2014): Int64Index([0], dtype='int64'), ('Riders', 2015): Int64Index([1], dtype='int64'), ('Riders', 2016): Int64Index([8], dtype='int64'), ('Riders', 2017): Int64Index([11], dtype='int64'), ('Royals', 2014): Int64Index([9], dtype='int64'), ('Royals', 2015): Int64Index([10], dtype='int64'), ('kings', 2015): Int64Index([5], dtype='int64')} 



# Iterating through Groups

With the groupby object in hand, we can iterate through the object similar to itertools.obj.

In [16]:
grouped = df.groupby('Year')
i=1;
print(grouped)
for n,g in grouped:
  print('[',i,']:  ',n)
  i=i+1
  
  print (g,' \n')
print (" \n \n grouped.get_group(2017) \n----------------------\n",grouped.get_group(2017))    #select a single group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000289F6A10438>
[ 1 ]:   2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701  

[ 2 ]:   2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804  

[ 3 ]:   2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694  

[ 4 ]:   2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690  

 
 
 grouped.get_group(2017) 
----------------------
       Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690


# Aggregations

An aggregated function returns a single aggregated value for each group. 

Once the group by object is created, several aggregation operations can be performed on the grouped data.


In [17]:
grouped = df.groupby('Year')
print (grouped['Points'].agg(np.mean) )# mean of points for  the year
print (grouped.agg(np.size)) #size of each group i

Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64
      Team  Rank  Points
Year                    
2014     4     4       4
2015     4     4       4
2016     2     2       2
2017     2     2       2


In [18]:
grouped = df.groupby('Team')
print (grouped['Points'].agg([np.sum, np.mean, np.std]))

         sum        mean         std
Team                                
Devils  1536  768.000000  134.350288
Kings   2285  761.666667   24.006943
Riders  3049  762.250000   88.567771
Royals  1505  752.500000   72.831998
kings    812  812.000000         NaN


# Transformations

Transformation on a group or a column returns an object that is indexed the same size of that is being grouped. Thus, the transform should return a result that is the same size as that of a group chunk.

In [19]:
grouped = df.groupby('Team')
score = lambda x: (x - x.mean()) / x.std()*10
print (grouped.transform(score))

         Rank       Year     Points
0  -15.000000 -11.618950  12.843272
1    5.000000  -3.872983   3.020286
2   -7.071068  -7.071068   7.071068
3    7.071068   7.071068  -7.071068
4   11.547005 -10.910895  -8.608621
5         NaN        NaN        NaN
6   -5.773503   2.182179  -2.360428
7   -5.773503   8.728716  10.969049
8    5.000000   3.872983  -7.705963
9    7.071068  -7.071068  -7.071068
10  -7.071068   7.071068   7.071068
11   5.000000  11.618950  -8.157595


# Filtration

Filtration filters the data on a defined criteria and returns the subset of data. 

In [20]:
#I  return the teams which have participated three or more times in IPL.

print (df.groupby('Team').filter(lambda x: len(x) >= 3))

      Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
4    Kings     3  2014     741
6    Kings     1  2016     756
7    Kings     1  2017     788
8   Riders     2  2016     694
11  Riders     2  2017     690


# Merging/Joining

https://www.tutorialspoint.com/python_pandas/python_pandas_merging_joining.htm
 
<b>pd.merge( left, right,  how='inner',  on=None,  left_on=None,  right_on=None,
left_index=False,  right_index=False,  sort=True)</b>

<b>left −</b> A DataFrame object.

<b>right −</b> Another DataFrame object.

<b>on − </b>Columns (names) to join on. Must be found in both the left and right DataFrame objects.
left_on −</b> Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.

<b>right_on − </b>Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.

<b>left_index − </b>If True, use the index (row labels) from the left DataFrame as its join key(s). In case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys from the right DataFrame.

<b>right_index − </b>Same usage as left_index for the right DataFrame.

<b>how −</b> One of 'left', 'right', 'outer', 'inner'. Defaults to inner. Each method has been described below.

<b>sort − </b>Sort the result DataFrame by the join keys in lexicographical order. Defaults to True, setting to False will improve the performance substantially in many cases.


In [21]:
left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame(
   {'id':[1,2,3,4,5],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5']})
print (left,'\n\n\n',  right)

   id    Name subject_id
0   1    Alex       sub1
1   2     Amy       sub2
2   3   Allen       sub4
3   4   Alice       sub6
4   5  Ayoung       sub5 


    id   Name subject_id
0   1  Billy       sub2
1   2  Brian       sub4
2   3   Bran       sub3
3   4  Bryce       sub6
4   5  Betty       sub5


In [22]:
#Merge Two DataFrames on a 
print (pd.merge(left,right,on='id'))

   id  Name_x subject_id_x Name_y subject_id_y
0   1    Alex         sub1  Billy         sub2
1   2     Amy         sub2  Brian         sub4
2   3   Allen         sub4   Bran         sub3
3   4   Alice         sub6  Bryce         sub6
4   5  Ayoung         sub5  Betty         sub5


In [23]:
# Merge Two DataFrames on Multiple Keys
print (pd.merge(left,right,on=['id','subject_id']))

   id  Name_x subject_id Name_y
0   4   Alice       sub6  Bryce
1   5  Ayoung       sub5  Betty


# Merge Using 'how' Argument

The how argument to merge specifies how to determine which keys are to be included in the resulting table. If a key combination does not appear in either the left or the right tables, the values in the joined table will be NA.

Here is a summary of the how options and their SQL equivalent names  

<b>Merge Method 	SQL Equivalent 	      Description</b>

   left 	        LEFT OUTER JOIN 	Use keys from left object
   
right       	RIGHT OUTER JOIN 	Use keys from right object

outer       	FULL OUTER JOIN 	Use union of keys

inner       	INNER JOIN 	Use     intersection of keys

