# command List

In [7]:
# Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

# Load the 'mtcars' sample dataset through pydataset's data() helper.  The
# outputs pasted later in this notebook show one row per car model.
df = data('mtcars')

## Group by: split-apply-combine
https://pandas.pydata.org/docs/user_guide/groupby.html

-  Splitting the data into groups based on some criteria.
-  Applying a function to each group independently.
-  Combining the results into a data structure.
<br> Out of these, the split step is the most straightforward. In fact, in many situations we may wish to split the data set into groups and do something with those groups. In the apply step, we might wish to do one of the following:
-  Aggregation: compute a summary statistic (or statistics) for each group. Some examples:
   -  Compute group sums or means.
   -  Compute group sizes / counts.
-  Transformation: perform some group-specific computations and return a like-indexed object. Some examples:
   -  Standardize data (zscore) within a group.
   -  Filling NAs within groups with a value derived from each group.
-  Filtration: discard some groups, according to a group-wise computation that evaluates True or False. Some examples:
   -  Discard data that belongs to groups with only a few members.
   -  Filter out data based on the group sum or mean.

In [21]:
# Split the rows of df into groups by the value of the 'gear' column.
# Grouping works along axis 0, i.e. the rows (not the columns); that is the
# default, and the explicit `axis` keyword is deprecated in recent pandas
# releases, so it is left out here.
grouped = df.groupby('gear')
grouped.first()  # first occurring value in each group
grouped.last()   # last occurring value in each group
grouped.sum()    # sum of the values in each group

# A scalar key ('gear') is used instead of a one-element list (['gear']):
# with a length-1 list, newer pandas versions expect tuple group names such
# as (3,), so get_group(3) would warn or fail.
df.groupby('gear', sort=True).sum()
df.groupby('gear').get_group(3)  # only the rows whose gear == 3
df.groupby('gear').groups        # mapping: group key -> row labels of that group
len(grouped)                     # number of groups
grouped['gear']                  # select one column -> SeriesGroupBy (see output below)

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001BD806D8E50>

# Per-gear column sums; dropna is passed explicitly as True.
df.groupby('gear', dropna=True).sum()

# Iterating a GroupBy yields (group key, sub-frame) pairs.
for key, members in grouped:
    print(key, members, sep="\n")

# With two grouping columns there is one pair per (gear, am) combination.
for key, members in df.groupby(['gear', 'am']):
    print(key, members, sep="\n")

In [25]:
# Group on two columns; with as_index=False the keys 'gear' and 'am' come back
# as ordinary columns of the result (see the output below).
df.groupby(by=['gear', 'am'], as_index=False).sum()

Unnamed: 0,gear,am,mpg,cyl,disp,hp,drat,wt,qsec,vs,carb
0,3,0,241.6,112,4894.5,2642,46.99,58.389,265.38,3,40
1,4,0,84.2,20,622.7,403,15.45,13.22,80.1,4,12
2,4,1,210.2,36,853.5,671,33.07,18.18,147.48,6,16
3,5,1,106.9,30,1012.4,978,19.58,13.163,78.2,1,22


## Built-in aggregations
-  any() = Compute whether any of the values in the groups are truthy
-  all() = Compute whether all of the values in the groups are truthy
-  count() = Compute the number of non-NA values in the groups
-  cov() = Compute the covariance of the groups
-  first() = Compute the first occurring value in each group
-  idxmax() = Compute the index of the maximum value in each group
-  idxmin() * = Compute the index of the minimum value in each group
-  last() = Compute the last occurring value in each group
-  max() = Compute the maximum value in each group
-  mean() = Compute the mean of each group
-  median() = Compute the median of each group
-  min() = Compute the minimum value in each group
-  nunique() = Compute the number of unique values in each group
-  prod() = Compute the product of the values in each group
-  quantile() = Compute a given quantile of the values in each group
-  sem() = Compute the standard error of the mean of the values in each group
-  size() = Compute the number of values in each group
-  skew() * = Compute the skew of the values in each group
-  std() = Compute the standard deviation of the values in each group
-  sum() = Compute the sum of the values in each group
-  var() = Compute the variance of the values in each group

## Aggregate Method
The aggregate() method can accept many different types of inputs. This section details using string aliases for various GroupBy methods; other inputs are detailed in the sections below.
Any reduction method that pandas implements can be passed as a string to aggregate(). Users are encouraged to use the shorthand, agg. It will operate as if the corresponding method was called.

# String aliases: agg() is the shorthand for aggregate(), so these two lines
# are equivalent.
grouped.aggregate('sum')
grouped.agg('sum')

# Keep the grouping keys as regular columns: either ask for it up front with
# as_index=False, or reset the index afterwards.
df.groupby(['gear', 'am'], as_index=False).agg('sum')
df.groupby(['gear', 'am']).agg('sum').reset_index()  # same as above

# A list of aliases applies every function and yields one output column each.
grouped['mpg'].agg(['sum', 'mean', 'min', 'max'])
grouped[['mpg', 'wt']].agg(['sum', 'mean', 'min', 'max'])

# The generated column labels are the function names; rename() relabels them.
grouped['mpg'].agg(['sum', 'mean', 'min', 'max']).rename(
    columns={'sum': 'SUMmpg', 'mean': 'MEANmpg', 'min': 'MINIMUM', 'max': 'ABC'}
)
grouped[['mpg', 'wt']].agg(['sum', 'mean', 'min', 'max']).rename(
    columns={'sum': 'SUM', 'mean': 'MEAN', 'min': 'MIN', 'max': 'MAX'}
)

# Aggregation with user-defined functions
df.groupby(['gear'])['mpg'].agg(lambda x: x.astype(float).sum())
grouped[['wt', 'mpg']].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()])
# Anonymous functions are labelled '<lambda_0>', '<lambda_1>', ... in the
# result.  The angle brackets are part of the label, so the rename keys must
# include them ('lambda_0' without brackets matches nothing).
grouped[['wt', 'mpg']].agg(
    [lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()]
).rename(columns={'<lambda_0>': 'Max_Min', '<lambda_1>': 'Med_Mean'})

### Named aggregation
# keyword = output column name, value = (input column, aggregation function).
grouped.agg(
    min_mpg=pd.NamedAgg(column="mpg", aggfunc="min"),
    max_wt=pd.NamedAgg(column="wt", aggfunc="max"),
    average_hp=pd.NamedAgg(column="hp", aggfunc="mean"),
)
# Plain (column, aggfunc) tuples are the short form of pd.NamedAgg.
grouped.agg(min_mpg=("mpg", "min"), max_wt=("wt", "max"), average_hp=("hp", "mean"))

# Series: only one input column, so each keyword maps straight to a function.
grouped.mpg.agg(min_mpg='min', max_mpg='max')

# Dict form: a different aggregation per column (ddof=1 -> sample std dev).
grouped.agg({"mpg": "sum", "wt": lambda x: np.std(x, ddof=1)})

In [61]:
###  Transformation
# Re-bind `grouped` to the 'mpg' column only, still split by gear.
grouped = df.groupby(by="gear")["mpg"]
grouped
# Transformations return a result indexed like the original rows.
grouped.cumsum()  # cumulative sum within each group
grouped.diff()    # difference between adjacent values within each group (output below)


Mazda RX4               NaN
Mazda RX4 Wag           0.0
Datsun 710              1.8
Hornet 4 Drive          NaN
Hornet Sportabout      -2.7
Valiant                -0.6
Duster 360             -3.8
Merc 240D               1.6
Merc 230               -1.6
Merc 280               -3.6
Merc 280C              -1.4
Merc 450SE              2.1
Merc 450SL              0.9
Merc 450SLC            -2.1
Cadillac Fleetwood     -4.8
Lincoln Continental     0.0
Chrysler Imperial       4.3
Fiat 128               14.6
Honda Civic            -2.0
Toyota Corolla          3.5
Toyota Corona           6.8
Dodge Challenger       -6.0
AMC Javelin            -0.3
Camaro Z28             -1.9
Pontiac Firebird        5.9
Fiat X1-9              -6.6
Porsche 914-2           NaN
Lotus Europa            4.4
Ford Pantera L        -14.6
Ferrari Dino            3.9
Maserati Bora          -4.7
Volvo 142E             -5.9
Name: mpg, dtype: float64

-  bfill() = Back fill NA values within each group
-  cumcount() = Compute the cumulative count within each group
-  cummax() = Compute the cumulative max within each group
-  cummin() = Compute the cumulative min within each group
-  cumprod() = Compute the cumulative product within each group
-  cumsum() = Compute the cumulative sum within each group
-  diff() = Compute the difference between adjacent values within each group
-  ffill() = Forward fill NA values within each group
-  fillna() = Fill NA values within each group
-  pct_change() = Compute the percent change between adjacent values within each group
-  rank() =  Compute the rank of each value within each group
-  shift() = Shift values up or down within each group

In [70]:
# 'mpg' values split by gear.
grouped = df.groupby(by='gear')['mpg']
grouped.describe()
# transform('sum') gives every row the total of its own group, keeping the
# original row labels (see the output below).
grouped.transform('sum')

Mazda RX4              294.4
Mazda RX4 Wag          294.4
Datsun 710             294.4
Hornet 4 Drive         241.6
Hornet Sportabout      241.6
Valiant                241.6
Duster 360             241.6
Merc 240D              294.4
Merc 230               294.4
Merc 280               294.4
Merc 280C              294.4
Merc 450SE             241.6
Merc 450SL             241.6
Merc 450SLC            241.6
Cadillac Fleetwood     241.6
Lincoln Continental    241.6
Chrysler Imperial      241.6
Fiat 128               294.4
Honda Civic            294.4
Toyota Corolla         294.4
Toyota Corona          241.6
Dodge Challenger       241.6
AMC Javelin            241.6
Camaro Z28             241.6
Pontiac Firebird       241.6
Fiat X1-9              294.4
Porsche 914-2          106.9
Lotus Europa           106.9
Ford Pantera L         106.9
Ferrari Dino           106.9
Maserati Bora          106.9
Volvo 142E             294.4
Name: mpg, dtype: float64

## Pandas Options
https://pandas.pydata.org/docs/user_guide/options.html
get_option()
set_option()
reset_option()
describe_option()
All of these follow re.search style: the option name may be any unambiguous substring of the full name.


In [5]:
#pd.describe_option()

In [1]:
import pandas as pd

# Read the current limit, raise it to 999, then read it back to confirm.
# get_option()/set_option() are the function-call form of the
# pd.options.display.* attribute access.
pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 999)
pd.get_option("display.max_rows")

999

- display.chop_threshold : float or None  #all float values < x will be displayed as 0
- display.max_colwidth : int or None =  The maximum width in characters of a column in the repr of a pandas data structure. When the column overflows, a "..." placeholder is embedded in the output. A 'None' value means unlimited.
- display.max_rows : int =  If max_rows is exceeded, switch to truncate view. Depending on `large_repr`, objects are either centrally truncated or printed as  a summary view. 'None' value means unlimited.
- display.max_columns : int =  If max_cols is exceeded, switch to truncate view. Depending on  `large_repr`, objects are either centrally truncated or printed as  a summary view. 'None' value means unlimited.
- display.colheader_justify : 'left'/'right' =    Controls the justification of column headers. used by DataFrameFormatter.
- display.date_dayfirst : boolean =     When True, prints and parses dates with the day first, eg 20/01/2005
- display.expand_frame_repr : boolean =  Whether to print out the full DataFrame repr for wide DataFrames across  multiple lines, `max_columns` is still respected, but the output will wrap-around across multiple "pages" if its width exceeds `display.width`.    [default: True] [currently: True]
- display.max_categories : int =  This sets the maximum number of categories pandas should output when printing out a `Categorical` or a Series of dtype "category".   [default: 8] [currently: 8]
- display.max_info_columns : int =    max_info_columns is used in DataFrame.info method to decide if  per column information will be printed.
-  display.min_rows : int =  The numbers of rows to show in a truncated view (when `max_rows` is exceeded). Ignored when `max_rows` is set to None or 0. When set to  None, follows the value of `max_rows`.  [default: 10] [currently: 10]
-  display.precision : int = Floating point output precision in terms of number of places after the   decimal, for regular formatting as well as scientific notation. Similar  to ``precision`` in :meth:`numpy.set_printoptions`.  [default: 6] [currently: 6]
-  display.width : int =  Width of the display in characters. In case python/IPython is running in a terminal this can be set to None and pandas will correctly auto-detect   the width.  Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a  terminal and hence it is not possible to correctly detect the width.  [default: 80] [currently: 80]

# Startup Options
<br> Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at:
-  $IPYTHONDIR/profile_default/startup
<br>An example startup script for pandas is displayed below:
-  import pandas as pd
-  pd.set_option("display.max_rows", 999)
-  pd.set_option("display.precision", 5)