Python and Science - https://github.com/egalli64/pysci

Kaggle Courses - Pandas - https://www.kaggle.com/learn/pandas

Grouping and Sorting - https://www.kaggle.com/code/residentmario/grouping-and-sorting

In [44]:
# Setup /1: only pandas is used here
import pandas as pd

In [None]:
# Setup /2: build the small wine-review data frame that all the examples below work on.
# Each keyword is a column name, each list holds the nine values of that column;
# the fifth review has no winery (None) on purpose.

reviews = pd.DataFrame(dict(
    country=["Italy", "Portugal", "US", "Italy", "Canada", "France", "Italy", "Australia", "New Zealand"],
    description=["A", "B tropical", "C", "D", "E", "F", "E fruity", "H", "I"],
    price=[35, 41, 34, 18, 27, 32, 43, 22, 12],
    winery=["A 21", "B 34", "C 54", "D 21", None, "F 43", "G 44", "H 11", "I 32"],
    points=[85, 88, 87, 92, 81, 95, 84, 97, 80],
    variety=["a", "b", "c", "d", "e", "f", "g", "h", "i"],
    title=["a a", "b b", "c c", "d d", "e e", "f f", "g g", "h h", "i i"],
    province=["A", "AP", "AUSA", "B", "A", "A", "A", "A", "ANZ"],
))

# the frame has only nine rows, so asking for ten shows all of it
reviews.head(n=10)

# Groupwise analysis

groupby() - group rows in the data frame

In [None]:
# Split the reviews by country, then count the (non-null) country values in every group
reviews.groupby("country")["country"].count()

# value_counts() is a shortcut for this common pattern (it orders by frequency instead of by country)
# reviews.country.value_counts()

In [None]:
# Bucket the rows by score (points) and report the lowest price found in each bucket
reviews.groupby("points")["price"].min()

In [None]:
# group by winery, get the title of the first row in each group
# - only the title column is selected before apply(): the function never touches the grouping
#   column, so pandas 2.2+ does not emit its "apply operated on the grouping columns" warning
# - the review without a winery (None) is left out, since groupby() drops NA keys by default
reviews.groupby('winery')['title'].apply(lambda titles: titles.iloc[0])

In [None]:
# group by country + province, for each group get the row with the highest points

# The non-key columns are selected explicitly, so apply() never operates on the grouping columns:
# this avoids the deprecation warning raised by pandas 2.2+ and leaves country / province
# only in the resulting MultiIndex, with the other columns in their original order.
# (reviews.columns.difference(group_keys) would work as well, but it sorts the columns by name)
group_keys = ['country', 'province']
value_columns = [column for column in reviews.columns if column not in group_keys]
reviews.groupby(group_keys)[value_columns].apply(lambda df: df.loc[df.points.idxmax()])

# alt: drop the key columns from the selected row inside apply
# reviews.groupby(['country', 'province']).apply(
#    lambda df: df.loc[df.points.idxmax()][df.columns.difference(['country', 'province'])]
# )

# alt: country / province as normal columns
# reviews.loc[reviews.groupby(['country', 'province'])['points'].idxmax()]

agg() - apply one or more functions to the (grouped) data frame in a single call

In [None]:
# For every country look only at the price column and compute three values:
# how many rows there are (len), the lowest price and the highest price
reviews.groupby(['country'])['price'].agg([len, 'min', 'max'])

Multi-indexes

In [None]:
# Grouping by more than one column produces an index with one level per key (a MultiIndex)
countries = reviews.groupby(by=['country', 'province'])['description'].agg([len])
countries.index

In [None]:
# Resetting the index turns the MultiIndex levels back into ordinary columns
countries = countries.reset_index()
countries

# Sorting

In [None]:
# Show the rows ordered by province (ascending is the default); countries itself is not modified
countries.sort_values('province')

In [None]:
# Same ordering, but reversed; the sorted copy is kept in df for the next cells
df = countries.sort_values('province', ascending=False)
df

In [None]:
# Put the rows of df back in index order
df = df.sort_index()
df

In [None]:
# multi level sorting: order by province first, rows with the same province are ordered by country.
# sort_values() returns a new data frame and leaves the original untouched, so the result
# has to be assigned back - otherwise this cell would just display the unsorted df again
df = df.sort_values(by=['province', 'country'])
df