# Cool scripts for pandas

To load, subset, etc.

The dataset used as an example is not open and is not published in this repository.

## Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Path to the dataset:

## Load data

In [10]:
%%time
data_path = '..\..\Thesis\Thesis\Data\HHSaleHistory_cleaned_v0.9_GTHA_DA.csv'
data_df = pd.read_csv(data_path)
data_df.drop(["Unnamed: 0", 'index'], axis=1, inplace=True)
print("DataFrame 'data_df' contains {0:,} rows and {1} columns.".format(len(data_df), len(data_df.columns)))

DataFrame 'data_df' contains 6,062,853 rows and 21 columns.
Wall time: 33.1 s


In [11]:
data_df.head()

Unnamed: 0,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,unitno,street_name,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,1805-01-06,180,1805,62,174140492,,,ON,,,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1856-12-27,185,1856,62,174140490,,,ON,,,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,1861-12-02,186,1861,62,174140491,3300.0,,ON,,,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
3,1910-12-03,191,1910,62,174140099,200.0,,ON,,Hwy 53 E,...,,Ancaster,,99.0,-79.978293,43.202851,8685,7504,Hamilton,46208.0
4,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,,Anson,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0


## Subset data

There are different ways of subsetting data in pandas, such as:

1. Use **`'.query'`** method of a DataFrame. 

    * It is **fast**.
    
    * However, it **can't be used to reassign values** in a column in the resulting subset _(SettingWithCopyWarning)._

    * At the same time, it **can be used to _crop_ the original DataFrame** into the subset using the `'inplace=True'` parameter
    
    * It allows using the >, <, ==, != logical operators, and cannot (at least, I currently don't know how) scan a string for a pattern


2. Use **boolean masks**. Can be used to reassign values for subset elements.

    * It is **fast**.
    
    * It **can be used to reassign values** in a column using `'.loc[mask, column_name]'`; no _SettingWithCopyWarning_ is raised, because the reassignment is done in a single indexing operation rather than the chained `'data[mask][column]'`
    
    * Similar to `'.query'`, it allows using the >, <, ==, != logical operators, and cannot (at least, I currently don't know how) scan a string for a pattern


3. Use **pandas Series methods**, such as **`'.str.match'`** or **`'.str.contains'`**, to construct boolean masks.

    * It represents a **different mechanism of creating a boolean mask**, rather than a different mechanism of subsetting.
    
    * Therefore, it **can be used  to reassign values** as `'.loc[mask, column_name]'`
    
    * It **can be used to match a string pattern**; the pattern is a regular expression (e.g. `.` matches any character, `?` and `*` are quantifiers), not a shell-style wildcard
    
    * However, it is **slow** compared to the other subsetting methods

### Displaying rows with missing values

In [None]:
# Series.isna() is the pandas-native missing-value test; unlike np.isnan it
# also works on columns that are not of a float dtype.
data_df[data_df['consideration_amt'].isna()]

### Using `'.query'` method of the DataFrame

#### Display top 5 rows of the subset

In [23]:
%time data_df.query("municipality == 'Hamilton'").head()

Wall time: 235 ms


Unnamed: 0.1,Unnamed: 0,index,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,0,0,1805-01-06,180,1805,62,174140492,,,ON,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1,1278,1856-12-27,185,1856,62,174140490,,,ON,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,2,1636,1861-12-02,186,1861,62,174140491,3300.0,,ON,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
4,4,72033,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0
5,5,72568,1955-11-09,195,1955,62,174140046,,L9G2J2,ON,...,,Hamilton,,46.0,-79.98066,43.203787,8685,7504,Hamilton,46208.0


#### Cannot reassign values using `'query'` due to _SettingWithCopyWarning_

In [22]:
# Deliberate anti-pattern: the assignment targets the object returned by
# .query(), which pandas reports as a copy (see the warning printed below),
# so use .loc[mask, column] = value when values must actually be changed.
data_df.query("municipality == 'Hamilton'")['postal_code'] = 'New_postal_code'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### Can use `'query'` to _crop_ the original DataFrame into the subset

In [24]:
# Row count before the in-place filter.
print("Number of rows in 'data_df' prior to querying: {0:,}".format(len(data_df)))
# inplace=True makes .query() modify 'data_df' itself instead of returning a
# new DataFrame: only the rows matching the expression remain afterwards.
data_df.query('municipality == "Hamilton"', inplace=True)
# Row count after the filter, followed by a preview of the remaining rows.
print("\nNumber of rows in 'data_df' after querying: {0:,}".format(len(data_df)))
data_df.head()

Number of rows in 'data_df' prior to querying: 6,062,853

Number of rows in 'data_df' after querying: 385,469


Unnamed: 0.1,Unnamed: 0,index,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,0,0,1805-01-06,180,1805,62,174140492,,,ON,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1,1278,1856-12-27,185,1856,62,174140490,,,ON,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,2,1636,1861-12-02,186,1861,62,174140491,3300.0,,ON,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
4,4,72033,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0
5,5,72568,1955-11-09,195,1955,62,174140046,,L9G2J2,ON,...,,Hamilton,,46.0,-79.98066,43.203787,8685,7504,Hamilton,46208.0


Resetting DataFrame to the original size:

In [25]:
%%time
data_path = '..\..\Thesis\Thesis\Data\HHSaleHistory_cleaned_v0.9_GTHA_DA.csv'
data_df = pd.read_csv(data_path)
print("Done!")

Done!
Wall time: 32.9 s


#### Examples of using boolean masks

* This type of subsetting is very fast and can be used to reassign values in the DataFrame using `'data_df.loc[mask, column_name]'`


* However, it requires matching the full string (or at least I don't know right now how to make it check if a string contains a pattern, or use wildcard symbols ? and *)

In [9]:
%time mask = data_df['municipality'] == 'Hamilton' # or data_df['municipality'] is 'Hamilton'
print("Mask returns {0:,} records.".format(sum(mask)))
data_df[mask].head()

Wall time: 431 ms
Mask returns 385,469 records.


Unnamed: 0.1,Unnamed: 0,index,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,0,0,1805-01-06,180,1805,62,174140492,,,ON,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1,1278,1856-12-27,185,1856,62,174140490,,,ON,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,2,1636,1861-12-02,186,1861,62,174140491,3300.0,,ON,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
4,4,72033,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0
5,5,72568,1955-11-09,195,1955,62,174140046,,L9G2J2,ON,...,,Hamilton,,46.0,-79.98066,43.203787,8685,7504,Hamilton,46208.0


#### Example of using pandas Series method `'.str.match'`

Matches the beginning of the string.

In [10]:
%time mask = data_df['municipality'].str.match('ham', case=False, na=False)
print("Mask returns {0:,} records.".format(sum(mask)))
data_df[mask].head()

Wall time: 4.26 s
Mask returns 385,917 records.


Unnamed: 0.1,Unnamed: 0,index,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,0,0,1805-01-06,180,1805,62,174140492,,,ON,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1,1278,1856-12-27,185,1856,62,174140490,,,ON,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,2,1636,1861-12-02,186,1861,62,174140491,3300.0,,ON,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
4,4,72033,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0
5,5,72568,1955-11-09,195,1955,62,174140046,,L9G2J2,ON,...,,Hamilton,,46.0,-79.98066,43.203787,8685,7504,Hamilton,46208.0


#### Example of using pandas Series method `'.str.contains'` 

Checks if a string contains this pattern.

In [17]:
%time mask = data_df['municipality'].str.contains('amilto', case=False, na=False)
print("Mask returns {0:,} values.".format(sum(mask)))
data_df[mask].head()

Wall time: 4.87 s
Mask returns 386,998 values.


Unnamed: 0.1,Unnamed: 0,index,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,0,0,1805-01-06,180,1805,62,174140492,,,ON,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1,1278,1856-12-27,185,1856,62,174140490,,,ON,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,2,1636,1861-12-02,186,1861,62,174140491,3300.0,,ON,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
4,4,72033,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0
5,5,72568,1955-11-09,195,1955,62,174140046,,L9G2J2,ON,...,,Hamilton,,46.0,-79.98066,43.203787,8685,7504,Hamilton,46208.0


## Modify data

### Load dataset with generic tweets

In [4]:
%%time
path = 'data/generic_tweets.txt'
tweets_df = pd.read_csv(path)
print("DataFrame 'tweets_df' with {0} generic tweets was read from file!")

DataFrame 'tweets_df' with {0} generic tweets was read from file!
Wall time: 1.03 s


## Styling

From [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html)

In [6]:
# Fixed seed so the random columns are reproducible between runs.
np.random.seed(24)
# Column A: ten evenly spaced values from 1 to 10.
column_a = pd.DataFrame({'A': np.linspace(1, 10, 10)})
# Columns B-E: standard-normal noise, ten rows each.
noise_columns = pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))
df = pd.concat([column_a, noise_columns], axis=1)
# Plant a single missing value (row 0, third column) for the null-highlighting demos.
df.iloc[0, 2] = np.nan

### Built-in styles

Certain styling functions are expected to be common enough that a few of them are included “built-in” to the Styler, so you don’t have to write them yourself.

In [18]:
# Pass the colour positionally: newer pandas releases renamed the keyword from
# `null_color` to `color`, and the positional form works with either name.
df.style.highlight_null('red')

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


You can create “heatmaps” with the background_gradient method. These require matplotlib, and we’ll use Seaborn to get a nice colormap.

In [19]:
# seaborn is already imported in the notebook's first code cell; repeating the
# import here is harmless.
import seaborn as sns

# Green palette requested as a colormap object (as_cmap=True) so it can be
# handed to the Styler below.
cm = sns.light_palette("green", as_cmap=True)

# Colour the cell backgrounds of 'df' using that colormap; the bare `s` on the
# last line renders the styled table.
s = df.style.background_gradient(cmap=cm)
s

  xa[xa < 0] = -1


Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


Styler.background_gradient takes the keyword arguments low and high. Roughly speaking these extend the range of your data by low and high percent so that when we convert the colors, the colormap’s entire range isn’t used. This is useful so that you can actually read the text still.

In [20]:
# Uses the full color range
df.loc[:4].style.background_gradient(cmap='viridis')

  xa[xa < 0] = -1


Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018


In [21]:
# Compress the color range
(df.loc[:4]
    .style
    .background_gradient(cmap='viridis', low=.5, high=0)
    .highlight_null('red'))

  xa[xa < 0] = -1


Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018


There’s also .highlight_min and .highlight_max.

In [22]:
df.style.highlight_max(axis=0)

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


Use Styler.set_properties when the style doesn’t actually depend on the values.

In [23]:
df.style.set_properties(**{'background-color': 'black',
                           'color': 'lawngreen',
                           'border-color': 'white'})

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


### Building styles

Pass your style functions into one of the following methods:

Styler.applymap: elementwise  

Styler.apply: column-/row-/table-wise


Both of those methods take a function (and some other keyword arguments) and apply your function to the DataFrame in a certain way. Styler.applymap works through the DataFrame elementwise. Styler.apply passes each column or row of your DataFrame into your function one at a time, or the entire table at once, depending on the axis keyword argument. For columnwise use axis=0, rowwise use axis=1, and for the entire table at once use axis=None.

For Styler.applymap your function should take a scalar and return a single string with the CSS attribute-value pair.

For Styler.apply your function should take a Series or DataFrame (depending on the axis parameter), and return a Series or DataFrame with an identical shape where each value is a string with a CSS attribute-value pair.

Let’s see some examples.

In [8]:
def color_negative_red(val):
    """
    Build the CSS text-colour declaration for a single cell value.

    Returns `'color: red'` when `val` is below zero and
    `'color: black'` for every other value.
    """
    if val < 0:
        color = 'red'
    else:
        color = 'black'
    return 'color: ' + color

In [10]:
s = df.style.applymap(color_negative_red)
s

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


In [11]:
def highlight_max(s):
    '''
    Return one CSS string per element of `s`: a yellow background
    for every element equal to the maximum of `s`, an empty string
    for all the others.
    '''
    yellow = 'background-color: yellow'
    styles = []
    for at_maximum in (s == s.max()):
        styles.append(yellow if at_maximum else '')
    return styles

In [12]:
df.style.apply(highlight_max)

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


In [13]:
# Chain both style functions: first the element-wise text colour, then the
# per-column maximum highlight.
(df.style
   .applymap(color_negative_red)
   .apply(highlight_max))

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


In [14]:
df.style.format("{:.2%}")

Unnamed: 0,A,B,C,D,E
0,100.00%,132.92%,nan%,-31.63%,-99.08%
1,200.00%,-107.08%,-143.87%,56.44%,29.57%
2,300.00%,-162.64%,21.96%,67.88%,188.93%
3,400.00%,96.15%,10.40%,-48.12%,85.02%
4,500.00%,145.34%,105.77%,16.56%,51.50%
5,600.00%,-133.69%,56.29%,139.29%,-6.33%
6,700.00%,12.17%,120.76%,-0.20%,162.78%
7,800.00%,35.45%,103.75%,-38.57%,51.98%
8,900.00%,168.66%,-132.60%,142.90%,-208.94%
9,1000.00%,-12.98%,63.15%,-58.65%,29.07%


In [15]:
df.style.format({'B': "{:0<4.0f}", 'D': '{:+.2f}'})

Unnamed: 0,A,B,C,D,E
0,1,1000,,-0.32,-0.99081
1,2,-100,-1.43871,0.56,0.295722
2,3,-200,0.219565,0.68,1.88927
3,4,1000,0.104011,-0.48,0.850229
4,5,1000,1.05774,0.17,0.515018
5,6,-100,0.562861,1.39,-0.063328
6,7,0,1.2076,-0.0,1.6278
7,8,0,1.03753,-0.39,0.519818
8,9,2000,-1.32596,1.43,-2.08935
9,10,0,0.631523,-0.59,0.29072


In [16]:
df.style.format({"B": lambda x: "±{:.2f}".format(abs(x))})

Unnamed: 0,A,B,C,D,E
0,1,±1.33,,-0.31628,-0.99081
1,2,±1.07,-1.43871,0.564417,0.295722
2,3,±1.63,0.219565,0.678805,1.88927
3,4,±0.96,0.104011,-0.481165,0.850229
4,5,±1.45,1.05774,0.165562,0.515018
5,6,±1.34,0.562861,1.39285,-0.063328
6,7,±0.12,1.2076,-0.00204021,1.6278
7,8,±0.35,1.03753,-0.385684,0.519818
8,9,±1.69,-1.32596,1.42898,-2.08935
9,10,±0.13,0.631523,-0.586538,0.29072


In [1]:
def magnify():
    """
    Return the table-style rules used for the "hover to magnify" demo.

    The result is a new list of ``{"selector": ..., "props": [...]}``
    dictionaries: 4pt header cells, zero padding on data cells, and a
    12pt font for whichever header or data cell is hovered.
    """
    rules = [
        ("th", [("font-size", "4pt")]),
        ("td", [('padding', "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [('max-width', '200px'),
                               ('font-size', '12pt')]),
    ]
    return [{"selector": selector, "props": props} for selector, props in rules]

In [4]:
# Fixed seed so the random walk below is reproducible between runs.
np.random.seed(25)
# Diverging palette returned as a colormap object (as_cmap=True).
cmap = sns.diverging_palette(5, 250, as_cmap=True)
# 20 x 25 random values, accumulated down each column.
bigdf = pd.DataFrame(np.random.randn(20, 25)).cumsum()

# NOTE(review): Styler.set_precision is deprecated in newer pandas releases;
# when upgrading, `.format(precision=2)` is the documented replacement -- confirm
# against the installed pandas version before switching.
bigdf.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '1pt'})\
    .set_caption("Hover to magnify")\
    .set_precision(2)\
    .set_table_styles(magnify())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0.23,1.0,-0.84,-0.59,-0.96,-0.22,-0.62,1.8,-2.1,0.87,-0.92,-0.23,2.2,-1.3,0.076,-1.2,1.2,-1.0,1.1,-0.42,2.3,-2.6,2.8,0.68,-1.6
1,-1.7,1.6,-1.1,-1.1,1.0,0.0037,-2.5,3.4,-1.7,1.3,-0.52,-0.015,1.5,-1.1,-1.9,-1.1,-0.68,-0.81,0.35,-0.055,1.8,-2.8,2.3,0.78,0.44
2,-0.65,3.2,-1.8,0.52,2.2,-0.37,-3.0,3.7,-1.9,2.5,0.21,-0.24,-0.1,-0.78,-3.0,-0.82,-0.21,-0.23,0.86,-0.68,1.4,-4.9,3.0,1.9,0.61
3,-1.6,3.7,-2.3,0.43,4.2,-0.43,-3.9,4.2,-2.1,1.1,0.12,0.6,-0.89,0.27,-3.7,-2.7,-0.31,-1.6,1.4,-1.8,0.91,-5.8,2.8,2.1,0.28
4,-3.3,4.5,-1.9,-1.7,5.2,-1.0,-3.8,4.7,-0.72,1.1,-0.18,0.83,-0.22,-1.1,-4.3,-2.9,-0.97,-1.8,1.5,-1.8,2.2,-6.3,3.3,2.5,2.1
5,-0.84,4.2,-1.7,-2.0,5.3,-0.99,-4.1,3.9,-1.1,-0.94,1.2,0.087,-1.8,-0.11,-4.5,-0.85,-2.1,-1.4,0.8,-1.6,1.5,-6.5,2.8,2.1,3.8
6,-0.74,5.4,-2.1,-1.1,4.2,-1.8,-3.2,3.8,-3.2,-1.2,0.34,0.57,-1.8,0.54,-4.4,-1.8,-4.0,-2.6,-0.2,-4.7,1.9,-8.5,3.3,2.5,5.8
7,-0.44,4.7,-2.3,-0.21,5.9,-2.6,-1.8,5.5,-4.5,-3.2,-1.7,0.18,0.11,0.036,-6.0,-0.45,-6.2,-3.9,0.71,-3.9,0.67,-7.3,3.0,3.4,6.7
8,0.92,5.8,-3.3,-0.65,6.0,-3.2,-1.8,5.6,-3.5,-1.3,-1.6,0.82,-2.4,-0.4,-6.1,-0.52,-6.6,-3.5,-0.043,-4.6,0.51,-5.8,3.2,2.4,5.1
9,0.38,5.5,-4.5,-0.8,7.1,-2.6,-0.44,5.3,-2.0,-0.33,-0.8,0.26,-3.4,-0.82,-6.1,-2.6,-8.5,-4.5,0.41,-4.7,1.9,-6.9,2.1,3.0,5.2


In [29]:
# The widgets live in the `ipywidgets` package (imported the same way further
# down in this notebook); `IPython.html` is their old, pre-Jupyter location.
import ipywidgets as widgets

@widgets.interact
def f(h_neg=(0, 359, 1), h_pos=(0, 359), s=(0., 99.9), l=(0., 99.9)):
    """
    Restyle `df` with a diverging seaborn palette chosen through sliders.

    Each tuple default is turned into a slider by `interact`; the slider
    values are forwarded unchanged to `sns.diverging_palette`:

    h_neg, h_pos -- the `h_neg` / `h_pos` arguments of the palette
    s, l         -- the `s` / `l` arguments of the palette

    Returns the Styler of `df` with the resulting background gradient.
    """
    return df.style.background_gradient(
        cmap=sns.diverging_palette(h_neg=h_neg, h_pos=h_pos, s=s, l=l,
                                   as_cmap=True)
    )

interactive(children=(IntSlider(value=179, description='h_neg', max=359), IntSlider(value=179, description='h_…

### Bar charts

In [24]:
df.style.bar(subset=['A', 'B'], color='#d65f5f')

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


New in version 0.20.0 is the ability to customize further the bar chart: You can now have the df.style.bar be centered on zero or midpoint value (in addition to the already existing way of having the min value at the left side of the cell), and you can pass a list of [color_negative, color_positive].

Here’s how you can change the above with the new align='mid' option:

In [26]:
df.style.bar(subset=['A', 'B'], align='mid', color=['#d65f5f', '#5fba7d'])

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


### Table styles

The next option you have is “table styles”. These are styles that apply to the table as a whole, but don’t look at the data. Certain stylings, including pseudo-selectors like :hover, can only be used this way.

table_styles should be a list of dictionaries. Each dictionary should have the selector and props keys. The value for selector should be a valid CSS selector. Recall that all the styles are already attached to an id, unique to each Styler. This selector is in addition to that id. The value for props should be a list of tuples of ('attribute', 'value').

table_styles are extremely flexible, but not as fun to type out by hand. We hope to collect some useful ones either in pandas, or preferably in a new package that builds on top of the tools here.

In [30]:
from IPython.display import HTML

def hover(hover_color="#ffff99"):
    """
    Return the table-style rule that colours the table row under the pointer.

    hover_color -- value used for the CSS `background-color` property
                   (light yellow by default).
    """
    background = ("background-color", "%s" % hover_color)
    return {"selector": "tr:hover", "props": [background]}

# Table-level rules: the row-hover highlight built by hover(), larger centred
# header cells, and a caption placed below the table.
styles = [
    hover(),
    dict(selector="th", props=[("font-size", "150%"),
                               ("text-align", "center")]),
    dict(selector="caption", props=[("caption-side", "bottom")])
]
# Attach the rules and a caption to the styled 'df'; the bare `html` on the
# last line renders the result as the cell output.
html = (df.style.set_table_styles(styles)
          .set_caption("Hover to highlight."))
html

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


### Sharing styles

Say you have a lovely style built up for a DataFrame, and now you want to apply the same style to a second DataFrame. Export the style with `df1.style.export`, and import it on the second DataFrame with `df2.style.use`.

In [27]:
df2 = -df
style1 = df.style.applymap(color_negative_red)
style1

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


In [28]:
style2 = df2.style
style2.use(style1.export())
style2

Unnamed: 0,A,B,C,D,E
0,-1,-1.32921,,0.31628,0.99081
1,-2,1.07082,1.43871,-0.564417,-0.295722
2,-3,1.6264,-0.219565,-0.678805,-1.88927
3,-4,-0.961538,-0.104011,0.481165,-0.850229
4,-5,-1.45342,-1.05774,-0.165562,-0.515018
5,-6,1.33694,-0.562861,-1.39285,0.063328
6,-7,-0.121668,-1.2076,0.00204021,-1.6278
7,-8,-0.354493,-1.03753,0.385684,-0.519818
8,-9,-1.68658,1.32596,-1.42898,2.08935
9,-10,0.12982,-0.631523,0.586538,-0.29072


Notice that you’re able to share the styles even though they’re data aware. The styles are re-evaluated on the new DataFrame they’ve been used upon.

## Interactive controls via widgets

In [3]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

### Load data

In [59]:
%%time
data_path = '..\..\Thesis\Thesis\Data\HHSaleHistory_cleaned_v0.9_GTHA_DA.csv'
data_df = pd.read_csv(data_path)
data_df.drop(["Unnamed: 0", 'index'], axis=1, inplace=True)
print("DataFrame 'data_df' contains {0:,} rows and {1} columns.".format(len(data_df), len(data_df.columns)))

DataFrame 'data_df' contains 6,062,853 rows and 21 columns.
Wall time: 49.3 s


In [60]:
data_df.head()

Unnamed: 0,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,unitno,street_name,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,1805-01-06,180,1805,62,174140492,,,ON,,,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1856-12-27,185,1856,62,174140490,,,ON,,,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,1861-12-02,186,1861,62,174140491,3300.0,,ON,,,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
3,1910-12-03,191,1910,62,174140099,200.0,,ON,,Hwy 53 E,...,,Ancaster,,99.0,-79.978293,43.202851,8685,7504,Hamilton,46208.0
4,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,,Anson,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0


### Pivot the table

In [65]:
# Grouping keys (city, then year) and the column whose values are aggregated.
groupby1 = 'da_city'
groupby2 = 'year'
value_column = 'consideration_amt'
da_city_name = 'Toronto'

# NOTE(review): 'data_df_no_outliers' is not defined in any cell visible in this
# notebook, so this line fails on a fresh kernel -- TODO add or restore the cell
# that creates it.
# Despite its name, 'data_pivot' is a grouped column selection (one group per
# city/year pair), not a pivot table; aggregations such as .mean() are applied later.
data_pivot = data_df_no_outliers.groupby(by=[groupby1, groupby2])[value_column]

In [109]:
@interact
def means_greater_than(greater_than=(0, 5000000, 100000), less_than=(100000, 10000000, 100000)):
    """
    Show the groups of `data_pivot` whose mean price lies strictly between two bounds.

    greater_than -- exclusive lower bound; the tuple default is turned by
                    `interact` into a slider (min, max, step)
    less_than    -- exclusive upper bound; slider built the same way

    Prints the selected range and returns a DataFrame holding the group
    means that fall inside it.
    """
    print("Municipalities with: ${0:,} < Mean Annual Price < ${1:,}"
          .format(greater_than, less_than))
    # Aggregate once per slider move and reuse the result for both comparisons
    # and for the final selection.
    annual_means = data_pivot.mean()
    in_range = (annual_means > greater_than) & (annual_means < less_than)
    return pd.DataFrame(annual_means[in_range])

interactive(children=(IntSlider(value=2500000, description='greater_than', max=5000000, step=100000), IntSlide…

### Plot and compare cities

#### Group values by `'da_city'` and `'year'`

In [71]:
# Same grouping setup as in the "Pivot the table" section above: grouping keys
# (city, then year) and the column whose values are aggregated.
groupby1 = 'da_city'
groupby2 = 'year'
value_column = 'consideration_amt'
da_city_name = 'Toronto'

# NOTE(review): 'data_df_no_outliers' is not defined in any cell visible in this
# notebook, so this line fails on a fresh kernel -- TODO add or restore the cell
# that creates it.
data_pivot = data_df_no_outliers.groupby(by=[groupby1, groupby2])[value_column]

#### Generate a list of cities

In [94]:
city_list = data_pivot.count().index.get_level_values(0).value_counts().index

#### Make an interactive plot

In [115]:
@interact
def compare_cities(da_city_name1=city_list, da_city_name2=city_list):
    """
    Plot the yearly price statistics of two cities side by side.

    da_city_name1 -- city shown in the left panel
    da_city_name2 -- city shown in the right panel

    Both defaults are `city_list`, which `interact` renders as dropdowns.
    Each panel shows the median, mean, min and max of `data_pivot` for the
    chosen city on a logarithmic price axis, with the areas between
    consecutive statistics lightly shaded.
    """
    # Aggregate each statistic once per redraw so the group-by computations
    # are not repeated for every line and every shaded band.
    price_stats = {
        'median': data_pivot.median(),
        'mean': data_pivot.mean(),
        'min': data_pivot.min(),
        'max': data_pivot.max(),
    }
    # (statistic, line colour, line style, legend label), in drawing order.
    line_specs = [
        ('median', 'deeppink', '--', 'Median price'),
        ('mean', 'coral', ':', 'Mean price'),
        ('min', 'green', '-', 'Min price'),
        ('max', 'red', '-', 'Max price'),
    ]
    # (lower statistic, upper statistic, fill colour) of each shaded band.
    band_specs = [
        ('min', 'median', 'green'),
        ('median', 'mean', 'yellow'),
        ('mean', 'max', 'red'),
    ]

    f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    # The figure title covers both panels, so it is set once here.
    plt.suptitle("Price comparison between {0} and {1}".format(da_city_name1, da_city_name2))

    def plot_city(axis, da_city_name):
        """Draw the four statistics and the shaded bands of one city on axes[axis]."""
        ax = axes[axis]
        # Select the city's rows from every statistic.
        city_stats = {}
        for stat_name, stat_values in price_stats.items():
            city_stats[stat_name] = stat_values[da_city_name]

        for stat_name, color, linestyle, label in line_specs:
            city_stats[stat_name].plot(ax=ax, color=color, linestyle=linestyle, label=label)

        ax.set_title("Showing prices for {0}".format(da_city_name))
        ax.set_ylabel("Price")
        ax.set_yscale('log')
        ax.set_xlabel("Year")
        ax.legend(loc='best')

        # The x coordinates of the first plotted line (the median) are reused
        # as the x coordinates of the shaded bands.
        x_values = ax.lines[0].get_xdata()
        for lower, upper, color in band_specs:
            ax.fill_between(x_values,
                            city_stats[lower],
                            city_stats[upper],
                            color=color,
                            alpha=0.1)

    plot_city(0, da_city_name1)
    plot_city(1, da_city_name2)

    plt.show()

interactive(children=(Dropdown(description='da_city_name1', options=('Hamilton', 'Toronto', 'Clarington', 'Mis…

## Interactive date selector

### Load data

In [140]:
%%time
data_path = '..\..\Thesis\Thesis\Data\HHSaleHistory_cleaned_v0.9_GTHA_DA.csv'
data_df = pd.read_csv(data_path)
data_df.drop(["Unnamed: 0", 'index'], axis=1, inplace=True)
data_df['registration_date'] = pd.to_datetime(data_df['registration_date'])
print("DataFrame 'data_df' contains {0:,} rows and {1} columns.".format(len(data_df), len(data_df.columns)))

DataFrame 'data_df' contains 6,062,853 rows and 21 columns.
Wall time: 28.4 s


In [141]:
data_df.head()

Unnamed: 0,registration_date,decade,year,lro_num,pin,consideration_amt,postal_code,province,unitno,street_name,...,street_direction,municipality,street_suffix,street_number,x,y,index_right,da_id,da_city,da_median_tot_inc
0,1805-01-06,180,1805,62,174140492,,,ON,,,...,,Hamilton,,,-79.97742,43.203291,8685,7504,Hamilton,46208.0
1,1856-12-27,185,1856,62,174140490,,,ON,,,...,,Hamilton,,,-79.977832,43.202926,8685,7504,Hamilton,46208.0
2,1861-12-02,186,1861,62,174140491,3300.0,,ON,,,...,,Hamilton,,,-79.977349,43.203006,8685,7504,Hamilton,46208.0
3,1910-12-03,191,1910,62,174140099,200.0,,ON,,Hwy 53 E,...,,Ancaster,,99.0,-79.978293,43.202851,8685,7504,Hamilton,46208.0
4,1955-10-20,195,1955,62,174140094,,L9G2M2,ON,,Anson,...,,Hamilton,,491.0,-79.978124,43.204117,8685,7504,Hamilton,46208.0


### Generate a list of cities

In [142]:
city_list = data_df['da_city'].value_counts().index
city_list

Index(['Toronto', 'Mississauga', 'Brampton', 'Hamilton', 'Markham', 'Vaughan',
       'Oakville', 'Burlington', 'Richmond Hill', 'Oshawa', 'Whitby', 'Milton',
       'Ajax', 'Clarington', 'Pickering', 'Newmarket', 'Caledon',
       'Halton Hills', 'Georgina', 'Aurora', 'Whitchurch-Stouffville', 'King',
       'Scugog', 'East Gwillimbury', 'Uxbridge', 'Brock',
       'Mississaugas of Scugog Island'],
      dtype='object')

### Plot with interactive date range and city selection

In [149]:
def date_subset_plot(city, start_date, end_date):
    """
    Plot 'consideration_amt' of one city for records inside a date range.

    city       -- value matched against the 'da_city' column
    start_date -- exclusive lower bound on 'registration_date'
    end_date   -- exclusive upper bound on 'registration_date'

    Nothing is drawn while either date is missing.
    """
    # A date picker without a selected value passes None; there is no range to plot then.
    if start_date is None or end_date is None:
        return
    # The pickers return plain date objects; convert them to Timestamps so the
    # comparison with the datetime column is well defined on every pandas version.
    range_start = pd.Timestamp(start_date)
    range_end = pd.Timestamp(end_date)

    in_city = data_df['da_city'] == city
    after_start = data_df['registration_date'] > range_start
    before_end = data_df['registration_date'] < range_end
    # Single .loc call: row selection and column selection in one step.
    data_df.loc[(in_city & after_start & before_end), 'consideration_amt'].plot()


# The trailing semicolon keeps the function repr out of the cell output.
interact(date_subset_plot,
        city=city_list,
        start_date=widgets.DatePicker(value=pd.to_datetime('2000-01-01')),
        end_date=widgets.DatePicker(value=pd.to_datetime('2001-01-01')));

interactive(children=(Dropdown(description='city', options=('Toronto', 'Mississauga', 'Brampton', 'Hamilton', …

<function __main__.date_subset_plot(city, start_date, end_date)>

In [109]:
# NOTE(review): this cell redefines 'means_greater_than', which already appears
# earlier in the notebook with the same execution count (In [109]); consider
# keeping only one of the two copies.
@interact
def means_greater_than(greater_than=(0, 5000000, 100000), less_than=(100000, 10000000, 100000)):
    """
    Show the groups of `data_pivot` whose mean price lies strictly between two bounds.

    greater_than -- exclusive lower bound; the tuple default is turned by
                    `interact` into a slider (min, max, step)
    less_than    -- exclusive upper bound; slider built the same way

    Prints the selected range and returns a DataFrame holding the group
    means that fall inside it.
    """
    print("Municipalities with: ${0:,} < Mean Annual Price < ${1:,}"
          .format(greater_than, less_than))
    # Aggregate once per slider move and reuse the result for both comparisons
    # and for the final selection.
    annual_means = data_pivot.mean()
    in_range = (annual_means > greater_than) & (annual_means < less_than)
    return pd.DataFrame(annual_means[in_range])

interactive(children=(IntSlider(value=2500000, description='greater_than', max=5000000, step=100000), IntSlide…