# Data Examples

## Filling Missing Values with Group-Specific Values

In [2]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series(np.random.randn(6))
s

0    0.001377
1   -0.646745
2    0.526530
3   -0.303665
4    0.455939
5   -0.660081
dtype: float64

In [5]:
s[::2] = np.nan
s

0         NaN
1   -0.646745
2         NaN
3   -0.303665
4         NaN
5   -0.660081
dtype: float64

In [6]:
s.fillna(s.mean())

0   -0.536830
1   -0.646745
2   -0.536830
3   -0.303665
4   -0.536830
5   -0.660081
dtype: float64

In [9]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ["East"] * 4 + ["West"] * 4

data = pd.Series(np.random.randn(8), index=states)

data

Ohio          1.829851
New York      1.170167
Vermont      -1.233230
Florida      -0.515692
Oregon       -1.227997
Nevada       -0.244491
California    1.801142
Idaho        -0.857455
dtype: float64

In [11]:
data [["Vermont", "Nevada", "Idaho"]] = np.nan
data

Ohio          1.829851
New York      1.170167
Vermont            NaN
Florida      -0.515692
Oregon       -1.227997
Nevada             NaN
California    1.801142
Idaho              NaN
dtype: float64

In [13]:
data.groupby(group_key).mean()

East    0.828108
West    0.286572
dtype: float64

In [14]:
fill_mean = lambda g: g.fillna(g.mean())

data.groupby(group_key).apply(fill_mean)

Ohio          1.829851
New York      1.170167
Vermont       0.828108
Florida      -0.515692
Oregon       -1.227997
Nevada        0.286572
California    1.801142
Idaho         0.286572
dtype: float64

In [15]:
fill_values = {"East" : 0.5, "West" : -1}
fill_func = lambda g: g.fillna(fill_values[g.name])

In [16]:
data.groupby(group_key).apply(fill_func)

Ohio          1.829851
New York      1.170167
Vermont       0.500000
Florida      -0.515692
Oregon       -1.227997
Nevada       -1.000000
California    1.801142
Idaho        -1.000000
dtype: float64

## Random Sampling and Permutation

In [19]:
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)
deck[:10]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
dtype: int64

In [20]:
def draw(deck, n=5):
    return deck.sample(n)

In [21]:
draw(deck)

AS      1
2H      2
7C      7
10D    10
7S      7
dtype: int64

To get rwo random cards from each suit. The suit is the last character of each card name.

In [22]:
get_suit = lambda card: card[-1]  # Last letter in suit

In [23]:
deck.groupby(get_suit).apply(draw, n=2)

C  JC    10
   3C     3
D  3D     3
   JD    10
H  3H     3
   6H     6
S  6S     6
   3S     3
dtype: int64

## Group Weighted Average and Correlation

In [24]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,1.850518,0.889425
1,a,0.925874,0.522814
2,a,0.802221,0.966948
3,a,0.278701,0.080638
4,b,1.018079,0.98615
5,b,-1.222352,0.662894
6,b,-0.107244,0.435506
7,b,0.607858,0.285924


In [25]:
grouped = df.groupby("category")
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb877046690>

In [26]:
get_wavg = lambda g: np.average(g["data"], weights=g["weights"])
grouped.apply(get_wavg)

category
a    1.190384
b    0.135325
dtype: float64

### Financial dataset example

In [27]:
close_px = pd.read_csv("../examples/stock_px_2.csv")
close_px.head(5)

Unnamed: 0.1,Unnamed: 0,AAPL,MSFT,XOM,SPX
0,2003-01-02 00:00:00,7.4,21.11,29.22,909.03
1,2003-01-03 00:00:00,7.45,21.14,29.24,908.59
2,2003-01-06 00:00:00,7.45,21.52,29.96,929.01
3,2003-01-07 00:00:00,7.43,21.93,28.95,922.93
4,2003-01-08 00:00:00,7.28,21.31,28.83,909.93


In [28]:
close_px.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2214 entries, 0 to 2213
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2214 non-null   object 
 1   AAPL        2214 non-null   float64
 2   MSFT        2214 non-null   float64
 3   XOM         2214 non-null   float64
 4   SPX         2214 non-null   float64
dtypes: float64(4), object(1)
memory usage: 86.6+ KB


In [29]:
spx_corr = lambda x: x.corrwith(x["SPX"])

In [32]:
get_year = lambda x: x.year

## Group-Wise Linear Regression

In [35]:
import statsmodels.api as sm

def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1
    result = sm.OLS(Y, X).fit()
    return result.params