In [1]:
import numpy as np
import pandas as pd

In [2]:
states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data[["Vermont", "Nevada", "Idaho"]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()

East   -0.178746
West    0.700581
dtype: float64

In [3]:
data[["Vermont", "Nevada", "Idaho"]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()

East   -0.178746
West    0.700581
dtype: float64

In [4]:
def fill_mean(group):
    return group.fillna(group.mean())

data.groupby(group_key).apply(fill_mean)

East  Ohio          0.918950
      New York     -0.688230
      Vermont      -0.178746
      Florida      -0.766958
West  Oregon        0.452453
      Nevada        0.700581
      California    0.948708
      Idaho         0.700581
dtype: float64

In [5]:
fill_values = {"East": 0.5, "West": -1}
def fill_func(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_func)

East  Ohio          0.918950
      New York     -0.688230
      Vermont       0.500000
      Florida      -0.766958
West  Oregon        0.452453
      Nevada       -1.000000
      California    0.948708
      Idaho        -1.000000
dtype: float64

In [6]:
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)

In [7]:
deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [8]:
def draw(deck, n=5):
    return deck.sample(n)
draw(deck)

AH     1
KS    10
QH    10
2D     2
8S     8
dtype: int64

In [9]:
def get_suit(card):
    # last letter is suit
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)

C  10C    10
   3C      3
D  10D    10
   AD      1
H  6H      6
   KH     10
S  JS     10
   4S      4
dtype: int64

In [10]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

QC    10
7C     7
2D     2
3D     3
2H     2
4H     4
QS    10
8S     8
dtype: int64

In [11]:
df = pd.DataFrame({"category": ["a", "a", "a", "a",
                                "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
df

Unnamed: 0,category,data,weights
0,a,-0.874279,0.46429
1,a,0.910834,0.568262
2,a,0.267241,0.377574
3,a,-1.022781,0.356351
4,b,1.420747,0.337612
5,b,0.549279,0.948102
6,b,-0.017391,0.41191
7,b,0.812022,0.442152


In [12]:
grouped = df.groupby("category")
def get_wavg(group):
    return np.average(group["data"], weights=group["weights"])

grouped.apply(get_wavg)

  grouped.apply(get_wavg)


category
a   -0.085986
b    0.631985
dtype: float64

In [13]:
# Read the CSV file
close_px = pd.read_csv("stock_px.csv", parse_dates=True, index_col=0)

# Display DataFrame information
close_px.info()

# Display the last 4 rows of the DataFrame
print(close_px.tail(4))  # Add print to display the rows

# Define the spx_corr function
def spx_corr(group):
    return group.corrwith(group["SPX"])

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AA      5472 non-null   float64
 1   AAPL    5472 non-null   float64
 2   GE      5472 non-null   float64
 3   IBM     5472 non-null   float64
 4   JNJ     5472 non-null   float64
 5   MSFT    5472 non-null   float64
 6   PEP     5471 non-null   float64
 7   SPX     5472 non-null   float64
 8   XOM     5472 non-null   float64
dtypes: float64(9)
memory usage: 427.5 KB
               AA    AAPL     GE     IBM    JNJ   MSFT    PEP      SPX    XOM
2011-10-11  10.30  400.29  16.14  185.00  63.96  27.00  60.95  1195.54  76.27
2011-10-12  10.05  402.19  16.40  186.12  64.33  26.96  62.70  1207.25  77.16
2011-10-13  10.10  408.43  16.22  186.82  64.23  27.18  62.36  1203.66  76.37
2011-10-14  10.26  422.00  16.60  190.53  64.72  27.27  62.24  1224.58  78.11


In [14]:
def spx_corr(group):
    return group.corrwith(group["SPX"])

In [15]:
rets = close_px.pct_change(fill_method=None).dropna()

In [16]:
def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078
1995,0.291532,0.161829,0.519126,0.41639,0.315733,0.45366,0.413144,1.0,0.368752
1996,0.292344,0.191482,0.750724,0.388497,0.569232,0.564015,0.421477,1.0,0.538736
1997,0.564427,0.211435,0.827512,0.646823,0.703538,0.606171,0.509344,1.0,0.695653
1998,0.533802,0.379883,0.815243,0.623982,0.591988,0.698773,0.494213,1.0,0.369264
1999,0.099033,0.425584,0.710928,0.486167,0.517061,0.631315,0.336593,1.0,0.315383


In [17]:
def corr_aapl_msft(group):
    return group["AAPL"].corr(group["MSFT"])
by_year.apply(corr_aapl_msft)

1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.578729
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [18]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

In [20]:
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

Unnamed: 0,SPX,intercept
1990,1.512772,0.001395
1991,1.187351,0.000396
1992,1.832427,0.000164
1993,1.39047,-0.002657
1994,1.190277,0.001617
1995,0.858818,-0.001423
1996,0.829389,-0.001791
1997,0.749928,-0.001901
1998,1.164582,0.004075
1999,1.384989,0.003273
