In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build two integer ranges: 0..9 and 5..14.
arr = np.arange(0, 10)
arr2 = np.arange(5, 15)
# Mean of the even entries; evaluated but not shown, because only the
# last expression of a cell is displayed.
np.mean(arr[np.mod(arr, 2) == 0])
# Entries strictly greater than 5 (this is the cell's displayed result).
arr[np.greater(arr, 5)]

array([6, 7, 8, 9])

In [3]:
# Show `arr` in full: the boolean-mask expressions in the previous cell only
# read from it, so it still holds 0..9.
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
df = pd.DataFrame(({"a":["x", "x", "y", "y", "z"], "b":[1,2,3,4,5]}))
df.groupby("a").apply(lambda x: x["b"].quantile(q=0.9), include_groups=False) #type: ignore

a
x    1.9
y    3.9
z    5.0
dtype: float64

In [5]:
# Rows where "a" equals "x", keeping only column "b" as a one-column frame.
df.loc[df["a"] == "x", ["b"]]

Unnamed: 0,b
0,1
1,2


In [6]:
def remove_outliers(df:pd.DataFrame, column:str, cutoff:float)->pd.DataFrame:
    """Drop rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - cutoff * IQR`` and ``Q3 + cutoff * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Only rows
    strictly inside the fences are kept, so rows with NaN in ``column`` are
    dropped as well (comparisons against NaN are False).

    Args:
        df: Input frame; it is not modified.
        column: Name of the numeric column to test.
        cutoff: Multiplier applied to the IQR (1.5 is the usual Tukey value).

    Returns:
        The subset of ``df`` whose ``column`` values fall inside the fences,
        with the original index labels preserved.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # Use the requested column rather than a hard-coded name.
    values = df[column]
    q1 = values.quantile(q=0.25)
    q3 = values.quantile(q=0.75)
    iqr = q3 - q1
    lower_fence = q1 - cutoff * iqr
    upper_fence = q3 + cutoff * iqr
    inside = (values > lower_fence) & (values < upper_fence)
    return df.loc[inside]
def remove_outliers_quantile(df:pd.DataFrame, column:str, cutoff:float, lower_q:float=0.01, upper_q:float=0.99)->pd.DataFrame:
    """Keep rows whose ``column`` value lies strictly between two quantiles.

    Args:
        df: Input frame; it is not modified.
        column: Name of the numeric column to test.
        cutoff: Not used by this function; kept only so existing calls that
            pass it continue to work.
        lower_q: Lower quantile in [0, 1]; values at or below it are dropped.
        upper_q: Upper quantile in [0, 1]; values at or above it are dropped.

    Returns:
        The subset of ``df`` with ``lower < df[column] < upper``, where the
        bounds are the ``lower_q`` and ``upper_q`` quantiles of the column.

    Raises:
        ValueError: If the quantiles do not satisfy
            ``0 <= lower_q < upper_q <= 1``.
        KeyError: If ``column`` is not a column of ``df``.
    """
    if not 0.0 <= lower_q < upper_q <= 1.0:
        raise ValueError("quantiles must satisfy 0 <= lower_q < upper_q <= 1")
    # Use the requested column rather than a hard-coded name.
    values = df[column]
    lower_bound = values.quantile(q=lower_q)
    upper_bound = values.quantile(q=upper_q)
    inside = (values > lower_bound) & (values < upper_bound)
    return df[inside]

In [7]:
# Synthetic sample: 100 normal draws (mean 50, std 5) plus two planted extremes.
np.random.seed(0)
df = pd.DataFrame(
    {"x": np.concatenate([np.random.normal(50, 5, 100), [200, -100]])}
)
# Range of the raw data, one value per line.
print(df["x"].min(), df["x"].max(), sep="\n")
# Row count before and after the IQR rule.
clean = remove_outliers(df, "x", cutoff=1.5)
print(len(df), "->", len(clean))

-100.0
200.0
102 -> 100


In [8]:
# Same synthetic sample as the IQR demo: reseed so the draws are identical.
np.random.seed(0)
df = pd.DataFrame(
    {"x": np.append(np.random.normal(50, 5, 100), [200, -100])}
)
# Row count before and after the quantile rule. `cutoff` is accepted by
# remove_outliers_quantile but does not influence its result.
clean = remove_outliers_quantile(df, "x", cutoff=1.5)
print(len(df), "->", len(clean))

102 -> 98


# New

In [16]:
df = pd.DataFrame(
    {
        "a": ["x", "x", "x", "y", "y", "z", "z"],
        "b": [1, 2, 5, 1, 3, 2, 2],
    }
)
# Keep every row of each "a" group in which at least one "b" equals 2.
df.groupby(["a"]).filter(lambda grp: grp["b"].eq(2).any())

Unnamed: 0,a,b
0,x,1
1,x,2
2,x,5
5,z,2
6,z,2


In [None]:
# Draw a 5x5 matrix of random integers in [80, 120); `high` is exclusive.
# Without `size`, randint returns a single scalar instead of an array.
arr = np.random.randint(low=80, high=120, size=(5, 5))

In [33]:
# Display the current value of `arr`.
arr

array([[ 92, 118, 115, 102,  85],
       [103, 112,  91, 100,  90],
       [117, 108,  82, 107,  99],
       [105, 103, 100, 109,  83],
       [115, 119,  89,  89, 103]])

In [35]:
# Integers 1..10, then select those that are not multiples of 3.
arr = np.arange(start=1, stop=11)
arr[np.mod(arr, 3) != 0]

array([ 1,  2,  4,  5,  7,  8, 10])

In [36]:
import pandas as pd

df = pd.DataFrame(
    {
        "a": ["a", "a", "b", "b", "c", "c"],
        "b": [1, 2, 3, 4, 5, 6],
    }
)
# Dense rank of "b" within each "a" group, largest value ranked first.
df["rank_in_group"] = (
    df.groupby("a")["b"]
    .rank(method="dense", ascending=False)
)
# Expected last column: [2.0, 1.0, 2.0, 1.0, 2.0, 1.0] (rank returns floats)

In [37]:
# Display df, which now includes the rank_in_group column added in the previous cell.
df

Unnamed: 0,a,b,rank_in_group
0,a,1,2.0
1,a,2,1.0
2,b,3,2.0
3,b,4,1.0
4,c,5,2.0
5,c,6,1.0


In [73]:
import numpy as np, pandas as pd

np.random.seed(0)
# Each feature is 100 normal draws followed by two planted extreme values.
# feature1 is sampled before feature2, so the draw order is fixed by the seed.
df = pd.DataFrame(
    {
        "feature1": np.append(np.random.normal(100, 10, 100), [300, -50]),
        "feature2": np.append(np.random.normal(0, 1, 100), [10, -8]),
    }
)

In [None]:
def outlier_removal(df:pd.DataFrame, column, zquantile:float=1.5)->pd.DataFrame:
    """Return a copy of ``df`` with IQR outliers in ``column`` set to NaN.

    A value is kept when it lies strictly between ``Q1 - zquantile * IQR``
    and ``Q3 + zquantile * IQR`` (Q1/Q3 are the 25th/75th percentiles of the
    column). Every other value in ``column`` becomes NaN; all rows and all
    other columns are retained.

    Args:
        df: Input frame. It is left untouched, so re-running a cell that
            calls this function always starts from the same data.
        column: Name of the numeric column to clean.
        zquantile: Multiplier applied to the IQR when building the fences.

    Returns:
        A new DataFrame with the same shape and index as ``df``.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    values = df[column]
    q1 = values.quantile(q=0.25)
    q3 = values.quantile(q=0.75)
    iqr = q3 - q1

    within_fences = (values > q1 - zquantile * iqr) & (values < q3 + zquantile * iqr)
    # Work on a copy so the caller's frame is never mutated.
    cleaned = df.copy()
    # `where` keeps in-range values and fills the rest with NaN, upcasting
    # integer columns to float as needed.
    cleaned[column] = values.where(within_fences)
    return cleaned

In [75]:
# Display the current contents of df for inspection.
df

Unnamed: 0,feature1,feature2
0,117.640523,1.883151
1,104.001572,-1.347759
2,109.787380,-1.270485
3,122.408932,0.969397
4,118.675580,-1.173123
...,...,...
97,117.858705,0.823504
98,101.269121,2.163236
99,104.019894,1.336528
100,300.000000,10.000000


In [None]:
# Apply the IQR-based masking to "feature1" (default multiplier) and display
# the returned frame.
outlier_removal(df, "feature1")

Unnamed: 0,feature1,feature2
0,117.640523,1.883151
1,104.001572,-1.347759
2,109.787380,-1.270485
3,122.408932,0.969397
4,118.675580,-1.173123
...,...,...
97,117.858705,0.823504
98,101.269121,2.163236
99,104.019894,1.336528
100,,10.000000


In [89]:
from scipy import stats
# Keep only the rows whose absolute z-score is below 1 in every column.
df.loc[(np.absolute(stats.zscore(df)) < 1).all(axis=1)]

Unnamed: 0,feature1,feature2
1,104.001572,-1.347759
2,109.787380,-1.270485
3,122.408932,0.969397
4,118.675580,-1.173123
6,109.500884,-0.413619
...,...,...
94,103.563664,0.523277
95,107.065732,-0.171546
96,100.105000,0.771791
97,117.858705,0.823504


In [88]:
# Display the current contents of df.
df

Unnamed: 0,feature1,feature2
0,117.640523,1.883151
1,104.001572,-1.347759
2,109.787380,-1.270485
3,122.408932,0.969397
4,118.675580,-1.173123
...,...,...
97,117.858705,0.823504
98,101.269121,2.163236
99,104.019894,1.336528
100,300.000000,10.000000


In [91]:
df = pd.DataFrame(
    {
        "a": ["x", "x", "x", "y", "y", "z", "z", "z"],
        "b": [1, 2, 5, 1, 3, 2, 4, 9],
    }
)
# Keep every row of each "a" group that has at least one "b" >= 4.
df.groupby("a").filter(lambda grp: grp["b"].ge(4).any())

Unnamed: 0,a,b
0,x,1
1,x,2
2,x,5
5,z,2
6,z,4
7,z,9


In [None]:
df = pd.DataFrame(
    {
        "city": ["NY", "NY", "LA", "LA", "CHI", "CHI"],
        "temp": [30, 35, 25, 100, 20, 18],
        "rain": [0, 5, 10, 0, 20, 0],
    }
)
# Keep all rows of every city that has at least one row with temp > 90 and
# rain == 0. The two boolean Series must be combined element-wise with `&`;
# Python's `and` asks for the truth value of a whole Series and raises
# ValueError.
hot_dry = df.groupby("city").filter(
    lambda g: ((g["temp"] > 90) & (g["rain"] == 0)).any()
)
hot_dry

In [102]:
# 4x4 matrix of random integers in [0, 100).
arr = np.random.randint(0, 100, size=(4, 4))
# Two ways of capping values at 70; only the last expression is displayed.
np.where(arr > 70, 70, arr)
arr.clip(min=0, max=70)

array([[ 0, 70, 36, 70],
       [20,  3, 42, 65],
       [20, 36, 68, 70],
       [47, 10, 70, 70]])

In [103]:
# 100 standard-normal draws, capped from above at their 95th percentile.
arr = np.random.standard_normal(100)
p95 = np.percentile(arr, q=95)
arr_clipped = np.clip(arr, a_min=None, a_max=p95)
# Verify: np.max(arr_clipped) == p95


In [104]:
# Display the 95th-percentile threshold computed in the previous cell.
p95

np.float64(1.8454502827806256)