In [2]:
import numpy as np
import pandas as pd

X = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

In [3]:
mean = X.mean(axis=0)
std  = X.std(axis=0)
x_norm = (X-mean) / std
print(x_norm)

[[-1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487]]


In [4]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, np.nan, 30, np.nan]}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,
2,Charlie,30.0
3,David,


In [5]:
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
print(df)

      Name   Age
0    Alice  25.0
1      Bob  27.5
2  Charlie  30.0
3    David  27.5


In [6]:
df = pd.DataFrame({
    'A': [1, np.nan, 3],
    'B': [4, 5, np.nan],
    'C': [np.nan, np.nan, 9]
})
missing_value_count = df.isna().sum()
print(missing_value_count)

A    1
B    1
C    2
dtype: int64


In [7]:
df = pd.DataFrame({
    'A': [1, 2, np.nan, np.nan],
    'B': [np.nan, np.nan, np.nan, np.nan],
    'C': [5, 6, 7, 8]
})
df

Unnamed: 0,A,B,C
0,1.0,,5
1,2.0,,6
2,,,7
3,,,8


In [8]:
threshold = len(df)*0.5
threshold

2.0

In [9]:
df_cleaned = df.loc[:, df.isna().sum() <= threshold]
df_cleaned

Unnamed: 0,A,C
0,1.0,5
1,2.0,6
2,,7
3,,8


In [10]:
X = np.array([[1, 100, 3],
              [4, 200, 6],
              [7, 300, 9]])
means = X.mean(axis=0)
stds = X.std(axis=0)
medians = np.median(X, axis=0)

In [11]:
A = np.array([[1, 2],     # shape: (2, 2)
              [3, 4]])

B = np.array([[5, 6],     # shape: (2, 2)
              [7, 8]])

In [12]:
# A[:, np.newaxis, :] # shape (2,1,2)
# B[np.newaxis, :, : ] # shape (1,2,2)
diff = A[:, np.newaxis, :] - B[np.newaxis, :, :]  # shape: (2, 2, 2)
squared_diff = diff ** 2 # shape still (2, 2, 2)
sum_squared = squared_diff.sum(axis=2)
dists = np.sqrt(sum_squared)
print(dists)

[[5.65685425 8.48528137]
 [2.82842712 5.65685425]]


In [16]:
import math

A = [[1, 2],
     [3, 4]]

B = [[5, 6],
     [7, 8]]

In [20]:
def euclidean(p1, p2):
    return math.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)

In [21]:
result = []
for a in A:
    row = []
    for b in B:
        d = euclidean(a, b)
        row.append(d)
    result.append(row)
for r in result:
    print(r)

[5.656854249492381, 8.48528137423857]
[2.8284271247461903, 5.656854249492381]


In [25]:
Y = np.array([[1, 100, 3],
              [4, 200, 6],
              [7, 300, 9]])

In [27]:
means = Y.mean(axis=0)
stds = Y.std(axis=0)
medians = np.median(Y, axis=0)

print(means)

[  4. 200.   6.]


In [None]:
outlier = np.abs(Y-mean) > 2 *stds
outlier

array([[False, False, False],
       [False,  True, False],
       [False,  True, False]])

In [30]:
X[outlier] = np.take(medians, np.where(outlier)[1])
print(X)

[[  1 100   3]
 [  4 200   6]
 [  7 200   9]]


In [31]:
df = pd.DataFrame({
    'Name': ['John', 'Sara'],
    'Math': [90, 85],
    'English': [88, 92]
})

In [32]:
df['Score'] = df['Math'] + df['English']
print(df)

   Name  Math  English  Score
0  John    90       88    178
1  Sara    85       92    177


In [35]:
dists = np.sqrt(
    np.sum(A**2, axis=1)[:, np.newaxis] +   # shape (N,1)
    np.sum(B**2, axis=1)[np.newaxis, :] -   # shape (1,M)
    2 * np.dot(A, B.T)                      # shape (N,M)
)
dists

array([[5.65685425],
       [2.82842712]])

In [36]:
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [5, 2, 6, 4, 1]
})

In [37]:
df[df['A'].isin(df['B'])]
df

Unnamed: 0,A,B
0,1,5
1,2,2
2,3,6
3,4,4
4,5,1


In [38]:
df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'C'],
    'value': [10, 20, 30, 100, 200, 1000]
})
df

Unnamed: 0,group,value
0,A,10
1,A,20
2,A,30
3,B,100
4,B,200
5,C,1000


In [39]:
df['normalized'] = df['value'] - df.groupby('group').value.transform('mean')
df

Unnamed: 0,group,value,normalized
0,A,10,-10.0
1,A,20,0.0
2,A,30,10.0
3,B,100,-50.0
4,B,200,50.0
5,C,1000,0.0


In [40]:
from numpy.lib.stride_tricks import sliding_window_view
arr = np.array([1,2,3,4,5])
windows = sliding_window_view(arr, window_shape=3)
windows

array([[1, 2, 3],
       [2, 3, 4],
       [3, 4, 5]])

In [41]:
import numpy as np
a = np.array([1.0, np.nan, 3.0, np.nan])
a[np.isnan(a)] == np.nanmean(a)

array([False, False])

In [43]:
scores = np.array([5, 10, 15, 20])
labels = np.digitize(scores, bins=[10, 15])
labels

array([0, 1, 2, 2])

In [45]:
A = np.random.rand(100, 64)
B = np.random.rand(64, 32)
result = np.einsum('ij,jk->ik', A, B)
result

array([[16.18819565, 15.35992503, 15.74293335, ..., 17.83793713,
        16.69482021, 15.5265293 ],
       [16.02478283, 15.00347507, 15.68579976, ..., 18.01965052,
        15.63094016, 16.00269325],
       [14.25734285, 13.54943257, 13.69871075, ..., 14.47137024,
        13.87763972, 13.39592279],
       ...,
       [17.20606692, 14.80748826, 16.71755803, ..., 16.4766562 ,
        14.7577812 , 17.17545127],
       [15.03162225, 14.77341814, 16.06868997, ..., 15.61969051,
        13.94395685, 15.02246728],
       [14.90101214, 14.68375447, 14.45484661, ..., 15.09031658,
        13.84616254, 15.06720374]])

In [47]:
df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'score': [90, 85, 88, 75, 95, 80, 70]
})
df['rank'] = df.groupby('group')['score'].rank(ascending=True)
df

Unnamed: 0,group,score,rank
0,A,90,3.0
1,A,85,1.0
2,A,88,2.0
3,B,75,2.0
4,B,95,4.0
5,B,80,3.0
6,B,70,1.0


In [48]:
top3=df[df['rank'] <= 3]
top3

Unnamed: 0,group,score,rank
0,A,90,3.0
1,A,85,1.0
2,A,88,2.0
3,B,75,2.0
5,B,80,3.0
6,B,70,1.0


In [49]:
import pandas as pd

# Sample data
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Angela', 'David', 'Andy', 'Brian'],
    'score': [85, 90, 78, 88, 92, 80]
})
df

Unnamed: 0,name,score
0,Alice,85
1,Bob,90
2,Angela,78
3,David,88
4,Andy,92
5,Brian,80


In [50]:
result = df[df['name'].str.startswith('A')]
result

Unnamed: 0,name,score
0,Alice,85
2,Angela,78
4,Andy,92


In [51]:
df = pd.DataFrame({
    'col': ['A', 'B', 'C', 'A', 'B', 'D']
})
df

Unnamed: 0,col
0,A
1,B
2,C
3,A
4,B
5,D


In [52]:
df['col'] = df['col'].replace({'A':1, 'B':2})
df

Unnamed: 0,col
0,1
1,2
2,C
3,1
4,2
5,D


In [54]:
arr = np.array([10, 50, 30, 20, 40])
arr_sort = np.argsort(arr)
arr_sort

array([0, 3, 2, 4, 1])

In [56]:
top3_indices = arr_sort[-3:][::-1]
top3_indices

array([1, 4, 2])

In [58]:
print(arr[top3_indices])  # [50 40 30]

[50 40 30]
