# NumPy Examples

In [1]:
import numpy as np

## DataCamp Top 20

https://www.datacamp.com/blog/numpy-interview-questions

### (1) What is NumPy?

* Provides support for large, multi-dimentional arrays

* Provides mathematical functions

* Allows for vectorized operations (loops not needed)

* Provides foundation for more advanced packages: pandas, scikit learn, pytorch

* Data storage types are more efficient than Python native objects

### (2) Create a 1D array

In [4]:
arr = np.array([1, 2, 3])
arr

array([1, 2, 3])

### (3) Differences between NumPy array and Python list

NumPy lists:

* must be all of same type

* are more memory efficient

* support vectorized operations (loops not needed)

* operate natively with many mathematical operations

### (4) Size and shape of arrays

In [12]:
arr = np.array([[1, 2, 3], [4, 5, 6]])
print(arr.shape) # dimensions
print(arr.size) # total number of elements

(2, 3)
6


### (5) Reshape arrays

In [17]:
arr = np.array([1, 2, 3, 4, 5, 6])
print(arr)
print(arr.reshape((3,2))) # method
print(np.reshape(arr, (2,3))) # function

[1 2 3 4 5 6]
[[1 2]
 [3 4]
 [5 6]]
[[1 2 3]
 [4 5 6]]


### (6) Arrays of zeroes or ones

In [20]:
print(np.zeros((1,6)))
np.ones((3,2))

[[0. 0. 0. 0. 0. 0.]]


array([[1., 1.],
       [1., 1.],
       [1., 1.]])

### (7) Broadcasting

Ensures that dimensions match before operations

In [23]:
a = np.array([1, 2, 3]) # horizontal
b = np.array([[1], [1], [1]]) # vertical
print(a + b)

[[2 3 4]
 [2 3 4]
 [2 3 4]]


### (8) Basic statistics

In [44]:
uni_arr = np.random.uniform(0, 10, 100)
print(np.mean(uni_arr))
print(np.median(uni_arr))
print(np.std(uni_arr))

4.924511945827471
5.03524060064375
2.763200589928771


### (9) Vectorized if-else with `np.where`

In [46]:
nor_arr = np.random.normal(0, 10, 100)
np.where(nor_arr > 0, True, False)

array([False,  True, False, False,  True,  True,  True,  True,  True,
       False,  True, False,  True, False,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True,  True, False,  True,  True,  True, False, False,  True,
        True, False, False, False,  True, False,  True,  True,  True,
       False, False,  True, False, False, False,  True,  True, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True, False, False,  True,
       False,  True,  True,  True,  True,  True, False, False, False,
        True,  True, False, False,  True,  True,  True, False,  True,
       False,  True,  True, False, False,  True,  True,  True, False,
        True])

### (10) Vectorized mathematical operations (MSE)

In [48]:
arr1 = np.random.uniform(0, 10, 100)
arr2 = np.random.uniform(0, 10, 100)
n = len(arr1)

print( 1/n * np.sum( np.square(arr2 - arr1)) )
print( np.mean( np.square(arr2-arr1) ) )

17.195712832042254
17.19571283204225


### (11) Sliding window

In [54]:
from numpy.lib.stride_tricks import sliding_window_view

arr = np.arange(6)
print(arr)
print(sliding_window_view(arr, 3))
print(np.mean(sliding_window_view(arr, 3), axis=1)) # mean over horizontal rows

[0 1 2 3 4 5]
[[0 1 2]
 [1 2 3]
 [2 3 4]
 [3 4 5]]
[1. 2. 3. 4.]


### (12) Indexing

In [71]:
arr = np.array([[10, 15, 20, 25],
                  [30, 35, 40, 45],
                  [50, 55, 60, 65]])

condition = arr > 30
print(arr[ condition ])

rows = np.array([0, 1, 2])
cols = np.array([1, 2, 3])
print(arr[ rows, cols ])

[35 40 45 50 55 60 65]
[15 40 65]


### (13) Matrix decomposition

In [105]:
M = np.random.randn(9,9) # 9x9

U, s, Vt = np.linalg.svd(M, full_matrices=False) # M = U * S * V.T

V = Vt.T

print(U.shape, s.shape, V.shape) # 9x9, 9x1, 9x9

S = np.diag(s)

Mhat = np.dot(U, np.dot(S, V.T))

np.allclose(M,Mhat)

(9, 9) (9,) (9, 9)


True

### (14) Memory optimization

In [106]:
# large_array = np.memmap(filename mode='w+' shape=large_array_shape)

### (15) Missing values

In [128]:
arr = np.array([0, 1, 2, 3, 4])
arr = arr/arr
print(np.isnan(arr))
print(np.isinf(arr))

[ True False False False False]
[False False False False False]


  arr = arr/arr


### (16) Applying functions along rows/cols

In [134]:
arr = np.array([[10, 15, 20, 25],
                  [30, 35, 40, 45],
                  [50, 55, 60, 65]])
print(arr)

np.apply_along_axis(np.mean, axis=0, arr=arr)

[[10 15 20 25]
 [30 35 40 45]
 [50 55 60 65]]


array([30., 35., 40., 45.])

### (17) Feature scaling

In [139]:
data = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

minvals = np.min(data, axis=0)
maxvals = np.max(data, axis=0)
print(minvals)
print(maxvals)
scaled_data = (data - minvals)/(maxvals - minvals)
scaled_data

[1 2 3]
[7 8 9]


array([[0. , 0. , 0. ],
       [0.5, 0.5, 0.5],
       [1. , 1. , 1. ]])

### (18) Sort and index

In [None]:
# np.argsort() # provide indices to sort an array, e.g. to align two data sets

### (19) Random seed

In [145]:
np.random.seed(42)

np.random.normal(0, 1)

0.4967141530112327

### (20) K-means

In [146]:
# Generate a sample dataset
np.random.seed(42)  # For reproducibility
data = np.vstack([
    np.random.normal(loc=[1, 1], scale=0.5, size=(50, 2)),
    np.random.normal(loc=[5, 5], scale=0.5, size=(50, 2)),
    np.random.normal(loc=[9, 1], scale=0.5, size=(50, 2))
])
def k_means(X, k, max_iters=100, tol=1e-4):
    # Step 1: Initialize centroids randomly
    num_samples, num_features = X.shape
    centroids = X[np.random.choice(num_samples, k, replace=False)]
    
    for i in range(max_iters):
        # Step 2: Assign clusters
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        cluster_assignments = np.argmin(distances, axis=1)
        
        # Step 3: Update centroids
        new_centroids = np.array([X[cluster_assignments == j].mean(axis=0) for j in range(k)])
        
        # Check for convergence
        if np.all(np.linalg.norm(new_centroids - centroids, axis=1) < tol):
            break
        
        centroids = new_centroids
    
    return centroids, cluster_assignments

# Apply k-means clustering
k = 3
centroids, cluster_assignments = k_means(data, k)

In [147]:
centroids, cluster_assignments

(array([[1.13031116, 0.61786063],
        [0.76336861, 1.25884335],
        [7.00837347, 3.03522695]]),
 array([0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
        1, 1, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

## InterviewBit

https://www.interviewbit.com/numpy-interview-questions/#program-for-interchanging-two-axes-of-numpy-arrays

## Geeks for Geeks

https://www.geeksforgeeks.org/numpy-interview-questions/

## UPES

https://upesonline.ac.in/blog/frequently-asked-numpy-interview-questions

## OkCupid Data Test

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('../../../Downloads/OKCupid. Kirkegaard and Bjerrekær. 2016/temp/user_data_public.csv')

  df = pd.read_csv('../../../Downloads/OKCupid. Kirkegaard and Bjerrekær. 2016/temp/user_data_public.csv')


In [5]:
dfsmall = df.iloc[0:5,:]

In [6]:
del df

In [10]:
dfsmall.shape

(5, 2625)

In [12]:
dfsmall.iloc[:, 2605:2616]

Unnamed: 0,q85932,q85947,q86019,q86210,q86215,q86283,q86325,q86364,q86397,q86462,q86615
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,


In [15]:
column_names = sorted(dfsmall.columns)
column_names

['CA',
 'CA_items',
 'd_age',
 'd_astrology_seriosity',
 'd_astrology_sign',
 'd_bodytype',
 'd_country',
 'd_drinks',
 'd_drugs',
 'd_education_phase',
 'd_education_type',
 'd_ethnicity',
 'd_gender',
 'd_income',
 'd_job',
 'd_languages',
 'd_offspring_current',
 'd_offspring_desires',
 'd_orientation',
 'd_relationship',
 'd_religion_seriosity',
 'd_religion_type',
 'd_smokes',
 'gender',
 'gender2',
 'gender2_num',
 'gender_orientation',
 'lf_for',
 'lf_location',
 'lf_max_age',
 'lf_min_age',
 'lf_single',
 'lf_want',
 'p_adven',
 'p_aggre',
 'p_ambi',
 'p_arro',
 'p_artsy',
 'p_capi',
 'p_comp',
 'p_conf',
 'p_convenmoral',
 'p_cool',
 'p_dominant',
 'p_drug',
 'p_energetic',
 'p_exer',
 'p_explife',
 'p_explove',
 'p_expsex',
 'p_extro',
 'p_friendstrangers',
 'p_geeky',
 'p_giving',
 'p_greed',
 'p_honest',
 'p_inde',
 'p_indie',
 'p_introvert',
 'p_kind',
 'p_kinky',
 'p_laidback',
 'p_lit',
 'p_logic',
 'p_lovedri',
 'p_manners',
 'p_math',
 'p_oldfash',
 'p_opti',
 'p_organ

In [18]:
rel_columns = ['CA','CA_items','d_age','d_bodytype','d_ethnicity','gender','d_income','d_job','d_relationship']
dfsmall[rel_columns]

Unnamed: 0,CA,CA_items,d_age,d_bodytype,d_ethnicity,gender,d_income,d_job,d_relationship
0,0.76308,4.0,25.0,,White,Woman,,Education,Single
1,,0.0,20.0,,,Man,,,Single
2,0.661309,7.0,22.0,,,Woman,,,Single
3,,0.0,29.0,Thin,White,Woman,,Medicine,Single
4,0.875424,3.0,30.0,Average,,Woman,,,Single


### Research Questions

* Predicting innate personal characteristics from opinion data