# Python Basics

In [1]:
# Variable assignment
x = 5
x  # a cell displays the value of its last line automatically, unless that line is an assignment

5

In [2]:
# example for a format string
# the 'f' before the string indicates that it is a format string
# within the string, {variable_name} is substituted with the value of the variable
print(f"x={x}")
print(f"I can print the value of x here: {x} and again if I want to here: {x}, or I can make any expression within the curly brackets {x+1}")

# we can print a sequence of values if we just list everything within the print call
print("Text", x, "Another Text")

# Basic arithmetic within Python
print("Addition: x+2=", x + 2)
print("Subtraction: x-2=", x - 2)
print("Multiplication: x*2=", x * 2)
print("Division: x/2=", x / 2)
print("Floor division: x//2=", x // 2)
print("Modulus: x%2=", x % 2)
# note: the label below uses the math notation x^2, but the Python power operator is ** (in Python, ^ is bitwise xor)
print("Exponentiation: x^2=", x ** 2)

x=5
I can print the value of x here: 5 and again if I want to here: 5, or I can make any expression within the curly brackets 6
Text 5 Another Text
Addition: x+2= 7
Subtraction: x-2= 3
Multiplication: x*2= 10
Division: x/2= 2.5
Floor division: x//2= 2
Modulus: x%2= 1
Exponentiation: x^2= 25


In [3]:
x > 5  # comparison operators: <, <=, >, >=, ==, !=; logical operators: and, or, not

False

In [4]:
True | False  # bitwise operators: & (and), | (or), ^ (xor)

True

## Data Structures

In [5]:
# the name of the variable can be anything; I used a trailing underscore (_) after each name to differentiate them from the built-in names list, tuple, dict, set
list_ = [12, 4, 5, 8, 9]  # list, mutable (which means that it can be modified)
tuple_ = (12, 4, 6, 8, 9)  # tuple, immutable (which means it can not be modified)
dict_ = {'first': 12, 'second': 5, 'third': -66}  # dictionary or map
set_ = {3, 0, 1, 0, 3, 2, -3}  # set, every distinct value is stored only once

In [6]:
# we can print any of them
set_

{-3, 0, 1, 2, 3}

In [7]:
# we can access an element of a list or tuple as
list_[1]

4

In [8]:
# we can use so-called slicing, which selects multiple elements
# it is in the format of "start:stop:step_size"
list_[:2]  # first 2 elements

[12, 4]

In [9]:
list_[2:5]  # from index 2 to 5 (not included)

[5, 8, 9]

In [10]:
list_[1:6:2]  # from index 1 to 6 (not included) with the step size of 2

[4, 8]

In [11]:
# we can use negative indices as well, which count from the end of the list
list_[-1]  # last element of the list

9

In [12]:
list_[-2:]  # last two elements of the list

[8, 9]

In [13]:
list_[1:-1]  # every element except the first and the last

[4, 5, 8]

In [14]:
# by not providing value, the defaults are: start=0, stop=len(array), step_size=1
# so if we just wish to select every second element from the array we can just provide the step size as:
list_[::2]

[12, 5, 9]

In [15]:
# we can access the elements of a dictionary with its keys
dict_["second"]

5

## Loops and List Comprehension

In [16]:
# we can define a for loop within python as "for element_variable in structure"
# the loop variable stays defined after the loop ends, so it gets its own name here
# instead of reusing `x`, which would silently overwrite the x = 5 assigned in the first cell
for element in list_:
    print(element)

12
4
5
8
9


In [17]:
# we can define ranges
for i in range(5):
    print(i)

0
1
2
3
4


In [18]:
# we can iterate structures with indices
for i in range(len(list_)):
    print(list_[i])  # list_[i] accesses the ith element of list_

12
4
5
8
9


In [19]:
# or if we need the value and index at once
for i, value in enumerate(list_):
    print(i, value)

0 12
1 4
2 5
3 8
4 9


In [20]:
# iterating dictionaries is a little bit different
# accessing keys only
for key in dict_.keys():
    print(key)  # we can access the value as dict_[key] if we want to

first
second
third


In [21]:
# accessing values only
for value in dict_.values():
    print(value)

12
5
-66


In [22]:
# accessing both keys and values
for key, value in dict_.items():
    print(key, value)

first 12
second 5
third -66


In [23]:
# list comprehension is a compact way of building a list from another sequence
# for example, let's say we want to filter a list and keep only its positive values
numbers = [-1, 3, 2, -4, 5, 1, 4, -3, 1]

# we can do this normally as
positives = []  # empty list that will contain the positives

# the loop variable gets its own name (instead of `x`) because it stays defined after the loop
# and would otherwise overwrite a variable `x` that already exists in the notebook
for number in numbers:
    if number >= 0:                # keep the value if it is not negative (a zero would be kept too)
        positives.append(number)   # then append the value to the list of positives
positives

[3, 2, 5, 1, 4, 1]

In [24]:
# the same thing can be written more compactly as a list comprehension,
# which also runs faster than the previous version (why it runs faster doesn't matter here)
positives = [x for x in numbers if x >= 0]  # the final solution
positives

[3, 2, 5, 1, 4, 1]

In [25]:
# a list comprehension is a little bit more complicated than a simple for loop
# the overall syntax can be summarized as
# newlist = [expression for item in iterable <if condition == True, optional>]
# if the condition on the right side is True then it puts the value of the expression in the list, otherwise it continues with the next element
# the important bit is that the expression can be anything and it can make it look complicated
# lets say we want to do the following:
# 1. keep positive elements only
# 2. add 1 to the element if it is even, and raise the number to the power of 2 if it is odd

In [26]:
result = [x+1 if x % 2 == 0 else x**2 for x in numbers if x >= 0]
# the left side of the for is an expression "x+1 if x % 2 == 0 else x**2" and the right side of the for is a filter condition "if x >= 0"
# first Python checks "if x >= 0"; if it is True then the expression (left side) gets evaluated
# "x+1 if x % 2 == 0 else x**2" = write x+1 into the list if x % 2 == 0, otherwise write x**2 into the list
result

[9, 3, 25, 1, 5, 1]

In [27]:
# the list comprehension above is equal to the following:

result = []
for x in numbers:
    if x >=0:  # is positive
        if x % 2 == 0:  # is even
            result.append(x + 1)
        else:
            result.append(x**2)

result

[9, 3, 25, 1, 5, 1]

In [28]:
# a similar comprehension can be constructed for dictionaries as well (a dictionary comprehension), but it is a little bit different
# {key: expression for key, value in dictionary.items() if condition == True}
# for example, let's keep only the key-value pairs that have a positive value (note that ">= 0" would keep a zero as well)
{key: value for key, value in dict_.items() if value >=0}

{'first': 12, 'second': 5}

In [29]:
# In a similar manner as earlier, lets do the following:
# 1. keep positive key-value pairs only
# 2. add 1 to the value if it is even, and raise the number to the power of 2 if it is odd
{key: value+1 if value % 2 == 0 else value**2 for key, value in dict_.items() if value >=0}

{'first': 13, 'second': 25}

### Exercise

In [30]:
# Find all of the numbers from 1-1000 that are divisible by 7
[x for x in range(1, 1001) if x % 7 == 0]

[7,
 14,
 21,
 28,
 35,
 42,
 49,
 56,
 63,
 70,
 77,
 84,
 91,
 98,
 105,
 112,
 119,
 126,
 133,
 140,
 147,
 154,
 161,
 168,
 175,
 182,
 189,
 196,
 203,
 210,
 217,
 224,
 231,
 238,
 245,
 252,
 259,
 266,
 273,
 280,
 287,
 294,
 301,
 308,
 315,
 322,
 329,
 336,
 343,
 350,
 357,
 364,
 371,
 378,
 385,
 392,
 399,
 406,
 413,
 420,
 427,
 434,
 441,
 448,
 455,
 462,
 469,
 476,
 483,
 490,
 497,
 504,
 511,
 518,
 525,
 532,
 539,
 546,
 553,
 560,
 567,
 574,
 581,
 588,
 595,
 602,
 609,
 616,
 623,
 630,
 637,
 644,
 651,
 658,
 665,
 672,
 679,
 686,
 693,
 700,
 707,
 714,
 721,
 728,
 735,
 742,
 749,
 756,
 763,
 770,
 777,
 784,
 791,
 798,
 805,
 812,
 819,
 826,
 833,
 840,
 847,
 854,
 861,
 868,
 875,
 882,
 889,
 896,
 903,
 910,
 917,
 924,
 931,
 938,
 945,
 952,
 959,
 966,
 973,
 980,
 987,
 994]

In [31]:
# Find all of the numbers from 1-1000 that have a 3 in them 
[x for x in range(1, 1001) if "3" in str(x)]

[3,
 13,
 23,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 43,
 53,
 63,
 73,
 83,
 93,
 103,
 113,
 123,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 143,
 153,
 163,
 173,
 183,
 193,
 203,
 213,
 223,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 243,
 253,
 263,
 273,
 283,
 293,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 319,
 320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 337,
 338,
 339,
 340,
 341,
 342,
 343,
 344,
 345,
 346,
 347,
 348,
 349,
 350,
 351,
 352,
 353,
 354,
 355,
 356,
 357,
 358,
 359,
 360,
 361,
 362,
 363,
 364,
 365,
 366,
 367,
 368,
 369,
 370,
 371,
 372,
 373,
 374,
 375,
 376,
 377,
 378,
 379,
 380,
 381,
 382,
 383,
 384,
 385,
 386,
 387,
 388,
 389,
 390,
 391,
 392,
 393,
 394,
 395,
 396,
 397,
 398,
 399,
 403,
 413,
 423,
 430,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,


In [32]:
# Find all of the strings that have a "d" in them
words = ["middle", "catalogue", "exceed", "board", "familiar", "reward", "satellite", "grace", "respectable"]

[x for x in words if "d" in x]

['middle', 'exceed', 'board', 'reward']

## Functions and Lambdas

In [33]:
def add(a, b):
    """Return the result of `a + b`."""
    return a + b

add(2, 3)

5

In [34]:
# a lambda function is an inline anonymous function
# signature:
# lambda parameter_list: expression
add = lambda a, b: a + b

add(2, 3)

5

### Exercise

In [35]:
# make a lambda function that calculates the sum of positive values within a list
# use the earlier list "numbers" as input
# use the "sum" built-in function

# sum() accepts any iterable, so a generator expression is enough here; no temporary list has to be built
sum_pos = lambda inp: sum(x for x in inp if x > 0)

sum_pos(numbers)

16

# Numpy

- What is NumPy?
  - Python library for numerical computing.
  - Provides the `ndarray` ($N$ Dimensional Array) object for fast, vectorized operations.
- Why NumPy vs. plain Python lists?
  - Speed, memory efficiency, mathematical operations, broadcasting.

In [36]:
import numpy as np

## Matrix Creation

In [37]:
# creation from Python list
np.array([1, 5, -1])

array([ 1,  5, -1])

In [38]:
# 2D array
np.array([[1, 5, -1], [2, -6, 3]])

array([[ 1,  5, -1],
       [ 2, -6,  3]])

In [39]:
# allocate an array without initializing it (its values are arbitrary)
# it only reserves memory for the array, so whatever was stored at that address earlier shows up as the content of the array
# the only parameter is the shape which is a tuple in this case
np.empty((2, 3))  # empty array with 2 rows and 3 columns

array([[0., 0., 0.],
       [0., 0., 0.]])

In [40]:
# array filled with zeros
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [41]:
# array filled with ones
np.ones((2, 3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [42]:
# array filled with any constant value
np.full((2, 3), fill_value=7)

array([[7, 7, 7],
       [7, 7, 7]])

In [43]:
# creating a sequence of numbers
# np.arange(from, to, step_size)
np.arange(0, 10, 2)

array([0, 2, 4, 6, 8])

In [44]:
# return evenly spaced numbers over a specified interval (the end point is included)
# np.linspace(from, to, number_of_samples)
np.linspace(0, 10, 3)

array([ 0.,  5., 10.])

In [45]:
np.random.seed(0)   # sets the seed, this guarantees that everyone generates the same random numbers; without an explicit seed NumPy seeds itself from fresh OS entropy (or the clock as a fallback), so every run differs

In [46]:
np.random.rand(5)   # generates 5 random numbers from the uniform distribution over [0, 1)

array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ])

In [47]:
np.random.randn(5)  # generates 5 random numbers from the standard normal distribution (mean 0, variance 1)

array([-0.84272405,  1.96992445,  1.26611853, -0.50587654,  2.54520078])

In [48]:
np.random.randn(2, 5)  # generates random numbers from normal distribution with shape (2=rows, 5=columns)

array([[ 1.08081191,  0.48431215,  0.57914048, -0.18158257,  1.41020463],
       [-0.37447169,  0.27519832, -0.96075461,  0.37692697,  0.03343893]])

In [49]:
# generates a matrix of samples from a multivariate normal distribution
# first parameter is the expected value of each column: [0, 0, 0]
# second parameter is the covariance matrix of the columns: [[1, 0.3, 0.1], [0.3, 1, -0.5], [0.1, -0.5, 1]]
# third parameter is the number of sampled vectors (rows): 10
np.random.multivariate_normal([0, 0, 0], [[1, 0.3, 0.1], [0.3, 1, -0.5], [0.1, -0.5, 1]], 10)

array([[-1.50423714, -0.98811025, -0.5029329 ],
       [ 1.53141428,  0.21533174,  0.48396581],
       [ 0.90580116,  0.71829499,  1.36554954],
       [ 0.20466754, -0.94503822,  0.11772901],
       [ 1.24807438, -1.34321751,  1.79932714],
       [-0.20210781, -0.64725475, -0.50587925],
       [-0.37907779, -0.93381374,  0.00514848],
       [-2.15681569, -0.97032769, -0.15470106],
       [-0.06549545, -0.06250764, -0.1670241 ],
       [-0.9948253 , -0.58715758, -0.25731277]])

In [50]:
# Checking the dimensionality of a matrix/vector
# every ndarray has a shape attribute, which holds the size of each dimension
np.random.randn(2, 5).shape

(2, 5)

### Exercise

In [51]:
# create a vector with numbers going from 1 to 100
np.arange(1, 101)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [52]:
# create a vector with numbers going from 100 to 1
np.arange(100, 0, -1)

array([100,  99,  98,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,
        87,  86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,
        74,  73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  63,  62,
        61,  60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,
        48,  47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,
        35,  34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,
        22,  21,  20,  19,  18,  17,  16,  15,  14,  13,  12,  11,  10,
         9,   8,   7,   6,   5,   4,   3,   2,   1])

In [54]:
# create a random uniform matrix with 3 dimension, first dimension has a size of 20, second dimension has a size of 10, third dimension has a size of 100
rand_array = np.random.rand(20, 10, 100)

In [55]:
# print the shape of the previous array
rand_array.shape

(20, 10, 100)

In [56]:
# create a 3x3 matrix filled with -1
np.full((3, 3), fill_value=-1)

array([[-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])

## Indexing and Slicing

In [57]:
# lets say we have the following 2D array
X = np.random.randn(2, 5)
X

array([[-0.02797118, -0.95543822, -2.2140114 ,  0.66995701,  0.15731949],
       [-2.28176183, -0.53971977, -0.85383656,  1.58448495,  1.22748888]])

In [58]:
# we can access a row by
X[0]  # or X[0, :]; ":" selects all elements within that dimension and can be omitted in this case

array([-0.02797118, -0.95543822, -2.2140114 ,  0.66995701,  0.15731949])

In [59]:
# we can access a column by
X[:, 0]  # ":" selects all row and from all row selects column 0

array([-0.02797118, -2.28176183])

In [60]:
# we can more precisely select sub-matrices
X[:2, :2]  # select the first 2 row, and select the first 2 columns. ":2" is a shorthand for "0:2"

array([[-0.02797118, -0.95543822],
       [-2.28176183, -0.53971977]])

In [61]:
# lets shift this selection windows by 1
X[:2, 1:3]

array([[-0.95543822, -2.2140114 ],
       [-0.53971977, -0.85383656]])

In [62]:
# we can even use negative indices, which count from the end of that dimension
X[:2, -2:]  # this is the same as X[:2, X.shape[1]-2:]

array([[0.66995701, 0.15731949],
       [1.58448495, 1.22748888]])

In [63]:
X[:2, X.shape[1]-2:]

array([[0.66995701, 0.15731949],
       [1.58448495, 1.22748888]])

### Exercise

In [64]:
# select the last column of the matrix X
X[:, -1]

array([0.15731949, 1.22748888])

In [65]:
# from the last row select all element except the first and last column
X[-1, 1:-1]

array([-0.53971977, -0.85383656,  1.58448495])

## Basic Operations

### Vectors

In [66]:
# lets generate 2 vectors
a = np.arange(1, 6)
b = np.arange(1, 11, 2)

In [67]:
a

array([1, 2, 3, 4, 5])

In [68]:
b

array([1, 3, 5, 7, 9])

In [69]:
# element-wise addition
a + b

array([ 2,  5,  8, 11, 14])

In [70]:
# element-wise subtraction
a - b

array([ 0, -1, -2, -3, -4])

In [71]:
# element-wise multiplication
a * b

array([ 1,  6, 15, 28, 45])

In [72]:
# element-wise division
a / b

array([1.        , 0.66666667, 0.6       , 0.57142857, 0.55555556])

In [73]:
# the @ symbol is Python's matrix multiplication operator; for two 1D vectors it computes the inner product
a @ b

np.int64(95)

In [74]:
np.inner(a, b)  # same as above

np.int64(95)

In [75]:
# outer product of vectors
np.outer(a, b)

array([[ 1,  3,  5,  7,  9],
       [ 2,  6, 10, 14, 18],
       [ 3,  9, 15, 21, 27],
       [ 4, 12, 20, 28, 36],
       [ 5, 15, 25, 35, 45]])

### Matrices

Generally the same operations can be used.

In [76]:
A = np.random.randn(2, 5)
B = np.random.randn(2, 5)

In [77]:
A

array([[ 0.12585735,  0.27970633,  1.13560639,  0.34497634, -0.64259535],
       [ 1.03438622, -0.39439203,  0.17620982, -0.50576309,  1.28557353]])

In [78]:
# transpose
A.T

array([[ 0.12585735,  1.03438622],
       [ 0.27970633, -0.39439203],
       [ 1.13560639,  0.17620982],
       [ 0.34497634, -0.50576309],
       [-0.64259535,  1.28557353]])

In [79]:
# matrix multiplication
A @ B.T  # same as np.matmul(A, B.T)

array([[ 0.80583635,  1.62243314],
       [-2.17816274,  0.04733462]])

## Universal Functions

This is a limited list of functions.

In [80]:
np.abs(A)  # absolute value

array([[0.12585735, 0.27970633, 1.13560639, 0.34497634, 0.64259535],
       [1.03438622, 0.39439203, 0.17620982, 0.50576309, 1.28557353]])

In [81]:
np.sqrt(np.abs(A))  # square root (abs is applied first because np.sqrt returns nan for negative real inputs)

array([[0.35476379, 0.5288727 , 1.06564834, 0.58734687, 0.80162045],
       [1.01704779, 0.62800639, 0.41977354, 0.71117022, 1.13383135]])

In [82]:
np.power(A, 3)  # power of 3

array([[ 1.99358952e-03,  2.18830015e-02,  1.46448012e+00,
         4.10551774e-02, -2.65346116e-01],
       [ 1.10674655e+00, -6.13457370e-02,  5.47129787e-03,
        -1.29372327e-01,  2.12466646e+00]])

In [83]:
np.sin(A)   # sine, similarly np.cos, np.tan... can be used

array([[ 0.12552535,  0.2760734 ,  0.90678998,  0.33817441, -0.59927515],
       [ 0.85954884, -0.38424696,  0.17529936, -0.48447513,  0.95959899]])

## Reduction Operations

This is a limited list of functions.

In [84]:
np.sum(A)  # sums all value of the input

np.float64(2.839565510790611)

In [85]:
np.mean(A)  # average of all elements

np.float64(0.2839565510790611)

In [86]:
np.min(A)  # minimum of all elements

np.float64(-0.6425953496660816)

In [87]:
np.max(A)  # maximum of all elements

np.float64(1.2855735274180435)

In [88]:
# for reduction operation we can specify the direction as well
# for example, if we want to calculate the mean of all rows or mean of all columns independently
A.shape

(2, 5)

In [89]:
np.mean(A, axis=0)  # mean of all columns

array([ 0.58012178, -0.05734285,  0.65590811, -0.08039337,  0.32148909])

In [90]:
np.mean(A, axis=1)  # mean of all rows

array([0.24871021, 0.31920289])

## Reshaping

In [91]:
# we already discussed one of the simplest "reshaping" feature: transposition
A.T

array([[ 0.12585735,  1.03438622],
       [ 0.27970633, -0.39439203],
       [ 1.13560639,  0.17620982],
       [ 0.34497634, -0.50576309],
       [-0.64259535,  1.28557353]])

In [92]:
# we can reshape this matrix as we wish (within certain constraints)
A_ = A.reshape(1, 10)
A_.shape

(1, 10)

In [93]:
# reshape can take -1 as a parameter once, it means that numpy should calculate that size from the other values provided
# an example

# a vector with 18 elements
d = np.random.randn(18)
d.reshape(-1, 3)  # it means that I want 3 columns and numpy should infer the number of rows, so it will just calculate 18/3=6 as the number of rows

array([[-0.81560399, -0.23608568,  1.21568023],
       [ 1.53362803,  0.12731766, -0.86448979],
       [-1.69458313, -0.73411366, -0.70291698],
       [-0.8084669 , -1.10107604, -1.24449648],
       [ 0.82409125, -0.49420312,  0.08728518],
       [ 0.38186301, -0.81453144,  2.12891516]])

In [94]:
# we can transform an array to a vector
A_ = A.flatten()
A_.shape

(10,)

In [95]:
# we can concatenate different matrices as long as they have the same dimensions (other than the dimension of concatenation)
A_ = np.concatenate([A, B], axis=0)
A_.shape

(4, 5)

In [96]:
A_ = np.concatenate([A, B], axis=1)
A_.shape

(2, 10)

## Boolean Selection

In [97]:
# we can mask operation with boolean masks
# a boolean mask is just a vector or matrix (same shape as the input) that has a True value for each position where we wish to make some modification
# lets start with vectors
np.random.seed(0)
a = np.random.randn(10)
a

array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799,
       -0.97727788,  0.95008842, -0.15135721, -0.10321885,  0.4105985 ])

In [98]:
# let's select the non-negative values (the mask a >= 0 keeps zeros as well)
a[a >= 0]

array([1.76405235, 0.40015721, 0.97873798, 2.2408932 , 1.86755799,
       0.95008842, 0.4105985 ])

In [99]:
# where a >= 0 is just a vector with True values where the condition is satisfied
a >= 0

array([ True,  True,  True,  True,  True, False,  True, False, False,
        True])

In [100]:
# now lets set each negative value to 0
a[a < 0] = 0
a

array([1.76405235, 0.40015721, 0.97873798, 2.2408932 , 1.86755799,
       0.        , 0.95008842, 0.        , 0.        , 0.4105985 ])

In [101]:
# now let's add 1 to each value that is less than 1
a[a < 1] += 1
a

array([1.76405235, 1.40015721, 1.97873798, 2.2408932 , 1.86755799,
       1.        , 1.95008842, 1.        , 1.        , 1.4105985 ])

In [102]:
# The same thing can be done with matrices, but the mask has the shape of the input matrix
np.random.seed(0)
A = np.random.randn(4, 4)
A

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ],
       [ 1.86755799, -0.97727788,  0.95008842, -0.15135721],
       [-0.10321885,  0.4105985 ,  0.14404357,  1.45427351],
       [ 0.76103773,  0.12167502,  0.44386323,  0.33367433]])

In [103]:
# the selection returns a flat vector of the selected values, because the number of selected values generally can not be reshaped into a matrix
A[A>=0]

array([1.76405235, 0.40015721, 0.97873798, 2.2408932 , 1.86755799,
       0.95008842, 0.4105985 , 0.14404357, 1.45427351, 0.76103773,
       0.12167502, 0.44386323, 0.33367433])

In [104]:
# the mask
A>=0

array([[ True,  True,  True,  True],
       [ True, False,  True, False],
       [False,  True,  True,  True],
       [ True,  True,  True,  True]])

In [105]:
# but we can do everything else just like earlier
# lets set each negative value to 0
A[A < 0] = 0  # this is an inplace operation
A

array([[1.76405235, 0.40015721, 0.97873798, 2.2408932 ],
       [1.86755799, 0.        , 0.95008842, 0.        ],
       [0.        , 0.4105985 , 0.14404357, 1.45427351],
       [0.76103773, 0.12167502, 0.44386323, 0.33367433]])

In [106]:
# now let's add 1 to each value that is less than 1
A[A < 1] += 1  # this is an inplace operation
A

array([[1.76405235, 1.40015721, 1.97873798, 2.2408932 ],
       [1.86755799, 1.        , 1.95008842, 1.        ],
       [1.        , 1.4105985 , 1.14404357, 1.45427351],
       [1.76103773, 1.12167502, 1.44386323, 1.33367433]])

In [107]:
# we can also select specific rows or columns with vector like masks
# lets say we want select each row with an expected value of less than 1.45

# first calculate the expected value for each row
expected_value = np.mean(A, axis=1)
expected_value

array([1.84596018, 1.4544116 , 1.2522289 , 1.41506258])

In [108]:
# then make the mask
mask = (expected_value < 1.45)
mask

array([False, False,  True,  True])

In [109]:
# then we have to index the correct dimension with this mask
A[mask]

array([[1.        , 1.4105985 , 1.14404357, 1.45427351],
       [1.76103773, 1.12167502, 1.44386323, 1.33367433]])

In [110]:
# the same can be done with the matrix columns

# we have to reduce along the other axis (axis=0 gives one mean per column)
expected_value = np.mean(A, axis=0)
mask = (expected_value < 1.45)
# we have to mask the other axis
A[:, mask]  # select all rows, select columns with mask

array([[1.40015721],
       [1.        ],
       [1.4105985 ],
       [1.12167502]])

In [111]:
mask  # the value of the mask

array([False,  True, False, False])

## Broadcasting

Broadcasting is an important concept within any matrix engine. It can be seen as an efficient method to fill in values when making operations between arrays of different shapes, for example between an $N$ dimensional and an $N+1$ dimensional matrix. For details, see the [NumPy Broadcasting Guide](https://numpy.org/doc/stable/user/basics.broadcasting.html).

![broadcasting](https://numpy.org/doc/stable/_images/broadcasting_1.png)


![broadcasting2](https://numpy.org/doc/stable/_images/broadcasting_2.png)

In [112]:
# lets demonstrate it by following the example on the first image
a = np.array([1, 2, 3])
b = np.array([2])

# in order to be able to multiply each element with 2 we have to virtually stretch vector b, in other words broadcast it to all other positions. 
a * b

array([2, 4, 6])

In [113]:
# a more interesting example can be seen in the second image
# we have to broadcast the values of vector b to all rows

a = np.array([[0, 0, 0], [10, 10, 10], [20, 20, 20], [30, 30, 30]])
b = np.array([1, 2, 3])

a + b

array([[ 1,  2,  3],
       [11, 12, 13],
       [21, 22, 23],
       [31, 32, 33]])

In [114]:
# let's see a different scenario
# let's say we want to add the values of vector b to each column
# first we have to add an extra element to vector b to match the number of rows

a = np.array([[0, 0, 0], [10, 10, 10], [20, 20, 20], [30, 30, 30]])
b = np.array([1, 2, 3, 4])

# a + b fails: broadcasting compares shapes starting from the last axis, and 3 (columns of a) does not match 4 (length of b)
# the error is the point of this cell, so it is caught and printed; an uncaught exception would stop "Restart & Run All" here
try:
    a + b
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

![bb](https://numpy.org/doc/stable/_images/broadcasting_3.png)

In [115]:
# to solve this problem we have to add a virtual dimension to our vector

# -1 means that numpy should calculate the size at that position
# numpy knows the total number of values, and it knows that we want size 1 at the second position, so it will just infer "num_values/1"
b_ = b.reshape(-1, 1)
b_.shape  # 4 rows, 1 column

(4, 1)

In [116]:
# so the final solution is
a + b_

array([[ 1,  1,  1],
       [12, 12, 12],
       [23, 23, 23],
       [34, 34, 34]])

In [117]:
# a shorter solution
a + b[:, None]  # b[:, None] gives the same result as b.reshape(-1, 1): it keeps all values along the rows and appends a new dimension of size 1

array([[ 1,  1,  1],
       [12, 12, 12],
       [23, 23, 23],
       [34, 34, 34]])

## Exercise

In [119]:
# Create a NumPy array of numbers from 1 to 36.
# Reshape it into a 6×6 matrix.
# Print its shape
M = np.arange(1, 37).reshape(6, -1)
M.shape

(6, 6)

In [121]:
# Extract the third row.
M[2]

array([13, 14, 15, 16, 17, 18])

In [122]:
# Extract the last column.
M[:, -1]

array([ 6, 12, 18, 24, 30, 36])

In [123]:
# Extract the submatrix of the middle 4×4 block.
M[1:-1, 1:-1]

array([[ 8,  9, 10, 11],
       [14, 15, 16, 17],
       [20, 21, 22, 23],
       [26, 27, 28, 29]])

In [124]:
# Multiply the 6×6 matrix by 2 (elementwise).
M * 2

array([[ 2,  4,  6,  8, 10, 12],
       [14, 16, 18, 20, 22, 24],
       [26, 28, 30, 32, 34, 36],
       [38, 40, 42, 44, 46, 48],
       [50, 52, 54, 56, 58, 60],
       [62, 64, 66, 68, 70, 72]])

In [125]:
# Compute the mean of all elements.
np.mean(M)

np.float64(18.5)

In [128]:
# Compute the variance of each column.
np.var(M, axis=0)

array([105., 105., 105., 105., 105., 105.])

In [130]:
# Create a vector [10, 20, 30, 40, 50, 60].
# Add this vector to each column of the 6×6 matrix.

v = np.array([10, 20, 30, 40, 50, 60])
M + v[:, None]

array([[11, 12, 13, 14, 15, 16],
       [27, 28, 29, 30, 31, 32],
       [43, 44, 45, 46, 47, 48],
       [59, 60, 61, 62, 63, 64],
       [75, 76, 77, 78, 79, 80],
       [91, 92, 93, 94, 95, 96]])

In [132]:
# Generate an array of random numbers from a uniform distribution with a shape of (1000, 50)
# Calculate the variance of each row
M2 = np.random.rand(1000, 50)
np.var(M2, axis=1)

array([0.08019682, 0.08062986, 0.07869888, 0.0996127 , 0.07540383,
       0.08389305, 0.07131251, 0.07149637, 0.09836649, 0.0810572 ,
       0.08350971, 0.08497768, 0.06285043, 0.08126365, 0.07164099,
       0.07116978, 0.07659481, 0.08040454, 0.10521286, 0.07911346,
       0.07489526, 0.091466  , 0.09572065, 0.08857888, 0.07259481,
       0.07858917, 0.08627708, 0.08353222, 0.07741176, 0.07704974,
       0.07974892, 0.07555815, 0.1073193 , 0.06879883, 0.07262031,
       0.08643014, 0.0935747 , 0.08360531, 0.07134348, 0.08112006,
       0.06556117, 0.09052378, 0.0818913 , 0.06522798, 0.07683673,
       0.08796303, 0.07627183, 0.08658931, 0.09728523, 0.08226509,
       0.09170128, 0.07937658, 0.07948845, 0.07653938, 0.0959262 ,
       0.07006974, 0.06130753, 0.08061008, 0.08719739, 0.09334893,
       0.08697537, 0.07102049, 0.09780129, 0.08104294, 0.08251831,
       0.10258259, 0.09299911, 0.07921593, 0.09220468, 0.05805966,
       0.08660335, 0.07210917, 0.07303892, 0.0601403 , 0.08316

In [133]:
# Exercise: draw 1000 samples from a normal distribution (np.random.randn)
# and report their mean, standard deviation, minimum and maximum.
np.random.seed(0)  # fixed seed, so the samples are the same on every run
M3 = np.random.randn(1000)

# the ndarray methods compute the same reductions as the np.* functions
mean, std = M3.mean(), M3.std()
min_, max_ = M3.min(), M3.max()

(mean, std, min_, max_)

(np.float64(-0.045256707490195384),
 np.float64(0.9870331586690257),
 np.float64(-3.0461430547999266),
 np.float64(2.759355114021582))

In [136]:
# Count how many numbers are between -1 and 1.
M3[(M3 > -1) & (M3 < 1)].shape[0]

699

In [137]:
# Exercise: build a fake dataset with shape (100, 3) from np.random.rand (values between 0 and 1).
# -> Column 1 = Age (scaled to 18–60).
# -> Column 2 = Height (scaled to 150–200).
# -> Column 3 = Weight (scaled to 50–100).
# Then compute the average age, height and weight,
# and select all rows where age > 30 and weight < 70.


# 1. Raw uniform samples in [0, 1)
data = np.random.rand(100, 3)

# 2. Rescale every column in place to its target range: low + rand * (high - low)
column_ranges = (
    (18, 60),    # Age
    (150, 200),  # Height
    (50, 100),   # Weight
)
for column, (low, high) in enumerate(column_ranges):
    data[:, column] = low + data[:, column] * (high - low)

# 3. Column-wise averages (axis=0 reduces over the rows)
avg_age, avg_height, avg_weight = data.mean(axis=0)

print("Average age:", avg_age)
print("Average height:", avg_height)
print("Average weight:", avg_weight)

# 4. Boolean row mask: age (column 0) above 30 and weight (column 2) below 70
is_older_than_30 = data[:, 0] > 30
is_lighter_than_70 = data[:, 2] < 70
mask = is_older_than_30 & is_lighter_than_70
selected_rows = data[mask]

print("Selected rows:\n", selected_rows)

Average age: 39.52472914435385
Average height: 178.26868051742613
Average weight: 76.14055986958195
Selected rows:
 [[ 43.82033522 151.85648019  50.71257576]
 [ 58.35412621 153.25607343  52.22855555]
 [ 47.15166975 189.0596422   58.44630579]
 [ 30.42746301 165.16459607  67.79445773]
 [ 52.03268742 178.8795045   53.76386399]
 [ 49.06456505 197.84352846  63.89949724]
 [ 50.54495073 197.20162332  51.83457089]
 [ 39.67719543 160.99303883  63.71478519]
 [ 45.11286037 167.24282933  55.05537453]
 [ 55.26589235 188.25349746  65.67953059]
 [ 59.59547793 195.60754765  55.9174717 ]
 [ 46.78962013 152.47498721  55.09273076]
 [ 40.42707279 197.2101359   69.83050561]
 [ 38.56676632 156.58436409  69.85068334]
 [ 47.58486465 164.24427603  55.19940388]
 [ 55.00990656 195.82095054  63.57755477]
 [ 50.29997376 183.89474848  65.99169447]
 [ 52.8800065  152.89546384  64.56944103]
 [ 33.72538732 167.69095163  68.91339086]
 [ 59.29754433 151.52132577  59.68116452]
 [ 36.76531946 191.84951826  61.09120153]
 [

# Pandas

What is Pandas?
- Library for working with tabular data (like spreadsheets or SQL tables).
- Provides `Series` (1D) and `DataFrame` (2D).

In [141]:
import pandas as pd

## DataFrame Creation

In [142]:
# from dictionary:
# every key becomes a column name and every list holds that column's values (one entry per row)
input_data = dict(
    names=["John", "Angela", "Mike", "Gustave", "Emili"],
    age=[51, 22, 30, 17, 25],
    height=[171, 165, 188, 155, 160],
    salery=[500, 600, 700, 150, 650],
)
df = pd.DataFrame(input_data)
df

Unnamed: 0,names,age,height,salery
0,John,51,171,500
1,Angela,22,165,600
2,Mike,30,188,700
3,Gustave,17,155,150
4,Emili,25,160,650


In [143]:
# from numpy
# NOTE: a NumPy array holds a single dtype, so mixing names with numbers
# turns every value (including the numbers) into a string
input_data = np.array(
    [
        ["John", "Angela", "Mike", "Gustave", "Emili"],
        [51, 22, 30, 17, 25],
        [171, 165, 188, 155, 160],
        [500, 600, 700, 150, 650]
    ]
)

column_names = ["names", "age", "height", "salery"]

# we have to provide the column names explicitly
# note that I used a transposition so that each feature aligns with a column
df = pd.DataFrame(data=input_data.T, columns=column_names)

# the numeric columns arrived as strings (see the note above), so convert them back;
# without this step df["age"].mean() or df["age"] > 30 would not work as expected
df = df.astype({"age": "int64", "height": "int64", "salery": "int64"})
df

Unnamed: 0,names,age,height,salery
0,John,51,171,500
1,Angela,22,165,600
2,Mike,30,188,700
3,Gustave,17,155,150
4,Emili,25,160,650


In [144]:
# pandas can read many file formats directly; we only have to provide a local path or a URL
# CSV: read_csv (with sep='\t' it reads TSV files as well)
# Excel: read_excel
# JSON: read_json
# XML: read_xml
# ...
# check the API reference for the full list

# for now we are going to read the bike rental dataset
# (the URL points at a fixed commit of the repository; index_col=0 uses the first CSV column as the row index)
df = pd.read_csv("https://github.com/ficstamas/data-mining/raw/b76d5b7913c446878fa47de8861c83e26780828f/data/rental.csv", index_col=0)
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0
...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0


In [145]:
# we can access the names of the columns as
df.columns

Index(['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

In [146]:
# to access a single column (the result is a Series)
# the attribute form only works when the column name is a valid Python identifier
# that does not clash with an existing DataFrame attribute or method
df["temp"]  # or df.temp

0      24.175849
1      25.083466
2      17.229108
3      17.400000
4      18.666979
         ...    
726    19.945849
727    19.906651
728    19.906651
729    20.024151
730    18.144151
Name: temp, Length: 731, dtype: float64

In [147]:
# there are two ways to select a row from a dataframe: by position (.iloc) or by index label (.loc)
df.iloc[0]  # position-based: selects the row that is first (position 0) in the current order

season           spring
yr                 2011
mnth            january
holiday             0.0
weekday             6.0
workingday          0.0
weathersit          2.0
temp          24.175849
atemp          39.99925
hum             80.5833
windspeed     10.749882
cnt               985.0
Name: 0, dtype: object

In [148]:
df.loc[0]  # label-based: selects the row whose index label is 0 (the left-most column of the displayed dataframe); here it is the same row as df.iloc[0]

season           spring
yr                 2011
mnth            january
holiday             0.0
weekday             6.0
workingday          0.0
weathersit          2.0
temp          24.175849
atemp          39.99925
hum             80.5833
windspeed     10.749882
cnt               985.0
Name: 0, dtype: object

In [149]:
# the difference between .iloc and .loc is easier to see when the index labels are not simply 0, 1, 2, ...
# for demonstration purposes, I'm going to draw 10 random rows from the whole dataframe;
# the sampled rows keep their original index labels
sub_df = df.sample(n=10, random_state=0)  # .sample draws random rows: n is the number of rows to draw, random_state fixes the seed so the draw is reproducible
sub_df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
196,fall,2011,july,0.0,6.0,0.0,1.0,40.273349,58.125358,58.5,13.958914,5923.0
187,fall,2011,july,0.0,4.0,1.0,1.0,43.25,61.333486,65.125,10.6664,4592.0
14,spring,2011,january,0.0,6.0,0.0,2.0,18.966651,32.375392,49.875,10.583521,1248.0
31,spring,2011,february,0.0,2.0,1.0,2.0,17.032178,31.47898,82.9565,3.565271,1360.0
390,spring,2012,january,0.0,4.0,1.0,2.0,24.058349,39.4993,76.9583,4.917519,4075.0
319,winter,2011,november,0.0,3.0,1.0,3.0,29.463349,45.831208,93.0,9.167543,1817.0
299,winter,2011,october,0.0,4.0,1.0,2.0,30.09,46.165036,81.2917,13.250121,2659.0
702,winter,2012,december,0.0,1.0,1.0,1.0,29.2675,46.082536,76.75,5.542294,6234.0
462,summer,2012,april,0.0,6.0,0.0,1.0,28.5625,44.124514,25.4167,18.416357,6857.0
27,spring,2011,january,0.0,5.0,1.0,2.0,17.563466,30.738922,79.3043,8.2611,1167.0


## Indexing and Slicing

In [150]:
# sub_df.iloc[0] returns the first row by position, whatever its index label is (196 here)
sub_df.iloc[0]

season             fall
yr                 2011
mnth               july
holiday             0.0
weekday             6.0
workingday          0.0
weathersit          1.0
temp          40.273349
atemp         58.125358
hum                58.5
windspeed     13.958914
cnt              5923.0
Name: 196, dtype: object

In [151]:
# but sub_df.loc[0] raises an error, because none of the sampled rows has the index label 0
# to get the same element you have to provide the index label of the first row
sub_df.loc[196]

season             fall
yr                 2011
mnth               july
holiday             0.0
weekday             6.0
workingday          0.0
weathersit          1.0
temp          40.273349
atemp         58.125358
hum                58.5
windspeed     13.958914
cnt              5923.0
Name: 196, dtype: object

In [152]:
# we can slice just like we would in other libraries; this takes the first 10 rows by position
df.iloc[:10]

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.99925,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.50073,43.7273,16.636703,1349.0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.4,30.000052,59.0435,10.739832,1562.0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.13182,43.6957,12.5223,1600.0
5,spring,2011,january,0.0,4.0,1.0,1.0,17.604356,31.391794,51.8261,6.000868,1606.0
6,spring,2011,january,0.0,5.0,1.0,2.0,17.236534,29.783374,49.8696,11.304642,1510.0
7,spring,2011,january,0.0,6.0,0.0,2.0,15.755,26.708764,53.5833,17.875868,959.0
8,spring,2011,january,0.0,0.0,0.0,1.0,14.501651,23.66755,43.4167,24.25065,822.0
9,spring,2011,january,0.0,1.0,1.0,1.0,15.089151,25.958608,48.2917,14.958889,1321.0


## Inspecting Data

In [153]:
# displays the first 5 rows (5 is also the default when no argument is given)
df.head(5)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.99925,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.50073,43.7273,16.636703,1349.0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.4,30.000052,59.0435,10.739832,1562.0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.13182,43.6957,12.5223,1600.0


In [154]:
# displays the last 5 rows (5 is also the default when no argument is given)
df.tail(5)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0,10.416557,3095.0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.9984,75.2917,8.333661,1341.0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.2922,48.3333,23.500518,1796.0
730,spring,2012,december,0.0,1.0,1.0,2.0,18.144151,30.750142,57.75,10.374682,2729.0


In [155]:
# displays metadata: number of rows, column names, non-null counts, data types, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    object 
 1   yr          731 non-null    int64  
 2   mnth        731 non-null    object 
 3   holiday     731 non-null    float64
 4   weekday     731 non-null    float64
 5   workingday  731 non-null    float64
 6   weathersit  731 non-null    float64
 7   temp        731 non-null    float64
 8   atemp       731 non-null    float64
 9   hum         731 non-null    float64
 10  windspeed   731 non-null    float64
 11  cnt         731 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 90.4+ KB


In [156]:
# calculates the count, mean, standard deviation, min/max, and quartiles for each numeric column
# (the text columns season and mnth are left out of the result)
# it is good for a fast, first inspection
df.describe()

Unnamed: 0,yr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,2011.500684,0.028728,2.997264,0.683995,1.395349,31.283085,47.307363,62.789406,12.762576,4504.348837
std,0.500342,0.167155,2.004787,0.465233,0.544894,8.603397,10.755438,14.24291,5.192357,1937.211452
min,2011.0,0.0,0.0,0.0,1.0,10.779129,21.218594,0.0,1.500244,22.0
25%,2011.0,0.0,1.0,0.0,1.0,23.842925,38.297605,52.0,9.04165,3152.0
50%,2012.0,0.0,3.0,1.0,1.0,31.421651,48.124378,62.6667,12.125325,4548.0
75%,2012.0,0.0,5.0,1.0,2.0,38.804575,56.167732,73.02085,15.625371,5956.0
max,2012.0,1.0,6.0,1.0,3.0,48.498349,71.499136,97.25,34.000021,8714.0


## Filtering

In [157]:
# just like in numpy, we can use boolean masks to filter the dataset
# let's filter for the data points that were recorded in 2012
df[df.yr == 2012]

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
365,spring,2012,january,0.0,0.0,0.0,1.0,25.390000,40.790986,69.2500,12.875189,2294.0
366,spring,2012,january,1.0,1.0,0.0,1.0,20.833021,32.652064,38.1304,22.087555,1951.0
367,spring,2012,january,0.0,2.0,1.0,1.0,15.050000,24.334150,44.1250,24.499957,2236.0
368,spring,2012,january,0.0,3.0,1.0,2.0,13.052500,23.876242,41.4583,12.374900,2368.0
369,spring,2012,january,0.0,4.0,1.0,1.0,20.494151,34.375192,52.4167,8.709129,3272.0
...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0


In [158]:
# now let's filter for the data points that were recorded in the summer of 2012
# here "&" performs an element-wise logical AND between the two boolean masks
# when we combine two masks in this manner the parentheses are important,
# because "&" binds more tightly than "==" and would otherwise be evaluated first
# so the parentheses ensure the intended order of operations
df[(df.yr == 2012) & (df.season == "summer")]

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
445,summer,2012,march,0.0,3.0,1.0,2.0,32.988349,49.875028,82.1250,6.000406,6230.0
446,summer,2012,march,0.0,4.0,1.0,1.0,34.045849,51.083422,83.1250,7.876654,6871.0
447,summer,2012,march,0.0,5.0,1.0,2.0,36.278349,53.624422,69.4167,7.792100,8362.0
448,summer,2012,march,0.0,6.0,0.0,2.0,31.617500,48.124378,88.5417,12.916461,3372.0
449,summer,2012,march,0.0,0.0,0.0,2.0,28.562500,44.874208,88.0833,14.791925,4996.0
...,...,...,...,...,...,...,...,...,...,...,...,...
532,summer,2012,june,0.0,6.0,0.0,1.0,37.688349,55.250728,50.4167,11.166689,7702.0
533,summer,2012,june,0.0,0.0,0.0,1.0,35.847500,53.750350,59.8750,9.708568,6978.0
534,summer,2012,june,0.0,1.0,1.0,2.0,34.711651,51.959572,77.7917,11.707982,5099.0
535,summer,2012,june,0.0,2.0,1.0,1.0,40.351651,59.209672,69.0000,9.917139,6825.0


In [159]:
# temperature between 20 and 30 (both bounds included)
df[df.temp.between(20, 30)]

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0
18,spring,2011,january,0.0,3.0,1.0,2.0,21.732178,35.695852,74.1739,13.957239,1650.0
19,spring,2011,january,0.0,4.0,1.0,2.0,20.298349,32.833300,53.8333,13.125568,1927.0
32,spring,2011,february,0.0,3.0,1.0,2.0,20.220000,32.791522,77.5417,17.708636,1526.0
...,...,...,...,...,...,...,...,...,...,...,...,...
719,winter,2012,december,0.0,4.0,1.0,2.0,23.510000,38.124322,66.7917,8.875021,4128.0
720,spring,2012,december,0.0,5.0,1.0,2.0,23.353349,35.916622,55.6667,25.083661,3623.0
721,spring,2012,december,0.0,6.0,0.0,1.0,20.494151,31.583458,44.1250,27.292182,1749.0
724,spring,2012,december,1.0,2.0,0.0,2.0,21.691288,35.434690,73.4783,11.304642,1013.0


## Adding & Modifying Columns

In [160]:
# we can add columns easily: assign either a constant (it is broadcast to every row)
# or a vector that has the same length as the dataframe
# note: this modifies df in place, so every later cell sees the new column
df["extra_column"] = 0
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,extra_column
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0,0
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0,0
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0,0
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0,0
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0,0
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0,0
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0,0
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0,0


In [161]:
# we can assign the result of vector arithmetic,
# for example: let's calculate temp minus atemp element-wise and assign it to the temp_diff column
# (the values are negative wherever atemp is larger than temp)
df["temp_diff"] = df.temp - df.atemp
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,extra_column,temp_diff
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0,0,-15.823401
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0,0,-14.263308
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0,0,-11.271622
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0,0,-12.600052
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0,0,-12.464841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0,0,-11.012523
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0,0,-12.926385
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0,0,-12.091749
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0,0,-11.268049


In [162]:
# we can also apply any operation to rows or columns
# let's take the absolute value of temp_diff
# apply accepts any callable: here we pass np.abs, but a lambda or the name of our own function works too
# (for this particular case df["temp_diff"].abs() gives the same result)
df["temp_diff"] = df["temp_diff"].apply(np.abs)
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,extra_column,temp_diff
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0,0,15.823401
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0,0,14.263308
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0,0,11.271622
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0,0,12.600052
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0,0,12.464841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0,0,11.012523
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0,0,12.926385
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0,0,12.091749
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0,0,11.268049


In [163]:
# example for a lambda function
# it simply divides every value by 10
# careful: the column is overwritten with its own scaled values, so running this cell again divides by 10 again
df["temp_diff"] = df["temp_diff"].apply(lambda x: x/10)
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,extra_column,temp_diff
0,spring,2011,january,0.0,6.0,0.0,2.0,24.175849,39.999250,80.5833,10.749882,985.0,0,1.582340
1,spring,2011,january,0.0,0.0,0.0,2.0,25.083466,39.346774,69.6087,16.652113,801.0,0,1.426331
2,spring,2011,january,0.0,1.0,1.0,1.0,17.229108,28.500730,43.7273,16.636703,1349.0,0,1.127162
3,spring,2011,january,0.0,2.0,1.0,1.0,17.400000,30.000052,59.0435,10.739832,1562.0,0,1.260005
4,spring,2011,january,0.0,3.0,1.0,1.0,18.666979,31.131820,43.6957,12.522300,1600.0,0,1.246484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,spring,2012,december,0.0,4.0,1.0,2.0,19.945849,30.958372,65.2917,23.458911,2114.0,0,1.101252
727,spring,2012,december,0.0,5.0,1.0,2.0,19.906651,32.833036,59.0000,10.416557,3095.0,0,1.292639
728,spring,2012,december,0.0,6.0,0.0,2.0,19.906651,31.998400,75.2917,8.333661,1341.0,0,1.209175
729,spring,2012,december,0.0,0.0,0.0,1.0,20.024151,31.292200,48.3333,23.500518,1796.0,0,1.126805


[More examples can be found at the API reference page](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)

## Aggregations & GroupBy

In [164]:
# we can aggregate (reduce) the data with the usual functions, for example the mean of a column
df.temp.mean()

np.float64(31.28308505991792)

In [165]:
# ... or the standard deviation of a column
df.temp.std()

np.float64(8.603396817239974)

In [166]:
# we can also aggregate per group
# for example we can calculate the average temperature of each season

# first we select the season and temp columns
# then we group the rows by season
# then we reduce each group with the mean function
# (the group key, season, becomes the index of the result)
df[["season", "temp"]].groupby(["season"]).mean()

Unnamed: 0_level_0,temp
season,Unnamed: 1_level_1
fall,41.196537
spring,21.994135
summer,33.587042
winter,27.876584


In [167]:
# we can group by multiple keys, so let's include the year too
# (the result gets one row per (yr, season) combination)
df[["yr", "season", "temp"]].groupby(["yr", "season"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,temp
yr,season,Unnamed: 2_level_1
2011,fall,40.955161
2011,spring,20.855838
2011,summer,33.109888
2011,winter,28.042887
2012,fall,41.437914
2012,spring,23.119923
2012,summer,34.064196
2012,winter,27.710281


## Exercise

In [170]:
# Exercise 1: Create & Inspect
# Create a DataFrame of 10 students with columns (you can use random data if you wish):
# -> "Name" (string)
# -> "Age" (18–25)
# -> "Grade" (0–100)
# -> "Major" (for example: "CS", "Math", "Physics")
# Inspect: show first 5 rows, summary stats, and column names.
np.random.seed(0)  # fixed seed so the "random" students are the same on every run
majors = ["CS", "Math", "Physics"]
student_names = [
    "Jordan Cohen", "Bryce Owens", "Lexi Spence", "Maximus Ramos", "Mckayla Gross",
    "Wesley Choi", "Talon Fischer", "Bronson Dunlap", "Dana Sullivan", "Isis Chase",
]
n_students = len(student_names)

# the draws are made in this order (ages, grades, majors) so the seeded results stay the same
# the upper bound of randint is exclusive, hence 26 and 101
ages = np.random.randint(18, 26, n_students)
grades = np.random.randint(0, 101, n_students)
chosen_majors = np.random.choice(majors, n_students)

data = {"Name": student_names, "Age": ages, "Grade": grades, "Major": chosen_majors}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Grade,Major
0,Jordan Cohen,22,21,Physics
1,Bryce Owens,25,36,CS
2,Lexi Spence,23,87,Math
3,Maximus Ramos,18,70,Math
4,Mckayla Gross,21,88,Math
5,Wesley Choi,21,88,Math
6,Talon Fischer,21,12,CS
7,Bronson Dunlap,25,58,Math
8,Dana Sullivan,19,65,CS
9,Isis Chase,21,39,CS


In [171]:
# Inspect: the first 5 rows
df.head(5)

Unnamed: 0,Name,Age,Grade,Major
0,Jordan Cohen,22,21,Physics
1,Bryce Owens,25,36,CS
2,Lexi Spence,23,87,Math
3,Maximus Ramos,18,70,Math
4,Mckayla Gross,21,88,Math


In [173]:
# Inspect: summary statistics (only the numeric columns, Age and Grade, are summarised)
df.describe()

Unnamed: 0,Age,Grade
count,10.0,10.0
mean,21.6,56.4
std,2.270585,28.163018
min,18.0,12.0
25%,21.0,36.75
50%,21.0,61.5
75%,22.75,82.75
max,25.0,88.0


In [174]:
# Inspect: the column names
df.columns

Index(['Name', 'Age', 'Grade', 'Major'], dtype='object')

In [175]:
# Select the "Name" and "Grade" columns.
# passing a list of column names (note the double brackets) returns a DataFrame with just those columns
df[["Name", "Grade"]]

Unnamed: 0,Name,Grade
0,Jordan Cohen,21
1,Bryce Owens,36
2,Lexi Spence,87
3,Maximus Ramos,70
4,Mckayla Gross,88
5,Wesley Choi,88
6,Talon Fischer,12
7,Bronson Dunlap,58
8,Dana Sullivan,65
9,Isis Chase,39


In [176]:
# Select all students older than 21.
# the comparison is strict, so students who are exactly 21 are not included
df[df.Age > 21]

Unnamed: 0,Name,Age,Grade,Major
0,Jordan Cohen,22,21,Physics
1,Bryce Owens,25,36,CS
2,Lexi Spence,23,87,Math
7,Bronson Dunlap,25,58,Math


In [177]:
# Select all "Math" majors with grade > 70.
# each condition is wrapped in parentheses because "&" binds more tightly than "==" and ">"
df[(df.Major == "Math") & (df.Grade > 70)]

Unnamed: 0,Name,Age,Grade,Major
2,Lexi Spence,23,87,Math
4,Mckayla Gross,21,88,Math
5,Wesley Choi,21,88,Math


In [181]:
# Add a column "Pass" where grade ≥ 50 → "Yes", else "No".
def passed(row):
    """Return a copy of ``row`` extended with a "Pass" entry.

    Parameters
    ----------
    row : pandas.Series
        One row of the students table; it must contain a "Grade" entry.

    Returns
    -------
    pandas.Series
        A new row with every original entry plus "Pass", which is "Yes"
        when the grade is at least 50 and "No" otherwise.
    """
    # work on a copy: pandas does not support functions that mutate the object handed over by apply
    labelled = row.copy()
    labelled["Pass"] = "Yes" if row["Grade"] >= 50 else "No"
    return labelled

# axis=1 calls the function once per row; the returned rows are assembled into the new dataframe
df = df.apply(passed, axis=1)
df

Unnamed: 0,Name,Age,Grade,Major,Pass
0,Jordan Cohen,22,21,Physics,No
1,Bryce Owens,25,36,CS,No
2,Lexi Spence,23,87,Math,Yes
3,Maximus Ramos,18,70,Math,Yes
4,Mckayla Gross,21,88,Math,Yes
5,Wesley Choi,21,88,Math,Yes
6,Talon Fischer,21,12,CS,No
7,Bronson Dunlap,25,58,Math,Yes
8,Dana Sullivan,19,65,CS,Yes
9,Isis Chase,21,39,CS,No


In [182]:
# Add a column "AgeGroup": "Teen" if <20, "Adult" otherwise.
def age_group(row):
    """Return a copy of ``row`` extended with an "AgeGroup" entry.

    Parameters
    ----------
    row : pandas.Series
        One row of the students table; it must contain an "Age" entry.

    Returns
    -------
    pandas.Series
        A new row with every original entry plus "AgeGroup", which is "Teen"
        when the age is below 20 and "Adult" otherwise.
    """
    # work on a copy: pandas does not support functions that mutate the object handed over by apply
    labelled = row.copy()
    labelled["AgeGroup"] = "Teen" if row["Age"] < 20 else "Adult"
    return labelled

# axis=1 calls the function once per row; the returned rows are assembled into the new dataframe
df = df.apply(age_group, axis=1)
df

Unnamed: 0,Name,Age,Grade,Major,Pass,AgeGroup
0,Jordan Cohen,22,21,Physics,No,Adult
1,Bryce Owens,25,36,CS,No,Adult
2,Lexi Spence,23,87,Math,Yes,Adult
3,Maximus Ramos,18,70,Math,Yes,Teen
4,Mckayla Gross,21,88,Math,Yes,Adult
5,Wesley Choi,21,88,Math,Yes,Adult
6,Talon Fischer,21,12,CS,No,Adult
7,Bronson Dunlap,25,58,Math,Yes,Adult
8,Dana Sullivan,19,65,CS,Yes,Teen
9,Isis Chase,21,39,CS,No,Adult


In [183]:
# Group by "Major" and compute the average grade per major.
# only the two needed columns are selected first, so the mean is computed for "Grade" alone
df[["Grade", "Major"]].groupby(["Major"]).mean()

Unnamed: 0_level_0,Grade
Major,Unnamed: 1_level_1
CS,38.0
Math,78.2
Physics,21.0


In [184]:
# Count how many students passed in each major.
# the rows are filtered to the passing students first and then counted per major, which means:
#  - a major without any passing student (Physics here) does not appear in the result at all
#  - the column is still labelled "Grade", but its values are the number of passing students
df[df["Pass"] == "Yes"][["Grade", "Major"]].groupby(["Major"]).count()


Unnamed: 0_level_0,Grade
Major,Unnamed: 1_level_1
CS,1
Math,5


In [188]:
# Sort the DataFrame by grade (descending). Hint: .sort_values, .nlargest, .nsmallest
# Get the top 3 students overall.
by_grade_desc = df.sort_values(by="Grade", ascending=False)  # highest grade first
by_grade_desc.head(3)  # the first three rows of the sorted frame

Unnamed: 0,Name,Age,Grade,Major,Pass,AgeGroup
4,Mckayla Gross,21,88,Math,Yes,Adult
5,Wesley Choi,21,88,Math,Yes,Adult
2,Lexi Spence,23,87,Math,Yes,Adult
