In [51]:
# simple linear regression
# steps:
# 1. import libraries
# 2. import dataset
# 3. split dataset into training and test set
# 4. fit simple linear regression to training set
# 5. predict test set results
# 6. visualize training set results
# 7. visualize test set results

# Multiple linear regression
# steps:
# 1. import libraries
# 2. import dataset
# 3. encode categorical data
# 4. split dataset into training and test set
# 5. fit multiple linear regression to training set
# 6. predict test set results
# 7. visualize training set results
# 8. visualize test set results

# Train|Test split
# steps:
# 1. import libraries
# 2. import dataset
# 3. split dataset into training and test set
# 4. fit simple linear regression to training set
# 5. predict test set results
# 6. visualize training set results
# 7. visualize test set results

In [52]:
# Recap:
# 1. we've started with simple linear regression
# 2. then we minimize the error using the ordinary least squares (OLS) method
# 3. we get the normal equation
# 4. the normal equation works also for multiple linear regression (more features/ more predictors)
# 5. we apply a dimensional analysis to see if it works on multiple linear regression
# 6. we move to the train|test split to avoid data leakage
# 7. after that we make predictions: y_pred = X_test @ beta_hat (test features times the estimated parameters)
# 8. then we finish by doing an evaluation of the model

In [53]:
# A lambda is an anonymous, single-expression function. Here it is bound to
# the name `a`: it takes one argument x (the input) and returns x ** 2,
# the square of x (the output).
a = lambda x: x ** 2
a(2)

4

In [54]:
import numpy as np

# np.linspace(start, stop, num) returns a NumPy array (ndarray, not a list) of
# `num` evenly spaced values over the interval [start, stop]. Both endpoints
# are included by default.
b = np.linspace(start=0, stop=10, num=5)  # 5 points from 0 to 10 -> spacing of 2.5
b

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [55]:
# np.linalg is NumPy's linear-algebra module; the function used here is
# np.linalg.solve(A, b). It takes 2 arguments:
# 1. A - the square coefficient matrix
# 2. b - the right-hand-side vector
# and returns x, the solution of the linear system A @ x = b.
# Mathematically x = A^-1 @ b, where A^-1 is the inverse of A.
A = np.array([
    [1, 1],
    [1.5, 4],
])
b = np.array([2200, 5050])
x = np.linalg.solve(A, b)

# Report the shape of every operand: A is 2-D, b and x are 1-D.
print(f'A is a {A.shape} matrix')
print(f'b is a {b.shape} vector')
print(f'x is a {x.shape} vector')
x

A is a (2, 2) matrix
b is a (2,) vector
x is a (2,) vector


array([1500.,  700.])

In [56]:
# how to calculate beta_hat (the estimated parameters) in multiple linear regression using numpy (OLS)
# beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y 
# X is the matrix of features (one row per observation, one column per feature)
# y is the vector of the dependent variable (the target values)
# beta_hat is the vector of estimated parameters

In [57]:
# np.dot(a, b) takes 2 arrays and returns their dot product. What that means
# depends on the dimensions of the inputs:
# - 2-D with 1-D -> matrix-vector product
# - 1-D with 2-D -> vector-matrix product
# - 2-D with 2-D -> matrix multiplication
# - 1-D with 1-D -> inner product (a single number)

# example:
a = np.array([
    [1, 2],
    [3, 4],
])
b = np.array([1, 2])

print(f'{np.dot(a, b) = }')
print(f'{np.dot(b, a) = }')
print(f'{np.dot(a, a) = }')
print(f'{np.dot(b, b) = }')

np.dot(a, b) = array([ 5, 11])
np.dot(b, a) = array([ 7, 10])
np.dot(a, a) = array([[ 7, 10],
       [15, 22]])
np.dot(b, b) = 5


In [58]:
# np.ones(shape) and np.zeros(shape) create arrays filled with 1.0 and 0.0.
# `shape` is either an int (1-D array of that length) or a tuple of ints
# (one entry per dimension).
# example:
a = np.ones(5)
print(f'{a = }')
print(f'{a.shape = }')
b = np.zeros(5)
print(f'{b = }')

# np.c_ places the two 1-D arrays side by side as the columns of a 2-D array
c = np.c_[a, b]
print(f'{c = }')

a = array([1., 1., 1., 1., 1.])
a.shape = (5,)
b = array([0., 0., 0., 0., 0.])
c = array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])


In [59]:
# Reshape a and b into 2-D column vectors of shape (n, 1).
# In reshape(-1, 1) the -1 asks NumPy to infer the number of rows from the
# array's size, while the number of columns is fixed to 1.
a = a.reshape(-1, 1)
print(f'{a = }')
b = b.reshape(-1, 1)
print(f'{b = }')

# Concatenate the two column vectors side by side into one 2-column array
c = np.c_[a, b]
print(f'{c = }')

a = array([[1.],
       [1.],
       [1.],
       [1.],
       [1.]])
b = array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])
c = array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])


In [1]:
# Long-format grade records: one entry per (student, subject) pair.
data = dict(
    studentId=[1, 1, 2, 2, 3, 3],
    subject=['Math', 'English'] * 3,
    grade=[90, 85, 78, 88, 92, 80],
)
data

{'studentId': [1, 1, 2, 2, 3, 3],
 'subject': ['Math', 'English', 'Math', 'English', 'Math', 'English'],
 'grade': [90, 85, 78, 88, 92, 80]}

In [3]:
import pandas as pd

# Build a DataFrame from the column-oriented dict: each key becomes a column.
df = pd.DataFrame(data=data)
df

Unnamed: 0,studentId,subject,grade
0,1,Math,90
1,1,English,85
2,2,Math,78
3,2,English,88
4,3,Math,92
5,3,English,80


In [8]:
# Average grade per student.
# 'subject' holds strings and cannot be averaged: older pandas silently dropped
# it with a FutureWarning, while pandas >= 2.0 raises a TypeError.
# numeric_only=True restricts the mean to the numeric columns, so the result is
# a DataFrame indexed by studentId with the single column 'grade'.
avg_grade = df.groupby('studentId').mean(numeric_only=True)
avg_grade

  avg_grade = df.groupby('studentId').mean()


Unnamed: 0_level_0,grade
studentId,Unnamed: 1_level_1
1,87.5
2,83.0
3,86.0


In [2]:
import pandas as pd

# Small example DataFrame with two integer columns, A and B.
# NOTE: this rebinds the name `data`, which the student-grades cell also uses.
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
}
example_df = pd.DataFrame(data=data)
example_df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [5]:
# A minimal two-argument helper, reused in the row-wise apply example.
def add(a, b):
    """Return the sum of ``a`` and ``b``, computed as ``a + b``."""
    total = a + b
    return total

add(1, 4)

5

In [6]:
# Row-wise apply: with axis=1 the lambda is called once per row, and each row
# arrives as a Series indexed by column name. The values returned for the rows
# are stored in the new column 'C'.
example_df['C'] = example_df.apply(
    lambda row: add(row['A'], row['B']),
    axis=1,
)
example_df

Unnamed: 0,A,B,C
0,1,4,5
1,2,5,7
2,3,6,9
