<a href="https://colab.research.google.com/github/emolinaperez/econometrics_mek/blob/main/Week%202/Office_Hours/Lab%201%20Horas%20de%20Oficina.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 1: Econometrics | Office Hours

In [None]:
# load packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

import seaborn as sns
from scipy.optimize import minimize

In [None]:
# Simulate a linear data-generating process: y = 1 + 2x + standard-normal noise
np.random.seed(42)  # fix the seed so the draws below are reproducible
n_obs = 100
x = np.linspace(0, 10, num=n_obs)
epsilon = np.random.normal(loc=0, scale=1, size=n_obs)
y = 1 + 2 * x + epsilon

In [None]:
# Display the simulated disturbance draws
epsilon

In [None]:
# Display the evenly spaced regressor values
x

In [None]:
# Display the simulated dependent variable
y

In [None]:
# Collect the simulated series into a DataFrame with columns 'y' and 'x'
data = pd.DataFrame(dict(y=y, x=x))

In [None]:
# Preview the first rows of the DataFrame
data.head()

In [None]:
# Fit a linear regression of y on x by OLS
X = sm.add_constant(data['x'])  # add the constant column so an intercept is estimated
model = sm.OLS(endog=data['y'], exog=X).fit()

In [None]:
# Print the statsmodels summary table for the fitted regression
print(model.summary())

In [None]:
# Linear regression plot: simulated observations with the fitted line on top
fig, ax = plt.subplots()
ax.scatter(x, y, label='Datos')
ax.plot(x, model.predict(X), color='red', label='Regresión lineal')
ax.set_xlabel('Variable Independiente (x)')
ax.set_ylabel('Variable Dependiente (y)')
ax.legend()
plt.show()

In [None]:
# Residuals of the fitted statsmodels regression
residuals = model.resid

# Plot the residuals against x, with a dashed zero line as the reference
fig, ax = plt.subplots()
ax.scatter(data['x'], residuals, label='Residuos')
ax.axhline(y=0, color='black', linestyle='--', label='Línea base')
ax.set_xlabel('Variable Independiente (x)')
ax.set_ylabel('Residuos')
ax.legend()
plt.show()

In [None]:
# Pairwise correlations between the columns of `data`
correlation_matrix = data.corr()
print("Correlation matrix:", correlation_matrix, sep="\n")

# Simple Linear Regression

The simple linear regression model found in every econometrics textbook is

$$y = \beta_0 + \beta_1 x + \epsilon$$

where $y$ is the dependent variable, $x$ is the independent variable and $\epsilon$ is the disturbance term. $\beta_0$ and $\beta_1$ are unknown parameters that we aim to estimate by feeding data into the model. Without the disturbance term, the model is simply the equation of a straight line in $x$, such as $y = 1 + 2x$.

In the context of machine learning (ML), $x$ is usually called the feature variable and $y$ the target variable. Linear regression is the main tool in supervised learning, meaning that $y$ is supervising $x$.

# Simple Linear Regression

This is a simple linear regression model, as in every econometrics textbook, where:  
- $y$ is the dependent variable,  
- $x$ is the independent variable, and  
- $\epsilon$ is the disturbance term.

$\beta_0$ and $\beta_1$ are unknown parameters that we aim to estimate by feeding the data into the model:

$$
y = \beta_0 + \beta_1 x + \epsilon
$$

Without the disturbance term $\epsilon$, the model is simply the equation of a straight line in $x$, such as $y = 1 + 2x$.

In the context of machine learning (ML):  
- $x$ is usually called the **feature variable**, and  
- $y$ is called the **target variable**.  

Linear regression is a key tool in supervised learning, meaning that $y$ supervises the process.


In [None]:
# Ten evenly spaced points on [1, 10] and the exact line Y = 2 + 3X (no disturbance)
X = np.linspace(1, 10, num=10)
Y = 3 * X + 2
print(X, Y, sep="\n")

In [None]:
# Draw the deterministic line and mark the sample points that lie on it
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(X, Y)
ax.scatter(X, Y, c='r')
ax.grid()
ax.set(title='$Y=2+3x$', xlim=(0, 10), ylim=(0, 40))
plt.show()

The simple linear regression model found in every econometrics textbook is

$$Y_i = \beta_1 + \beta_2 X_i + u_i$$

where $Y$ is the dependent variable, $X$ is the independent variable and $u$ is the disturbance term. $\beta_1$ and $\beta_2$ are unknown parameters that we aim to estimate by feeding data into the model. Without the disturbance term, the model is simply the equation of a straight line in $\mathbb{R}^2$, such as $Y = 2 + 3X$.

In the context of machine learning (ML), $X$ is usually called the feature variable and $Y$ the target variable. Linear regression is the main tool in supervised learning, meaning that $Y$ is supervising $X$.

There are five reasons why we need a disturbance term:

1. omission of independent variables
2. aggregation of variables
3. model misspecification
4. functional-form misspecification, e.g. the relationship should be nonlinear rather than linear
5. measurement error

The second one arises when we aggregate variables to a macro level: for instance, every family has its own consumption function, but aggregating them to the national level introduces discrepancies that contribute to the disturbance term.

The third and fourth ones will be discussed in detail in a later chapter.

The fifth one includes all types of error, man-made or natural.

## Ordinary Least Squares

Ordinary Least Squares (OLS) is the most common estimation technique used in ML and econometrics; it is popular due to its simplicity and transparency. You will be able to derive the whole estimation process by hand, and every step has a closed-form expression.

We'll demonstrate OLS with our first plot. Every time you run the cell below, the result will differ from mine, because the random seed is not reset before the disturbances are drawn.

In [None]:
# Intercept (beta1) and slope (beta2) of the data-generating process
# Y = beta1 + beta2*X + u. They are passed to gen_linreg_data in the demo call
# and are also read as globals by plot_lin_reg when it builds the plot title.
beta1, beta2 = 2, 3
def gen_linreg_data(beta1, beta2, samp_size, disturb_scale):
    '''Simulate data from the linear model Y = beta1 + beta2*X + u.

    X is a grid of ``samp_size`` evenly spaced points on [1, 10] and u is
    ``disturb_scale`` times a draw from ``np.random.randn``, so the result
    depends on NumPy's global random state.

    Returns ``(X, Y, Y_hat)``, where ``Y_hat = beta1 + beta2*X`` is the
    noise-free line and ``Y`` is that line plus the disturbance.
    '''
    regressor = np.linspace(1, 10, samp_size)
    noise = disturb_scale * np.random.randn(samp_size)
    line = beta1 + beta2 * regressor
    observed = line + noise
    return regressor, observed, line

def plot_lin_reg(X, Y, Y_hat):
    '''Plot observations, the line ``Y_hat`` and the vertical gaps between them.

    Parameters
    ----------
    X : sequence of x-coordinates.
    Y : observed values, drawn as red dots.
    Y_hat : values on the line, drawn as black dots joined by a line.

    Each observation is connected to its point on the line by a dashed red
    vertical segment. The title is built from the module-level ``beta1`` and
    ``beta2``, not from the arguments, so those globals must be defined and
    should match the parameters used to generate the data.
    '''
    fig, ax = plt.subplots(figsize=(7, 7))

    # One dashed vertical segment per observation, from the observed value to the line.
    for x_i, y_i, y_hat_i in zip(X, Y, Y_hat):
        ax.plot([x_i, x_i], [y_i, y_hat_i],
                linestyle='--', color='red', label='residual')

    ax.plot(X, Y_hat)
    ax.scatter(X, Y_hat, c='k')
    ax.scatter(X, Y, c='r')
    ax.grid()
    # Raw string: '\h' is not a valid escape sequence in a normal string literal
    # and triggers a warning at compile time; the rendered title is unchanged.
    ax.set_title(r'$\hat Y ={}+{}X$'.format(beta1, beta2))
    plt.show()

# Demo: simulate 10 observations around Y = beta1 + beta2*X with disturbance
# scale 5 and plot them. This rebinds the module-level X, Y and Y_hat.
if __name__ == '__main__':
    X, Y, Y_hat = gen_linreg_data(beta1=beta1, beta2=beta2, samp_size=10, disturb_scale=5)
    plot_lin_reg(X, Y, Y_hat)

In [None]:
# Draw a new sample of 15 points with beta1=4, beta2=2 and disturbance scale 3;
# this rebinds the module-level X, Y and Y_hat again.
X, Y, Y_hat = gen_linreg_data(beta1=4, beta2=2, samp_size=15, disturb_scale=3)

In [None]:
class S_OLS:
    '''Simple (one-regressor) OLS estimator with plotting helpers.

    Create instances with S_OLS(X, Y), where X and Y are data arrays of equal
    length. Call ``ols()`` before ``simul_plot()``, ``ols_plot()`` or
    ``r_sq()``: those methods read the attributes ``b1``, ``b2`` and ``Y_hat``
    that ``ols()`` sets.
    '''

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def ols(self):
        '''Estimate Y = b1 + b2*X with the OLS method and print b1 and b2.

        Stores ``b1`` (intercept), ``b2`` (slope) and ``Y_hat`` (fitted values)
        on the instance and returns them as ``(Y_hat, b2, b1)`` -- note the
        order: fitted values, slope, intercept.
        '''
        cov_mat = np.cov(self.X, self.Y)
        # Slope = cov(X, Y) / var(X); both come from the same covariance matrix.
        self.b2 = cov_mat[0, 1] / cov_mat[0, 0]
        # The fitted line passes through the point of sample means.
        self.b1 = np.mean(self.Y) - self.b2 * np.mean(self.X)
        self.Y_hat = self.b1 + self.b2 * self.X
        print('b1 estimate: {:.4f}'.format(self.b1))
        print('b2 estimate: {:.4f}'.format(self.b2))
        return self.Y_hat, self.b2, self.b1

    def simul_plot(self, beta1, beta2):
        '''Plot simulated data, the fitted line and the true line.

        beta1 and beta2 are the intercept and slope of the data generation
        process; the true line ``beta1 + beta2*X`` is drawn next to the fitted
        one and labelled with these values. Requires a prior call to ``ols()``.
        '''
        fig, ax = plt.subplots(figsize=(7, 7))
        # Dashed vertical segment from each observation to its fitted value.
        # Only instance data is used, so the plot never depends on globals.
        for x_i, y_i, y_hat_i in zip(self.X, self.Y, self.Y_hat):
            ax.plot([x_i, x_i], [y_i, y_hat_i], linestyle='--', color='red')
        ax.scatter(self.X, self.Y_hat, c='k')
        ax.scatter(self.X, self.Y, c='r')
        ax.plot(self.X, self.Y_hat,
                label='$b_1$= {:.2f}, $b_2$={:.2f}'.format(self.b1, self.b2))
        ax.grid()
        # Raw string: '\h' is not a valid escape sequence in a normal literal.
        ax.set_title(r'$\hat Y ={:.2f}+{:.2f}X$'.format(self.b1, self.b2))
        Y_hat_perfect = beta1 + beta2 * self.X
        # Label the true line with the parameters actually passed in.
        ax.plot(self.X, Y_hat_perfect,
                label=r'$\beta_1={}, \beta_2={}$'.format(beta1, beta2))
        ax.legend()
        plt.show()

    def ols_plot(self, xlabel, ylabel):
        '''Plot observed values, fitted values and the regression line.

        xlabel and ylabel are the axis labels; they are also stored on the
        instance. Requires a prior call to ``ols()``.
        '''
        self.xlabel = xlabel
        self.ylabel = ylabel
        fig, ax = plt.subplots(figsize=(7, 7))

        # Plot observed values (red) and fitted values (blue)
        ax.scatter(self.X, self.Y, c='r', label='Observed Values (Y)')
        ax.scatter(self.X, self.Y_hat, c='b', label='Fitted Values (Ŷ)')

        # Plot regression line
        ax.plot(self.X, self.Y_hat,
                label=f'Regression Line ($b_1$ = {self.b1:.2f}, $b_2$ = {self.b2:.2f})',
                color='black')

        # Add grid, title, labels, and legend
        ax.grid()
        ax.set_title(f'$\\hat Y = {self.b1:.2f} + {self.b2:.2f}X$', fontsize=14)
        ax.set_xlabel(self.xlabel, fontsize=12)
        ax.set_ylabel(self.ylabel, fontsize=12)
        ax.legend()  # Add legend to indicate what each color represents

    def r_sq(self):
        '''Calculate the coefficient of determination of the fitted model.

        Stores and returns ``(ESS, RSS, R_sq)``. ESS and RSS are computed as
        variances (``np.var``) of the fitted values and of the residuals, and
        ``R_sq = ESS / var(Y)`` is the explained share of the total variation.
        Requires a prior call to ``ols()``.
        '''
        self.ESS = np.var(self.Y_hat)
        self.RSS = np.var(self.Y - self.Y_hat)
        # R^2 divides by the TOTAL variation of Y. Dividing by RSS instead
        # gives an unbounded ratio that blows up on a perfect fit.
        self.R_sq = self.ESS / np.var(self.Y)
        return self.ESS, self.RSS, self.R_sq

In [None]:
# Load the 'CN_Cities_house_price' sheet of the practice workbook and preview it.
# NOTE(review): the path is relative to the current working directory, so the
# workbook must exist at ../data/ wherever the notebook is run -- confirm this
# holds when the notebook is opened on Colab.
df = pd.read_excel('../data/Basic_Econometrics_practice_data.xlsx',
                   sheet_name = 'CN_Cities_house_price')
df.head()

In [None]:
# Regress house_price on salary with the hand-written estimator.
# ols() returns (fitted values, slope, intercept) in that order, hence Y_hat, b2, b1.
s_ols_house_income = S_OLS(df['salary'], df['house_price'])
Y_hat, b2, b1 = s_ols_house_income.ols()

In [None]:
# Fit the same regression with the statsmodels formula API and print its
# summary, for comparison with the b1/b2 estimates printed by S_OLS.ols()
model1 = smf.ols('house_price ~ salary ', data=df).fit()
print(model1.summary())

In [None]:
# Plot observed and fitted house prices against salary with labelled axes
s_ols_house_income.ols_plot('Disposable Income', 'House Price')

In [None]:
# Residuals of the hand-written fit (this rebinds `epsilon`, which earlier held
# the simulated noise). Their mean should be zero up to floating-point error,
# because ols() sets b1 = mean(Y) - b2*mean(X).
epsilon = df['house_price'] - Y_hat
np.mean(epsilon)

In [None]:
# The fitted and observed values should share the same mean, since the OLS
# line passes through the point of sample means
print(f'Mean of Y hat: {np.mean(Y_hat)}')
print(f"Mean of Y: {np.mean(df['house_price'])}")