Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression

# Load the lecture dataset.
# Use the local copy when it is in the working directory; otherwise download
# the same file from the course repository. Reading the local file
# unconditionally would stop the notebook with FileNotFoundError on a fresh
# environment, and reading both would just overwrite the first result.
LECTURE5_URL = 'https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/HG4054%20Language%20and%20Society%20Through%20Data%20Analytics/Lecture5.csv'

try:
    data = pd.read_csv('Lecture5.csv')
except FileNotFoundError:
    data = pd.read_csv(LECTURE5_URL)

Setting display options

In [None]:
# Display configuration for printed DataFrames:
#   max_rows = None  -> never truncate rows
#   max_columns = 10 -> show at most 10 columns before eliding
#   width = 1000     -> allow wide lines so columns are not wrapped early
pd.options.display.max_rows = None
pd.options.display.max_columns = 10
pd.options.display.width = 1000

Visualizing the data

In [None]:
# Happiness against life expectancy with a fitted regression line.
# lmplot is a figure-level function: it creates (and makes current) its own figure.
sns.lmplot(data, y="Happiness", x="Life_exp")

# stripplot and barplot are axes-level functions that draw on the *current*
# axes unless told otherwise. Without an explicit axes they would be drawn on
# top of the lmplot above when the cell is run in one go, so give them their
# own figure. The two are deliberately overlaid: bars show the group means,
# black points show the individual observations.
fig_democracy, ax_democracy = plt.subplots()
sns.stripplot(data, y="Happiness", x="Democracy", color='black', ax=ax_democracy)
sns.barplot(data, y="Happiness", x="Democracy", ax=ax_democracy)


Regression models with STATSMODELS

In [None]:
#Simple regression. The quoted string ('outcome ~ predictor') is called the 'formula' of the regression.
#Only the last expression of a notebook cell is displayed automatically, so each
#summary is printed explicitly; otherwise it would be computed and thrown away.
model1 = ols('Happiness ~ Life_exp', data).fit()
print(model1.summary())

model2 = ols('Happiness ~ Democracy', data).fit()
print(model2.summary())

#Mean happiness per Democracy group, for comparison with the coefficients of model2
print(data.groupby('Democracy')['Happiness'].mean())

#Multiple regression
#One fitted line per Democracy group (hue), to see whether the slopes differ
sns.lmplot(data, y='Happiness', x='Life_exp', hue='Democracy')

#Summaries are printed explicitly: a notebook cell only auto-displays its last expression.

#Option 1: Don't model the interaction. Use the + sign.
model3 = ols('Happiness ~ Life_exp + Democracy', data).fit()
print(model3.summary())

#Option 2: Model the interaction. Use the * sign
#(main effects of both predictors plus their interaction term).
model4 = ols('Happiness ~ Life_exp * Democracy', data).fit()
print(model4.summary())


#OPTIONAL. Show that the coefficient of the interaction term is the difference
#in the Life_exp slope between democracies and non-democracies.
#Summaries are printed explicitly: a notebook cell only auto-displays its last expression.
non_demo = ols('Happiness ~ Life_exp', data.loc[data['Democracy'] == 0]).fit()  #fit model1 with non-demos only
print(non_demo.summary())

demo = ols('Happiness ~ Life_exp', data.loc[data['Democracy'] == 1]).fit()  #fit model1 with demos only
print(demo.summary())

#Slope difference (demos minus non-demos). This should match the interaction
#coefficient of model4 (presumably labelled 'Life_exp:Democracy' in its summary
#- confirm against the printed table), not the Life_exp main-effect slope.
slope_difference = demo.params['Life_exp'] - non_demo.params['Life_exp']
slope_difference



In-sample predictions using fitted models

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

#Predicting outcomes using model4 (the interaction model, treated here as the best model).
#The predictions are computed once and stored next to the observed values.
data['Predicted_H'] = model4.predict(data)
#Printed explicitly: a notebook cell only auto-displays its last expression.
print(data[['Country', 'Happiness', 'Predicted_H']])

#Evaluate accuracy with visualizations: predicted vs observed happiness.
ax_pred = sns.scatterplot(data, y='Predicted_H', x='Happiness')
#Red reference line y = x over the observed range; points on the line are perfect predictions.
happiness_range = [data['Happiness'].min(), data['Happiness'].max()]
ax_pred.plot(happiness_range, happiness_range, color='red')

#Evaluate accuracy with MAPE (Mean Absolute Percentage Error).
#Note: scikit-learn returns this as a fraction (e.g. 0.05 means 5%), not as a percentage.
mean_absolute_percentage_error(data['Happiness'], data['Predicted_H'])



Optional: Residual diagnostics

In [None]:
#Residuals are simply the observed values - predicted values.
#Printed explicitly: a notebook cell only auto-displays its last expression.
print(model4.resid)

#Two separate panels. Drawing both plots without explicit axes would put the
#histogram on top of the line plot, because both would use the current axes.
fig_resid, (ax_resid_line, ax_resid_hist) = plt.subplots(1, 2, figsize=(12, 4))

#visualizing mean of residuals: they should scatter around the red zero line
sns.lineplot(model4.resid, ax=ax_resid_line)
ax_resid_line.axhline(0, color='red')

#visualizing distribution of residuals
sns.histplot(model4.resid, ax=ax_resid_hist)


Optional: Regression with SCIKIT-LEARN

In [None]:
from sklearn.linear_model import LinearRegression

#Model 1: Happiness predicted from Life_exp only
X = data[['Life_exp']]   #Define predictors (double brackets keep this a DataFrame)
y = data[['Happiness']]  #Define outcome

model1a = LinearRegression()
model1a.fit(X, y)
#Printed explicitly: a notebook cell only auto-displays its last expression.
print('Model 1 slope(s):', model1a.coef_)       #slope(s) of predictor(s)
print('Model 1 intercept:', model1a.intercept_)  #intercept


#Model 3: Happiness predicted from Life_exp and Democracy, no interaction
X = data[['Life_exp', 'Democracy']]  #Define predictors
y = data[['Happiness']]              #Define outcome

model3a = LinearRegression()
model3a.fit(X, y)
#Printed explicitly: a notebook cell only auto-displays its last expression.
print('Model 3 slope(s):', model3a.coef_)       #slope(s) of predictor(s), in the column order of X
print('Model 3 intercept:', model3a.intercept_)  #intercept


#Model 4: Life_exp, Democracy and their interaction
#scikit-learn has no formula interface, so the interaction term must be created
#by hand as the product of the two predictors. This adds a column to `data`.
data['X1X2'] = data['Life_exp'] * data['Democracy']

X = data[['Life_exp', 'Democracy', 'X1X2']]  #Define predictors
y = data[['Happiness']]                      #Define outcome

model4a = LinearRegression()
model4a.fit(X, y)
#Printed explicitly so both values are shown, not only the last expression of the cell.
print('Model 4 slope(s):', model4a.coef_)       #slope(s) of predictor(s), in the column order of X
print('Model 4 intercept:', model4a.intercept_)  #intercept

SEMINAR 5

In [None]:
# Load the seminar dataset. NOTE: this rebinds `data`, replacing the lecture
# dataset used by the cells above.
# Use the local copy when it is in the working directory; otherwise download
# the same file from the course repository. Reading the local file
# unconditionally would stop the notebook with FileNotFoundError on a fresh
# environment, and reading both would just overwrite the first result.
SEMINAR5_URL = 'https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/HG4054%20Language%20and%20Society%20Through%20Data%20Analytics/Seminar5.csv'

try:
    data = pd.read_csv('Seminar5.csv')
except FileNotFoundError:
    data = pd.read_csv(SEMINAR5_URL)


#Test score against TV, with points coloured by Age
sns.scatterplot(data, y='Test', x='TV', hue='Age')

#Simple regressions, one predictor at a time.
#Summaries are printed explicitly: a notebook cell only auto-displays its last expression.
#NOTE: model1/model2 rebind the names used for the lecture models above.
model1 = ols('Test ~ TV', data).fit()
print(model1.summary())

model2 = ols('Test ~ Age', data).fit()
print(model2.summary())

#The multiple-regression model (two predictors) has a higher R2 than the single-predictor models.
#The slope coefficient of TV changes from + to -. Controlling for age, the negative influence of TV
#is revealed. This is an example of Simpson's paradox, where an overall pattern in a sample is
#reversed within sub-groups.
#Summaries are printed explicitly so that both are shown, not only the last expression of the cell.
#NOTE: model3/model4 rebind the names used for the lecture models above.
model3 = ols('Test ~ TV + Age', data).fit()
print(model3.summary())

#interaction effect is not significant
model4 = ols('Test ~ TV * Age', data).fit()
print(model4.summary())
