# Lecture 10

## Introduction to [Regressions](#regression) <a class="anchor" id="TOC"></a>

 - binary means (close vs far)         
 - pre-specified categories with means 
 - non-parametric regression (lowess)  
 - simple linear regression (OLS)  
 - analysis of the results
                                     
#### Case-study:                           
- Hotels Vienna          
                                     
#### Dataset:                              
- hotels-vienna


___

## Introduction to Regression<a class = 'anchor' id = 'regression'></a>

Import packages

In [None]:
import pandas as pd
import numpy as np
from plotnine import *
import warnings

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the session. This keeps the
# cell output clean, but it also hides deprecation notices from the libraries.
warnings.filterwarnings("ignore")

Import the hotels-vienna data from OSF

In [None]:
# Download the CSV from OSF and read it straight into a DataFrame.
hotels_csv_url = "https://osf.io/y6jvb/download"
hotels = pd.read_csv(filepath_or_buffer=hotels_csv_url)

In [None]:
# Last expression of the cell: the notebook renders the freshly loaded frame.
hotels

Apply filters: hotels only, 3-4 stars, actually located in Vienna, without extreme prices (above 600)

In [None]:
# Keep only the observations used in the case study:
#   - accommodation type is "Hotel"
#   - actually located in Vienna
#   - 3 to 4 stars (between() is inclusive on both ends and is False for
#     missing values, so no separate notnull() check is needed)
#   - price of at most 600
# The trailing .copy() makes the result an independent frame: later cells add
# columns to `hotels`, and assigning into a filtered selection is what pandas
# reports as SettingWithCopyWarning.
hotels = (
    hotels.loc[lambda x: x["accommodation_type"] == "Hotel"]
    .loc[lambda x: x["city_actual"] == "Vienna"]
    .loc[lambda x: x["stars"].between(3, 4)]
    .loc[lambda x: x["price"] <= 600]
    .copy()
)

Summary statistics on price and distance

In [None]:
# Summary statistics (incl. the 95th percentile) with one row per variable.
price_and_distance = hotels.filter(items=["price", "distance"])
price_and_distance.describe(percentiles=[0.25, 0.5, 0.75, 0.95]).transpose()

Graphical investigation: \
create a base scatter-plot between price and distance

In [None]:
# Base scatter plot of price against distance. Later cells add further layers
# (step functions, smoothers) on top of `p1`.
price_vs_distance_points = geom_point(
    aes(x="distance", y="price"), color="red", size=2, alpha=0.5
)
distance_axis = scale_x_continuous(
    expand=(0.01, 0.01), limits=(0, 7), breaks=range(0, 8)
)
price_axis = scale_y_continuous(
    expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
)

p1 = ggplot(data=hotels)
p1 = p1 + price_vs_distance_points
p1 = p1 + expand_limits(x=0.01, y=0.01)
p1 = p1 + distance_axis
p1 = p1 + price_axis
p1 = p1 + labs(x="Distance to city center (miles)", y="Price (US dollars)")
p1

### Binary Variable

Close vs Far away hotels with a binary variable: 
 - if a hotel is 2 miles or more from the city center, consider it 'Far', otherwise 'Close'

In [None]:
# Binary distance category: "Far" at 2 miles or more, "Close" otherwise.
is_far = hotels["distance"] >= 2
hotels["dist2"] = np.where(is_far, "Far", "Close")

# Conditional mean: every hotel gets the average price of its own category.
price_by_dist2 = hotels["price"].groupby(hotels["dist2"])
hotels["Eprice_cat2"] = price_by_dist2.transform("mean")

Check the descriptives for the two categories:

In [None]:
# Descriptive statistics of distance and price within each binary category.
# melt() stacks both variables into a single long column. That column must
# not be named "price": `hotels` already has a "price" column, and
# pandas >= 2.0 raises ValueError when value_name matches an existing column
# (older versions only emitted a FutureWarning).
(
    hotels.melt(id_vars="dist2", value_vars=["distance", "price"], value_name="value")
    .groupby(["dist2", "variable"])
    .agg(["mean", "std", "min", "max", "count"])
    .round(2)
)

Plot the two categories

In [None]:
# One dot per category at its mean price, with the rounded mean printed above.
mean_price_by_dist2 = ggplot(data=hotels)
for component in (
    geom_point(
        aes(x="dist2", y="Eprice_cat2"), size=5, alpha=0.4, fill="red", na_rm=True
    ),
    geom_text(
        aes(x="dist2", y="Eprice_cat2 + 16", label="round(Eprice_cat2, 0)"),
        color="black",
        size=10,
    ),
    scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
    ),
    expand_limits(y=0.01),
    scale_x_discrete(),
    labs(x="Distance to city center (categories)", y="Average price (US dollars)"),
):
    mean_price_by_dist2 = mean_price_by_dist2 + component
mean_price_by_dist2

#### Task:
Instead of a simple dot, use a box-plot, which shows the underlying (conditional) distribution better!

In [None]:
# Box plot of the price distribution within each binary distance category.
# stat_boxplot(geom="errorbar") adds the horizontal caps at the whisker ends.
# The y axis shows individual prices (not category averages), so it is
# labelled "Price" rather than "Average price".
(
    ggplot(hotels, aes(x="dist2", y="price"))
    + geom_boxplot(color="blue")
    + stat_boxplot(geom="errorbar", width=0.05, size=0.5, color="blue")
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
    )
    + expand_limits(y=0.01)
    + scale_x_discrete()
    + labs(x="Distance to city center (categories)", y="Price (US dollars)")
)

###  Pre-specified Categories With Means (4 Distance Categories)

Below 1, between 1 and 2, between 2 and 3 and above 3 -> x value will be the midpoint

In [None]:
# Four distance bins, each represented by its midpoint on the x axis:
#   below 1 -> 0.5, [1, 2) -> 1.5, [2, 3) -> 2.5, 3 and above -> 5.0
# Conditions are checked from the farthest bin down; the first match wins.
miles = hotels["distance"]
hotels["dist4"] = np.select(
    [miles >= 3, miles >= 2, miles >= 1],
    [5.0, 2.5, 1.5],
    default=0.5,
)

Add mean values for price given each group

In [None]:
# Every hotel gets the mean price of its 4-bin distance category.
price_by_dist4 = hotels["price"].groupby(hotels["dist4"])
hotels["Eprice_cat4"] = price_by_dist4.transform("mean")

In [None]:
# Descriptive statistics of distance and price within each of the 4 bins.
# The melted value column must not be named "price": `hotels` already has a
# "price" column, and pandas >= 2.0 raises ValueError when value_name matches
# an existing column (older versions only emitted a FutureWarning).
(
    hotels.melt(id_vars="dist4", value_vars=["distance", "price"], value_name="value")
    .groupby(["dist4", "variable"])
    .agg(["mean", "std", "min", "max", "count"])
    .round(2)
)

Make a graph for each segment

In [None]:
# Mean price per bin, drawn at the bin midpoint, with the rounded mean above.
bin_mean_points = geom_point(
    aes(x="dist4", y="Eprice_cat4"), size=2.5, fill="red", alpha=0.4, na_rm=True
)
bin_mean_labels = geom_text(
    aes(x="dist4", y="Eprice_cat4 + 15", label="round(Eprice_cat4)"),
    color="black",
    size=10,
)
price_axis_dist4 = scale_y_continuous(
    expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
)
distance_axis_dist4 = scale_x_continuous(
    expand=(0.01, 0.01), limits=(0, 7), breaks=[0, 1, 2, 3, 4, 5, 6, 7]
)

(
    ggplot(data=hotels)
    + bin_mean_points
    + bin_mean_labels
    + expand_limits(x=0.01, y=0.01)
    + coord_cartesian(xlim=(0, 7), ylim=(0, 400))
    + price_axis_dist4
    + distance_axis_dist4
    + labs(x="Distance to city center (miles)", y="Price (US dollars)")
)

What is actually happening is a: \
scatterplot with a step function (for simplicity we use 1-mile bins, with 4 bins covering 3-7 miles)

In [None]:
# Left edge of the 1-mile bin each hotel falls into (0, 1, ..., 6): count how
# many of the cut-offs 1..6 the distance has reached.
hotels["dist4_s"] = sum(
    np.where(hotels["distance"] >= cutoff, 1, 0) for cutoff in range(1, 7)
)

# Each segment spans its 1-mile bin horizontally, at the height of the 4-bin
# mean price. Columns are added in place because `p1` was created from this
# same `hotels` object.
hotels["xend"] = hotels["dist4_s"].add(1)
hotels["yend"] = hotels["Eprice_cat4"]

In [None]:
# Overlay the step function (one horizontal segment per bin) on the scatter.
step_segments_dist4 = geom_segment(
    aes(x="dist4_s", y="yend", xend="xend", yend="yend"),
    color="blue",
    size=0.7,
    na_rm=True,
)
p1 + step_segments_dist4

#### Task

REGRESSION 3: use 7 different categories/bins based on distance: \
below 1, between:
          1 and 2
          2 and 3
          ...
          6 and 7

In [None]:
# Seven 1-mile bins represented by their midpoints (0.5, 1.5, ..., 6.5):
# start at 0.5 and add one for every cut-off 1..6 the distance has reached.
reached_cutoffs = sum(
    np.where(hotels["distance"] >= cutoff, 1, 0) for cutoff in range(1, 7)
)
hotels["dist7_new"] = 0.5 + reached_cutoffs

In [None]:
# Every hotel gets the mean price of its 7-bin distance category.
price_by_dist7 = hotels["price"].groupby(hotels["dist7_new"])
hotels["Eprice_cat7_new"] = price_by_dist7.transform("mean")

In [None]:
# Descriptive statistics of distance and price within each of the 7 bins.
# The melted value column must not be named "price": `hotels` already has a
# "price" column, and pandas >= 2.0 raises ValueError when value_name matches
# an existing column (older versions only emitted a FutureWarning).
(
    hotels.melt(id_vars="dist7_new", value_vars=["distance", "price"], value_name="value")
    .groupby(["dist7_new", "variable"])
    .agg(["mean", "std", "min", "max", "count"])
    .round(2)
)

Scatterplot with step function, starting point is simply at cut-off

In [None]:
# Left edge of each hotel's 1-mile bin (0..6): the number of cut-offs 1..6
# that its distance has reached.
bin_start = 0
for cutoff in (1, 2, 3, 4, 5, 6):
    bin_start = bin_start + np.where(hotels["distance"] >= cutoff, 1, 0)
hotels["dist7_s"] = bin_start

In [None]:
# Segment geometry for the 7-bin step function: height is the bin's mean
# price, right edge is one mile after the bin start.
hotels["yend"] = hotels["Eprice_cat7_new"]
hotels["xend"] = hotels["dist7_s"].add(1)

In [None]:
# Overlay the 7-bin step function on the base scatter plot.
step_segments_dist7 = geom_segment(
    aes(x="dist7_s", y="yend", xend="xend", yend="yend"),
    color="blue",
    size=0.7,
    na_rm=True,
)
p1 + step_segments_dist7

###  Lowess Non-parametric Regression

In [None]:
# Non-parametric smoother on top of the scatter, without a confidence band.
loess_curve = geom_smooth(
    aes(x="distance", y="price"), method="loess", se=False, color="blue"
)
p1 + loess_curve

- Advantage: \
   a smooth curve which represents the pattern of association quite flexibly!
-  Disadvantage: \
   no measurable properties: it smooths over the observations with an 'optimal' bandwidth.

### Simple Linear Regression

In [None]:
# Fitted straight line (price regressed on distance) on top of the scatter.
linear_fit = geom_smooth(
    aes(x="distance", y="price"),
    method="lm",
    formula="y~x",
    se=False,
    color="blue",
)
p1 + linear_fit

How to quantify linear regression: \
Remember: $y = \alpha + \beta * x + \epsilon$

In Python, the [statsmodels](https://www.statsmodels.org/stable/index.html) package is usually used to estimate regressions

In [None]:
import statsmodels.formula.api as smf
from mizani.formatters import percent_format

We use the statsmodels formula api, where you can give the equations as a string

 Simple model, with homoskedastic SE

In [None]:
# Specify price ~ distance, estimate it with the default covariance
# (no cov_type given), and print the regression table.
simple_model = smf.ols(formula="price ~ distance", data=hotels)
simple_reg = simple_model.fit()
print(simple_reg.summary())

Simple model, with heteroskedastic robust SE

In [None]:
# Same specification, but with the HC3 covariance estimator for the
# standard errors.
robust_model = smf.ols(formula="price ~ distance", data=hotels)
hetero_rob_reg = robust_model.fit(cov_type="HC3")
print(hetero_rob_reg.summary())

It's easy to compare two regression output tables using the stargazer package

In [None]:
from stargazer.stargazer import Stargazer

In [None]:
# Side-by-side table of the two fits: one column per model, titled by the
# kind of standard error it reports.
compared_models = [simple_reg, hetero_rob_reg]
column_titles = ["Homoskedastic SE", "Heteroskedastic robust SE"]

table = Stargazer(compared_models)
table.rename_covariates({"Intercept": "Constant"})
table.custom_columns(column_titles, [1] * len(column_titles))
table

### Analysis Of The Results
   - price prediction of a model
   - errors of predictions

It is easy to save the predicted values and residuals

In [None]:
# Store the simple model's in-sample predictions and residuals as columns.
fitted_prices, residuals = simple_reg.fittedvalues, simple_reg.resid
hotels["predprice"] = fitted_prices
hotels["e"] = residuals

Get the hotel, which is the most underpriced

In [None]:
# Smallest (most negative) residual first: the first row is the hotel priced
# furthest below its predicted price.
hotels.sort_values("e", ascending=True).iloc[:1]

We are probably only interested in the hotel_id, distance, price, prediction and error values:

In [None]:
# Same lookup, restricted to the columns that describe the deal.
deal_columns = ["hotel_id", "distance", "price", "predprice", "e"]
hotels.sort_values("e", ascending=True).iloc[:1].filter(items=deal_columns)

 Interpret the result!

We can also get the five most overpriced hotels

In [None]:
# Largest residual first: the five hotels priced furthest above prediction.
overpriced_columns = ["hotel_id", "distance", "price", "predprice", "e"]
by_residual_desc = hotels.sort_values("e", ascending=False)
by_residual_desc.iloc[:5].filter(items=overpriced_columns)

Checking the histogram of residuals:\
it helps us better understand how well we can predict the prices\
notes:\
   - the smallest and the 5 largest values we picked previously come from this distribution
   - on average we will have 0 error, as this is a property of the OLS estimator

In [None]:
# Histogram of the residuals; bar heights are shares of all observations
# (count divided by total count), shown as percentages.
residual_bars = geom_histogram(
    aes(y=after_stat("count / np.sum(count)")),
    binwidth=20,
    fill="blue",
    color="white",
    size=0.2,
    alpha=0.8,
    na_rm=True,
)
residual_axis = scale_x_continuous(
    limits=(-100, 300), breaks=np.arange(-100, 301, 100)
)
share_axis = scale_y_continuous(
    expand=(0.0, 0.0),
    limits=(0, 0.31),
    breaks=np.arange(0, 0.31, 0.05),
    labels=percent_format(),
)

(
    ggplot(hotels, aes(x="e"))
    + residual_bars
    + labs(x="Residuals", y="Percent")
    + residual_axis
    + share_axis
    + theme_bw()
)

We can make a pretty graph with the bottom and top 5 deals:

Create a factor variable with 4 possible values

In [None]:
# Four-level label per hotel. Precedence, highest first:
#   "bottom5"     - among the 5 smallest residuals
#   "top5"        - among the 5 largest residuals
#   "overpriced"  - residual >= 0
#   "underpriced" - everything else
ids_by_residual_desc = hotels.sort_values(by="e", ascending=False)["hotel_id"]
in_top5 = hotels["hotel_id"].isin(ids_by_residual_desc.head(5).tolist())
in_bottom5 = hotels["hotel_id"].isin(ids_by_residual_desc.tail(5).tolist())

hotels["reg1_res"] = np.select(
    [in_bottom5, in_top5, hotels["e"] >= 0],
    ["bottom5", "top5", "overpriced"],
    default="underpriced",
)

In [None]:
# Scatter of price vs distance coloured by residual category, with the five
# largest ("top5") and five smallest ("bottom5") residuals highlighted by big
# markers, the fitted regression line, and an arrow + text annotation.
(
    ggplot(hotels, aes(x="distance", y="price"))
    + geom_point(aes(color="reg1_res"), alpha=0.6)
    # Highlight layers use only the matching subset of rows.
    + geom_point(
        hotels.loc[lambda x: x["reg1_res"] == "top5"], alpha=0.8, size=8, color="blue"
    )
    + geom_point(
        hotels.loc[lambda x: x["reg1_res"] == "bottom5"],
        alpha=0.8,
        size=10,
        color="red",
        fill="white",
    )
    + geom_smooth(method="lm", size=1, se=False, formula="y~x")
    + coord_cartesian(xlim=(0, 7), ylim=(0, 400))
    + expand_limits(x=0.01, y=0.01)
    + scale_color_discrete(guide=False)
    + scale_x_continuous(expand=(0.01, 0.01), limits=(0, 7), breaks=np.arange(0, 8, 1))
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
    )
    + labs(x="Distance to city center (miles)", y="Price (US dollars)")
    # The arrow is a fixed annotation, so it is drawn once with annotate().
    # A geom_segment() with constant aesthetics would inherit the plot data
    # and redraw the same arrow once per row.
    + annotate("segment", x=2, y=25, xend=1.15, yend=50, arrow=arrow())
    + annotate("text", x=3, y=25, label="Most underpriced hotels", size=8)
    + theme_bw()
)