# Introduction to "R" in Python

In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import polars as pl
import matplotlib.pyplot as plt

# Load USArrests dataset
df = sm.datasets.get_rdataset("USArrests").data
df = pl.DataFrame(df)

df

Murder,Assault,UrbanPop,Rape
f64,i64,i64,f64
13.2,236,58,21.2
10.0,263,48,44.5
8.1,294,80,31.0
8.8,190,50,19.5
9.0,276,91,40.6
…,…,…,…
8.5,156,63,20.7
4.0,145,73,26.2
5.7,81,39,9.3
2.6,53,66,10.8


## 1. Identify the variables and determine the total number of observations

In [2]:
print("Variables: \n\t{}".format(df.columns))
print("Number of observations: {}".format(df.shape[0]))


Variables: 
	['Murder', 'Assault', 'UrbanPop', 'Rape']
Number of observations: 50


## 2. Compute the mean number of assaults and the mean \# of murders per 100,000 people


In [3]:
print("Mean assaults: {:.2f}".format(df["Assault"].mean()))
print("Mean murders per 100,000 people: {:.2f}".format(df["Murder"].mean()))


Mean assaults: 170.76
Mean murders per 100,000 people: 7.79


## 3. Determine the mean number of murders for states where more than 65% of the population lives in urban areas


In [4]:
subset_df = df.filter(
    pl.col("UrbanPop") > 65
)
print("Mean number of murders in subset: {:.2f}".format(subset_df["Murder"].mean()))


Mean number of murders in subset: 7.83


## 4. Conduct two simple OLS regressions

In the first regression, use “Murder” as the dependent variable
and “UrbanPop” as the independent variable. In the second regression, add “Assault” and “Rape” as
additional independent variables. Display both regressions side-by-side in a single regression table.



In [12]:
reg1 = smf.ols(
    formula='Murder ~ UrbanPop', data=df.to_pandas()
).fit()
reg1_str = reg1.summary()
# print(str(reg1.summary()))

reg2 = smf.ols(
    formula='Murder ~ UrbanPop + Assault', data=df.to_pandas()
).fit()
reg2_str = reg2.summary()

reg1_str + reg2_str


TypeError: unsupported operand type(s) for +: 'Summary' and 'Summary'