### **Exercise 2.01: Loading and preparing the data for analysis**

In [None]:
# Import necessary modules

%matplotlib inline
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import pandas as pd
import numpy as np
import patsy
from statsmodels.graphics.correlation import plot_corr
from sklearn.model_selection import train_test_split
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' name, so prefer the new name
# when the installed Matplotlib provides it and fall back to the old one
# otherwise.
_seabornStyle = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seabornStyle)

In [None]:
# Read the Boston CSV from the workshop's GitHub repository into a pandas dataframe

BOSTON_CSV_URL = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/Chapter02/Dataset/Boston.csv'
)
rawBostonData = pd.read_csv(BOSTON_CSV_URL)

In [None]:
# Preview the first five rows of the raw dataframe

rawBostonData.head(n=5)

In [None]:
# Remove every row that contains at least one missing value

rawBostonData = rawBostonData.dropna(axis='index', how='any')

In [None]:
# Remove repeated rows, keeping the first occurrence of each

rawBostonData = rawBostonData.drop_duplicates(keep='first')

In [None]:
# Show the column headings as a plain Python list

rawBostonData.columns.tolist()

In [None]:
# Rename the dataframe column headings
#
# Header names are whitespace-stripped first so the mapping below does not
# depend on the exact padding around names such as ' ZN ' and 'INDUS ' in the
# CSV header. DataFrame.rename silently ignores keys it cannot find, so a
# padding mismatch would otherwise leave those columns un-renamed without
# raising any error. rename returns a new frame; rawBostonData is not modified.

renamedBostonData = (
    rawBostonData
    .rename(columns=str.strip)
    .rename(columns={
        'CRIM': 'crimeRatePerCapita',
        'ZN': 'landOver25K_sqft',
        'INDUS': 'non-retailLandProptn',
        'CHAS': 'riverDummy',
        'NOX': 'nitrixOxide_pp10m',
        'RM': 'AvgNo.RoomsPerDwelling',
        'AGE': 'ProptnOwnerOccupied',
        'DIS': 'weightedDist',
        'RAD': 'radialHighwaysAccess',
        'TAX': 'propTaxRate_per10K',
        'PTRATIO': 'pupilTeacherRatio',
        'LSTAT': 'pctLowerStatus',
        'MEDV': 'medianValue_Ks',
    })
)

In [None]:
# Inspect the types of data in the dataframe: info() prints a summary of the
# renamed dataframe (output is printed; the call itself returns None)

renamedBostonData.info()

In [None]:
# Summary statistics for the numeric columns, transposed so each variable is a row

renamedBostonData.describe(include=[np.number]).transpose()

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility),
# then re-attach the target column to each partition

target_column = 'crimeRatePerCapita'
X = renamedBostonData.drop(columns=target_column)
y = renamedBostonData[[target_column]]
seed = 10
test_data_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_data_size, random_state=seed
)
train_data = pd.concat((X_train, y_train), axis='columns')
test_data = pd.concat((X_test, y_test), axis='columns')

In [None]:
# Create and plot a correlation matrix
#
# Pearson correlations are computed on the training partition only; the same
# column names label both axes of the plot.

corrMatrix = train_data.corr(method='pearson')
xnames = list(train_data.columns)
ynames = list(train_data.columns)
# Bind the Figure returned by plot_corr to a name: left as the cell's last
# expression, the returned Figure is typically rendered a second time as the
# cell output on top of the inline backend's own rendering.
corrFigure = plot_corr(corrMatrix, xnames=xnames, ynames=ynames,
                       title=None, normcolor=False, cmap='RdYlBu_r')

### **Exercise 2.02: Graphical investigation of linear relationships using Python**





In [None]:
# Scatter crime rate against median home value on the training data and
# overlay a fitted regression line (no confidence band)

scatter_style = {"s": 20, "color": "royalblue", "alpha": 1}
label_font = {'fontsize': 15, 'fontname': 'DejaVu Sans'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=train_data,
    x='medianValue_Ks',
    y='crimeRatePerCapita',
    ci=None,
    color='k',
    scatter_kws=scatter_style,
    ax=ax,
)
ax.set_ylabel('Crime rate per Capita', **label_font)
ax.set_xlabel("Median value of owner-occupied homes in $1000's", **label_font)
ax.set_xlim(left=None, right=None)
# Cap the y-axis at 30; the lower bound is left as computed by Matplotlib
ax.set_ylim(bottom=None, top=30)
ax.tick_params(axis='both', which='major', labelsize=12)
fig.tight_layout()

### **Exercise 2.03: Examining a possible log-linear relationship using Python**

In [None]:
# Use the seaborn function regplot to create a log-linear plot and fit a regression line through it
#
# The log-transformed target is stored under its own name instead of `y`:
# the train/test split cell binds `y` to the target dataframe, and rebinding
# it here to a log-scaled Series would silently change what `y` means for any
# cell that runs (or is re-run) afterwards.

fig, ax = plt.subplots(figsize=(10, 6))
logCrimeRate = np.log(train_data['crimeRatePerCapita'])
sns.regplot(x='medianValue_Ks', y=logCrimeRate, ci=95, data=train_data, ax=ax,
            color='k', scatter_kws={"s": 20, "color": "royalblue", "alpha": 1})
ax.set_ylabel('log of Crime rate per Capita', fontsize=15,
              fontname='DejaVu Sans')
ax.set_xlabel("Median value of owner-occupied homes in $1000's",
              fontsize=15, fontname='DejaVu Sans')
ax.set_xlim(left=None, right=None)
ax.set_ylim(bottom=None, top=None)
ax.tick_params(axis='both', which='major', labelsize=12)
fig.tight_layout()

### **Exercise 2.04: Fit a simple linear regression model using the Statsmodels formula API**

In [None]:
# Fit an ordinary least squares regression of crime rate on median home value
# using the training data, then print the fit summary

simple_formula = 'crimeRatePerCapita ~ medianValue_Ks'
linearModel = smf.ols(formula=simple_formula, data=train_data)
linearModelResult = linearModel.fit()
print(linearModelResult.summary())

### **Activity 2.01: Fit a log-linear model using the Statsmodels formula API**

In [None]:
# Fit an ordinary least squares regression of log(crime rate) on median home
# value using the training data, then print the fit summary

log_linear_formula = 'np.log(crimeRatePerCapita) ~ medianValue_Ks'
logLinearModel = smf.ols(formula=log_linear_formula, data=train_data)
logLinearModResult = logLinearModel.fit()
print(logLinearModResult.summary())

### **Exercise 2.05: Fit a multiple linear regression model using the Statsmodels formula API**

In [None]:
# Fit an ordinary least squares regression of crime rate on four predictors
# using the training data, then print the fit summary

multi_linear_formula = (
    'crimeRatePerCapita ~ pctLowerStatus + radialHighwaysAccess +'
    'medianValue_Ks + nitrixOxide_pp10m'
)
multiLinearModel = smf.ols(formula=multi_linear_formula, data=train_data)
multiLinearModResult = multiLinearModel.fit()
print(multiLinearModResult.summary())

### **Activity 2.02: Fit a multiple log-linear regression model**

In [None]:
# Fit an ordinary least squares regression of log(crime rate) on four
# predictors; the `**2` in the formula also adds their pairwise interaction
# terms. The fit uses the training data and the summary is printed.

multi_log_linear_formula = (
    'np.log(crimeRatePerCapita) ~ '
    '(pctLowerStatus + radialHighwaysAccess + medianValue_Ks + nitrixOxide_pp10m)**2'
)
multiLogLinMod = smf.ols(formula=multi_log_linear_formula, data=train_data)
multiLogLinModResult = multiLogLinMod.fit()
print(multiLogLinModResult.summary())