# Summary of DAS

This notebook contains a list of Python methods that were introduced in the DAS programming exercises.

### Import the libraries

In [None]:
import math

import numpy as np  # import auxiliary library, typical idiom
import pandas as pd  # import the Pandas library, typical idiom

# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression  # for linear regression
from sklearn.preprocessing import scale  # for standardizing residuals
# Notebook-wide plotting defaults. Seaborn's theme is applied first so the
# explicit rcParams overrides below are set after it.
sns.set()  # set Seaborn defaults
plt.rcParams.update({
    'figure.figsize': (10, 5),  # default hor./vert. size of plots, in inches
    'lines.markeredgewidth': 1,  # to fix issue with seaborn box plots; needed after import seaborn
})

In [None]:
import statsmodels.api as sm

# <span class="section">1.</span> Hypothesis testing for equality of means

In [None]:
# Load the Michelson data set (path is relative to the notebook's working
# directory) and preview its first rows.
df_michelson = pd.read_csv("../datasets/light_michelson.csv")
df_michelson.head()

In [None]:
# Load the Newcomb data set (path is relative to the notebook's working
# directory) and preview its first rows.
df_newcomb = pd.read_csv("../datasets/light_newcomb.csv")
df_newcomb.head()

## Step 1: Define quantitative question


## Step 2: Formulate hypothesis


## Step 3: Check assumptions


### Kernel density plot

In [None]:
# NOTE: rcParams is global state, so this overrides the default figure size
# for every figure created from here on, not only the plots in this section.
plt.rcParams['figure.figsize'] = 8, 8  # square plots

In [None]:
# Kernel density estimate of the columns of the Newcomb data;
# the trailing semicolon suppresses the textual repr of the returned axes.
df_newcomb.plot.density();

### Q-Q plot


In [None]:
# Q-Q plot of the Newcomb speed measurements.
# squeeze=False makes `ax` a 2-D array even for a single subplot, hence ax[0, 0].
fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, sharey=True, squeeze=False)
# Per the statsmodels docs, fit=True estimates the distribution parameters from
# the sample before plotting, and line='45' draws the 45-degree reference line.
sm.qqplot(df_newcomb['Speed [km/s]'], fit=True, line='45', ax=ax[0, 0])
ax[0, 0].set_title("Q-Q plot for Newcomb's measurements");

### Anderson-Darling normality test


In [None]:
# Anderson-Darling test for normality on the Newcomb speed column;
# the cell displays whatever normal_ad returns.
sm.stats.normal_ad(x=df_newcomb['Speed [km/s]'])

## Step 4: Apply appropriate test

### One sample

In [None]:
# Wrap the Newcomb speed column in a descriptive-statistics object, which
# provides the t-test and confidence-interval methods used below.
d_newcomb = sm.stats.DescrStatsW(data=df_newcomb['Speed [km/s]'])

In [None]:
# One-sample, two-sided t-test of the sample mean against 299792.458
# (the speed of light in km/s, matching the column's unit).
d_newcomb.ttest_mean(value=299792.458, alternative='two-sided')

In [None]:
# Two-sided t-based confidence interval for the mean; alpha=0.05 corresponds
# to a 95% confidence level.
d_newcomb.tconfint_mean(alpha=0.05, alternative='two-sided')

### Two sample

In [None]:
# Descriptive-statistics wrappers for both samples. d_newcomb is rebuilt here
# from the same column so the two-sample section runs on its own.
d_newcomb = sm.stats.DescrStatsW(data=df_newcomb['Speed [km/s]'])
d_michelson = sm.stats.DescrStatsW(data=df_michelson['Speed [km/s]'])

In [None]:
# Pair the two samples for mean comparison (first sample: Newcomb,
# second sample: Michelson).
cm = sm.stats.CompareMeans(d1=d_newcomb, d2=d_michelson)

In [None]:
# Two-sided t-test for independent samples on the two means.
# NOTE(review): relies on the default `usevar` setting (pooled variance per the
# statsmodels docs) — confirm the equal-variance assumption is acceptable.
cm.ttest_ind(alternative='two-sided')

In [None]:
# Two-sided confidence interval for the difference of the two means;
# alpha=0.05 corresponds to a 95% confidence level.
cm.tconfint_diff(alpha=0.05, alternative='two-sided')

## Step 5: Decision


# <span class="section">2.</span> Hypothesis testing on proportions

In [None]:
# Exam results as (number of students, number of passes):
# the current exam first, then the previous one.
students, passes = 100, 88
students_previous, passes_previous = 89, 70

## Step 1: Define quantitative question


## Step 2: Formulate hypothesis


## Step 3: Check assumptions


## Step 4: Apply appropriate test

In [None]:
# Two-sample z-test on proportions. Counts and sample sizes are listed in the
# same order (current exam first), and alternative='larger' tests whether the
# first proportion exceeds the second.
ztest_exam = sm.stats.proportions_ztest(
    count=[passes, passes_previous],
    nobs=[students, students_previous],
    alternative='larger',
)
ztest_exam

In [None]:
# Second element of the test result — the p-value per the statsmodels docs.
ztest_exam[1]

## Step 5: Decision


# <span class="section">3.</span> Performing diagnostics on regression results

In [None]:
# Load the water data set (columns 'bp' and 'pres' are used below).
df_forbes = pd.read_csv("../datasets/water.csv")

In [None]:
# Scatter plot of pressure against boiling point, with labelled axes so the
# figure can be read on its own.
ax = df_forbes.plot(kind='scatter', x='bp', y='pres', c='b')
ax.set_xlabel('Boiling point (in degrees Celsius)')
ax.set_ylabel('Pressure (in inches of mercury)')
ax.set_title("Pressure vs boiling point of water", fontsize=16);

In [None]:
# Feature matrix (two-dimensional, hence the double brackets) and target vector.
X = df_forbes[['bp']]
y = df_forbes['pres']

# fit() returns the fitted estimator itself, so it can be chained onto construction.
linear_model = LinearRegression().fit(X, y)

In [None]:
# Scatter plot with fitted regression line. x and y are passed by keyword:
# seaborn deprecated positional data arguments in 0.11 and rejects them from 0.12.
sns.regplot(x='bp', y='pres', data=df_forbes);

#### Model evaluation


In [None]:
# Score of the fitted model on the training data (for scikit-learn regressors
# this is the coefficient of determination R^2, per the scikit-learn docs).
linear_model.score(X, y)

In [None]:
# Residuals: observed pressure minus the model's prediction for the same rows.
y_pred = linear_model.predict(X)
residuals = y - y_pred

In [None]:
# Standardize the residuals with scikit-learn's `scale`
# (imported with the other libraries in the first cell).
residuals_norm = scale(residuals)

In [None]:
# Attach the normalized residuals to a copy of the data (assign returns a new
# frame, so df_forbes itself is left untouched) for plotting.
df_forbes_vis = df_forbes.assign(residuals=residuals_norm)

# Residuals against the predictor: a visible pattern here suggests the linear
# model is missing structure in the data.
ax = df_forbes_vis.plot(kind='scatter', x='bp', y='residuals', c='b')
ax.set_xlabel('Boiling point (in degrees Celsius)')
ax.set_ylabel('Normalized residuals')
ax.set_title('Normalized residuals for pressure', fontsize=16);

In [None]:
# Refit the regression with the natural logarithm of pressure as the target.
y_log = np.log(df_forbes['pres'])
linear_model_log = LinearRegression().fit(X, y_log)

In [None]:
# Standardized residuals of the log-pressure model.
y_log_pred = linear_model_log.predict(X)
residual_log = scale(y_log - y_log_pred)

# Plot them against the predictor, using a new frame so df_forbes is unchanged.
df_forbes_log_vis = df_forbes.assign(residuals=residual_log)

ax = df_forbes_log_vis.plot(kind='scatter', x='bp', y='residuals', c='b')
ax.set(xlabel='Boiling point', ylabel='Normalized residuals')
ax.set_title('Normalized residuals for logarithm of pressure', fontsize=16);