In [1]:
# Imports
import numpy as np
import pandas as pd

In [2]:
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
print(data, type(data))
print(data.columns, '\n')

# convert a DataFrame into a NumPy array using '.values' property
print(data.values, type(data.values), '\n') 

# convert a NumPy array to a DataFrame. Also uses optional column names
df2 = pd.DataFrame(data.values, columns = ['One', 'two', 'three'])
print(df2, type(df2))

   x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0 <class 'pandas.core.frame.DataFrame'>
Index(['x0', 'x1', 'y'], dtype='object') 

[[ 1.    0.01 -1.5 ]
 [ 2.   -0.01  0.  ]
 [ 3.    0.25  3.6 ]
 [ 4.   -4.1   1.3 ]
 [ 5.    0.   -2.  ]] <class 'numpy.ndarray'> 

   One   two  three
0  1.0  0.01   -1.5
1  2.0 -0.01    0.0
2  3.0  0.25    3.6
3  4.0 -4.10    1.3
4  5.0  0.00   -2.0 <class 'pandas.core.frame.DataFrame'>


In [3]:
# If array has mixed data types '.value' will return an 'ndarray' of objects
df3 = data.copy() # preserve 'data' DataFrame
df3['strings'] = ['a', 'b', 'c', 'd', 'e'] # Add 'strings' column
print(df3, type(df3), '\n')
df3.values

   x0    x1    y strings
0   1  0.01 -1.5       a
1   2 -0.01  0.0       b
2   3  0.25  3.6       c
3   4 -4.10  1.3       d
4   5  0.00 -2.0       e <class 'pandas.core.frame.DataFrame'> 



array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [4]:
model_cols = ['x0', 'x1']
print(data.loc[:, model_cols].values, '\n') # Extract a subset of the columns

# Creating dummy variables in a DataFrame containing mixed data types
# Add a Categorical column to 'data' DataFrame
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'], 
                                  categories = ['a', 'b'])
print(data, '\n')

# Replace 'category' column with dummy variables
dummies = pd.get_dummies(data.category, prefix = 'category') #create dummy vars
# Replace 'category' column with dummy variable columns
data_with_dummies = data.drop('category', axis = 1).join(dummies)
print(data_with_dummies)

[[ 1.    0.01]
 [ 2.   -0.01]
 [ 3.    0.25]
 [ 4.   -4.1 ]
 [ 5.    0.  ]] 

   x0    x1    y category
0   1  0.01 -1.5        a
1   2 -0.01  0.0        b
2   3  0.25  3.6        a
3   4 -4.10  1.3        a
4   5  0.00 -2.0        b 

   x0    x1    y  category_a  category_b
0   1  0.01 -1.5           1           0
1   2 -0.01  0.0           0           1
2   3  0.25  3.6           1           0
3   4 -4.10  1.3           1           0
4   5  0.00 -2.0           0           1


# Introduction to Patsy

In [5]:
import patsy

In [6]:
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2.]})
print(data)

y, X = patsy.dmatrices('y ~ x0 + x1', data)
y

   x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0


DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

In [7]:
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [8]:
print(np.asarray(y), '\n') # DesignMatrix are NumPy ndarrays with metadata
print(np.asarray(X), '\n')
patsy.dmatrices('y ~ x0 + x1 + 0', data)[1] # Hide 'intercept' column

[[-1.5]
 [ 0. ]
 [ 3.6]
 [ 1.3]
 [-2. ]] 

[[ 1.    1.    0.01]
 [ 1.    2.   -0.01]
 [ 1.    3.    0.25]
 [ 1.    4.   -4.1 ]
 [ 1.    5.    0.  ]] 



DesignMatrix with shape (5, 2)
  x0     x1
   1   0.01
   2  -0.01
   3   0.25
   4  -4.10
   5   0.00
  Terms:
    'x0' (column 0)
    'x1' (column 1)

In [9]:
# Perform ordinary least Squares (OLS) regression
coef, resid, _, _ = np.linalg.lstsq(X, y, rcond = 1) 
print(coef, '\n')

# Reattach intercept column from metadata
coef = pd.Series(coef.squeeze(), index = X.design_info.column_names)
print(coef)

[[ 0.31290976]
 [-0.07910564]
 [-0.26546384]] 

Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64 



In [10]:
# Mix Python code with Patsy formulas
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
X

DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)

In [11]:
# Standarzing and centering variable transformations
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
X

DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)

In [12]:
new_data = pd.DataFrame({
    'x0': [6, 7, 8, 9],
    'x1': [3.1, -0.5, 0, 2.3],
    'y': [1, 2, 3, 4]})
print(new_data)
new_X = patsy.build_design_matrices([X.design_info], new_data)
new_X

   x0   x1  y
0   6  3.1  1
1   7 -0.5  2
2   8  0.0  3
3   9  2.3  4 



[DesignMatrix with shape (4, 3)
   Intercept  standardize(x0)  center(x1)
           1          2.12132        3.87
           1          2.82843        0.27
           1          3.53553        0.77
           1          4.24264        3.07
   Terms:
     'Intercept' (column 0)
     'standardize(x0)' (column 1)
     'center(x1)' (column 2)]

In [13]:
# Use '()' when you need to add columns from a dataset by name
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data) 
X

DesignMatrix with shape (5, 2)
  Intercept  I(x0 + x1)
          1        1.01
          1        1.99
          1        3.25
          1       -0.10
          1        5.00
  Terms:
    'Intercept' (column 0)
    'I(x0 + x1)' (column 1)

In [14]:
# Creating Dummy variables, i.e. transforming categorical data using Patsy

# transform out-of-sample data using inform from previous in-sample dataset
data = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
    'key2': [0, 1, 0, 1, 0, 1, 0, 0],
    'v1': [1, 2, 3, 4, 5, 6, 7, 8],
    'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]
})
print(data)
y, X = patsy.dmatrices('v2 ~ key1', data)
X

  key1  key2  v1   v2
0    a     0   1 -1.0
1    a     1   2  0.0
2    b     0   3  2.5
3    b     1   4 -0.5
4    a     0   5  4.0
5    b     1   6 -1.2
6    a     0   7  0.2
7    b     0   8 -1.7 



DesignMatrix with shape (8, 2)
  Intercept  key1[T.b]
          1          0
          1          0
          1          1
          1          1
          1          0
          1          1
          1          0
          1          1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)

In [15]:
# If there is an intercept, one of the levels is left out to avoid collinearity
y, X = patsy.dmatrices('v2 ~ key1', data)
X

DesignMatrix with shape (8, 2)
  Intercept  key1[T.b]
          1          0
          1          0
          1          1
          1          1
          1          0
          1          1
          1          0
          1          1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)

In [16]:
# if you omit intercept, then columns for each category value will be included
y, X = patsy.dmatrices('v2 ~ key1 + 0', data)
X

DesignMatrix with shape (8, 2)
  key1[a]  key1[b]
        1        0
        1        0
        0        1
        0        1
        1        0
        0        1
        1        0
        0        1
  Terms:
    'key1' (columns 0:2)

In [17]:
# Numeric columns can be interpreted as categorical using the 'C()' function
y, X = patsy.dmatrices('v2 ~ C(key2)', data)
X

DesignMatrix with shape (8, 2)
  Intercept  C(key2)[T.1]
          1             0
          1             1
          1             0
          1             1
          1             0
          1             1
          1             0
          1             0
  Terms:
    'Intercept' (column 0)
    'C(key2)' (column 1)

In [18]:
# include interaction terms in analysis of variance (ANOVA) models
data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})
data

Unnamed: 0,key1,key2,v1,v2
0,a,zero,1,-1.0
1,a,one,2,0.0
2,b,zero,3,2.5
3,b,one,4,-0.5
4,a,zero,5,4.0
5,b,one,6,-1.2
6,a,zero,7,0.2
7,b,zero,8,-1.7


In [19]:
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X

DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key2[T.zero]
          1          0             1
          1          0             0
          1          1             1
          1          1             0
          1          0             1
          1          1             0
          1          0             1
          1          1             1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)

In [20]:
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X

DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)

## Introduction to statsmodels

In [21]:
import statsmodels.api as sm
import statsmodels.formula as smf

In [22]:
def dnorm(mean: int, variance: float, size: tuple = 1) -> np.ndarray:
    if isinstance(size, int):
        size = size,    
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [23]:
# Create a linear model using random data
np.random.seed(12345)

N = 100
X = np.c_[dnorm(0, 0.4, size=N), dnorm(0, 0.6, size=N), dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

y = np.dot(X, beta) + eps
print(X[:5], '\n')
print(y[:5], '\n')

# Add an intercept column to a matrix with 'sm.add_constant()'
X_model = sm.add_constant(X)
print(X_model[:5], '\n')

# Perform Ordinary Least Square(OLS) linear regression using the 'sm.OLS' class
model = sm.OLS(y, X)
results = model.fit()
print(results.params)
results.summary()  # Output diagnostic information

[[-0.12946849 -1.21275292  0.50422488]
 [ 0.30291036 -0.43574176 -0.25417986]
 [-0.32852189 -0.02530153  0.13835097]
 [-0.35147471 -0.71960511 -0.25821463]
 [ 1.2432688  -0.37379916 -0.52262905]] 

[ 0.42786349 -0.67348041 -0.09087764 -0.48949442 -0.12894109] 

[[ 1.         -0.12946849 -1.21275292  0.50422488]
 [ 1.          0.30291036 -0.43574176 -0.25417986]
 [ 1.         -0.32852189 -0.02530153  0.13835097]
 [ 1.         -0.35147471 -0.71960511 -0.25821463]
 [ 1.          1.2432688  -0.37379916 -0.52262905]] 

[0.17826108 0.22303962 0.50095093]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.43
Model:,OLS,Adj. R-squared (uncentered):,0.413
Method:,Least Squares,F-statistic:,24.42
Date:,"Fri, 20 Dec 2019",Prob (F-statistic):,7.44e-12
Time:,15:47:55,Log-Likelihood:,-34.305
No. Observations:,100,AIC:,74.61
Df Residuals:,97,BIC:,82.42
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1783,0.053,3.364,0.001,0.073,0.283
x2,0.2230,0.046,4.818,0.000,0.131,0.315
x3,0.5010,0.080,6.237,0.000,0.342,0.660

0,1,2,3
Omnibus:,4.662,Durbin-Watson:,2.201
Prob(Omnibus):,0.097,Jarque-Bera (JB):,4.098
Skew:,0.481,Prob(JB):,0.129
Kurtosis:,3.243,Cond. No.,1.74


In [24]:
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
print(data[:5], '\n')

       col0      col1      col2         y
0 -0.129468 -1.212753  0.504225  0.427863
1  0.302910 -0.435742 -0.254180 -0.673480
2 -0.328522 -0.025302  0.138351 -0.090878
3 -0.351475 -0.719605 -0.258215 -0.489494
4  1.243269 -0.373799 -0.522629 -0.128941 



In [25]:
try:
    results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
    results.params
    results.tvalues
    results.predict(data[:5])
except AttributeError:
    print('Error due to Pandas removal of "ols". Fix error in Further Study')

Error due to Pandas removal of "ols". Fix error in Further Study


In [26]:
# Estimating Time Series Processes
import random
init_x = 4
values = [init_x, init_x]
N = 1000
b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)

for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)
    
MAXLAGS = 5
model = sm.tsa.AR(values)  # AR = Auto-Regressive multivariate model
results = model.fit(MAXLAGS) # Lags = 2 however choose larger number if unknown
results.params # Format [intercept1, lag1, lag2, intercept2, lag1, lag2]

array([-0.00616093,  0.78446347, -0.40847891, -0.01364148,  0.01496872,
        0.01429462])

## Introduction to scikit-learn

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

In [28]:
train = pd.read_csv('datasets/titanic/train.csv') # Load Training dataset
test = pd.read_csv('datasets/titanic/test.csv')   # Load Test dataset
train[:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [29]:
print(train.isnull().sum(), '\n') # Look for missing data
print(test.isnull().sum(), '\n') # Look for missing data

# Fill in missing data for 'Age' column
impute_value = train['Age'].median() # Compute median age from dataset
train['Age'] = train['Age'].fillna(impute_value) # replace missing data
test['Age'] = test['Age'].fillna(impute_value)   # with computed value

# Convert 'Sex' column string type to new column 'isFemale' binary type
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

predicators = ['Pclass', 'IsFemale', 'Age'] # Input variables of interest
X_train = train[predicators].values # Create NumPy arrays: train
X_test = test[predicators].values   # Create NumPy arrays: train
y_train = train['Survived'].values  # Output variable
print(X_train[:5], '\n')
print(y_train[:5])

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64 

[[ 3.  0. 22.]
 [ 1.  1. 38.]
 [ 3.  1. 26.]
 [ 1.  1. 35.]
 [ 3.  0. 35.]] 

[0 1 1 1 0]


In [30]:
# create an instance of a Logistic Regression model
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train) # fit data to the model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
y_predict = model.predict(X_test)
print(y_predict[:10])

[0 0 0 0 1 0 1 0 1 0]


In [32]:
# Cross Validation
model_cv = LogisticRegressionCV(10, cv = 3)
print(model_cv.fit(X_train, y_train), '\n')

# Perform cross validation manually
model = LogisticRegression(C = 10, solver='lbfgs')
scores = cross_val_score(model, X_train, y_train, cv = 4)
print(scores)

LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0) 

[0.77232143 0.80269058 0.77027027 0.78828829]
