In [20]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import cross_val_score
df = pd.read_csv('/Users/devonbancroft/Desktop/Devon-GA-DAT-10-14/Data/housing.csv')

In [6]:
X = df.iloc[:,:-1]
y = df.PRICE

In [10]:
X = (X-X.mean())/X.std()

In [11]:
lasso = Lasso()

In [12]:
lasso.fit(X, y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
lasso.coef_

array([-0.        ,  0.        , -0.        ,  0.        , -0.        ,
        2.71542789, -0.        , -0.        , -0.        , -0.        ,
       -1.34428304,  0.18036988, -3.54677609])

In [17]:
alpha = 10

In [18]:
l1_term = alpha*np.sum(np.abs(lasso.coef_))

In [19]:
l1_term

77.86856908757778

In [27]:
alphas = np.logspace(-4, 4, 9)
cv_scores=[]
for alpha in alphas:
    lasso.set_params(alpha=alpha)
    scores = cross_val_score(estimator=lasso, X=X, y=y, cv=10)
    cv_scores.append((np.mean(scores), alpha))

In [28]:
cv_scores

[(0.20262012278999347, 0.0001),
 (0.20343535169466956, 0.001),
 (0.21144430759385185, 0.01),
 (0.2407890782467927, 0.1),
 (0.1807548507575551, 1.0),
 (-1.2860830508551744, 10.0),
 (-1.2860830508551744, 100.0),
 (-1.2860830508551744, 1000.0),
 (-1.2860830508551744, 10000.0)]

In [29]:
lasso.get_params()

{'alpha': 10000.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [35]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

In [42]:
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [43]:
rf.feature_importances_

array([3.84810513e-02, 7.93449531e-04, 5.63474586e-03, 3.71301337e-04,
       2.48264527e-02, 3.75670933e-01, 1.96180582e-02, 5.90977596e-02,
       6.40682174e-03, 1.77401066e-02, 1.47186284e-02, 1.51036238e-02,
       4.21537068e-01])

In [44]:
feats = pd.DataFrame({
    'Features': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

In [45]:
feats

Unnamed: 0,Features,Importance
12,LSTAT,0.421537
5,RM,0.375671
7,DIS,0.059098
0,CRIM,0.038481
4,NOX,0.024826
6,AGE,0.019618
9,TAX,0.01774
11,B,0.015104
10,PTRATIO,0.014719
8,RAD,0.006407


In [46]:
rf.score(X, y)

0.9763282492038284

In [48]:
feats.Importance.cumsum()

12    0.421537
5     0.797208
7     0.856306
0     0.894787
4     0.919613
6     0.939231
9     0.956971
11    0.972075
10    0.986794
8     0.993201
2     0.998835
1     0.999629
3     1.000000
Name: Importance, dtype: float64

Dendrogams

# Classification

In [49]:
def sigmoid(x):
    return 1 / (1+np.exp(-x))

In [89]:
sigmoid(-2.14)

0.10526938952250978

In [51]:
sigmoid(100)

1.0

In [53]:
sigmoid(-500)

7.124576406741285e-218

In [54]:
sigmoid(7)

0.9990889488055994

In [55]:
train = pd.read_csv('/Users/devonbancroft/Desktop/Devon-GA-DAT-10-14/Class16/train.csv')
test = pd.read_csv('/Users/devonbancroft/Desktop/Devon-GA-DAT-10-14/Class16/test (1).csv')

In [56]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
import seaborn as sns

In [73]:
X_train = train.loc[:,['Sex', 'Pclass']]
y_train = train.Survived
X_test = test.loc[:,['Sex', 'Pclass']]

In [74]:
X_train = pd.get_dummies(X_train, columns=['Sex', 'Pclass'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['Sex', 'Pclass'], drop_first=True)

In [75]:
X_train

Unnamed: 0,Sex_male,Pclass_2,Pclass_3
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,0
4,1,0,1
5,1,0,1
6,1,0,0
7,1,0,1
8,0,0,1
9,0,1,0


In [76]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [78]:
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
logreg.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [82]:
#left column is chance it will be 0, right is chance it will be 1
logreg.predict_proba(X_test)

array([[0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.75093089, 0.24906911],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.75093089, 0.24906911],
       [0.41462909, 0.58537091],
       [0.89503473, 0.10496527],
       [0.89503473, 0.10496527],
       [0.60521334, 0.39478666],
       [0.11295975, 0.88704025],
       [0.75093089, 0.24906911],
       [0.11295975, 0.88704025],
       [0.20028547, 0.79971453],
       [0.75093089, 0.24906911],
       [0.89503473, 0.10496527],
       [0.41462909, 0.58537091],
       [0.41462909, 0.58537091],
       [0.60521334, 0.39478666],
       [0.89503473, 0.10496527],
       [0.11295975, 0.88704025],
       [0.60521334, 0.39478666],
       [0.11295975, 0.88704025],
       [0.89503473, 0.10496527],
       [0.11295975, 0.88704025],
       [0.89503473, 0.10496527],
       [0.60521334, 0.39478666],
       [0.89503473, 0.10496527],
       [0.

In [83]:
#THe probability
logreg.predict_proba(X_test)[:,1]

array([0.10496527, 0.58537091, 0.24906911, 0.10496527, 0.58537091,
       0.10496527, 0.58537091, 0.24906911, 0.58537091, 0.10496527,
       0.10496527, 0.39478666, 0.88704025, 0.24906911, 0.88704025,
       0.79971453, 0.24906911, 0.10496527, 0.58537091, 0.58537091,
       0.39478666, 0.10496527, 0.88704025, 0.39478666, 0.88704025,
       0.10496527, 0.88704025, 0.10496527, 0.39478666, 0.10496527,
       0.24906911, 0.24906911, 0.58537091, 0.58537091, 0.39478666,
       0.10496527, 0.58537091, 0.58537091, 0.10496527, 0.10496527,
       0.10496527, 0.39478666, 0.10496527, 0.79971453, 0.88704025,
       0.10496527, 0.39478666, 0.10496527, 0.88704025, 0.58537091,
       0.39478666, 0.24906911, 0.79971453, 0.88704025, 0.24906911,
       0.10496527, 0.10496527, 0.10496527, 0.10496527, 0.88704025,
       0.10496527, 0.24906911, 0.10496527, 0.58537091, 0.39478666,
       0.79971453, 0.58537091, 0.39478666, 0.39478666, 0.88704025,
       0.58537091, 0.10496527, 0.58537091, 0.39478666, 0.88704

In [84]:
logreg.intercept_

array([2.06085883])

In [85]:
logreg.coef_

array([[-2.48809433, -0.67634771, -1.7159975 ]])

In [87]:
coeffs = pd.DataFrame({
    'variable': X_train.columns,
    'Weight': logreg.coef_[0]
})

In [88]:
coeffs

Unnamed: 0,variable,Weight
0,Sex_male,-2.488094
1,Pclass_2,-0.676348
2,Pclass_3,-1.715997


In [96]:
#This does y = m1 + m2x
output = X_train.dot(logreg.coef_.T)+logreg.intercept_

In [92]:
output.head()

Unnamed: 0,0
0,-2.143233
1,2.060859
2,0.344861
3,2.060859
4,-2.143233


In [93]:
sigmoid(-2.14)

0.10526938952250978

In [97]:
#This get the probability output (predict_proba)
sigmoid(output).head()

Unnamed: 0,0
0,0.104965
1,0.88704
2,0.585371
3,0.88704
4,0.104965


# Random Forest Classification

In [99]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [100]:
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [102]:
rf.feature_importances_

array([0.70526034, 0.04243283, 0.25230683])

In [104]:
importances = pd.DataFrame({
    'variable': X_train.columns,
    'Weight': rf.feature_importances_[0]
})
importances

Unnamed: 0,variable,Weight
0,Sex_male,0.70526
1,Pclass_2,0.70526
2,Pclass_3,0.70526


In [105]:
rf.predict_proba(X_test)

array([[0.86281851, 0.13718149],
       [0.50774207, 0.49225793],
       [0.83529315, 0.16470685],
       [0.86281851, 0.13718149],
       [0.50774207, 0.49225793],
       [0.86281851, 0.13718149],
       [0.50774207, 0.49225793],
       [0.83529315, 0.16470685],
       [0.50774207, 0.49225793],
       [0.86281851, 0.13718149],
       [0.86281851, 0.13718149],
       [0.62774288, 0.37225712],
       [0.02869862, 0.97130138],
       [0.83529315, 0.16470685],
       [0.02869862, 0.97130138],
       [0.08516805, 0.91483195],
       [0.83529315, 0.16470685],
       [0.86281851, 0.13718149],
       [0.50774207, 0.49225793],
       [0.50774207, 0.49225793],
       [0.62774288, 0.37225712],
       [0.86281851, 0.13718149],
       [0.02869862, 0.97130138],
       [0.62774288, 0.37225712],
       [0.02869862, 0.97130138],
       [0.86281851, 0.13718149],
       [0.02869862, 0.97130138],
       [0.86281851, 0.13718149],
       [0.62774288, 0.37225712],
       [0.86281851, 0.13718149],
       [0.