In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (VarianceThreshold,SelectFdr)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import (LassoCV, LassoCV, RidgeCV, RidgeClassifier, RidgeClassifierCV, ElasticNet, ElasticNetCV,
                                  RandomizedLogisticRegression)
from sklearn.pipeline import Pipeline

In [3]:
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine that has this exact file; consider a configurable data directory.
df = pd.read_csv('/Users/lina/Documents/LungMAP/E16.5/Scott_customized_mask_features.csv')     # polygen training set
# `DataFrame.ix` was removed in pandas 1.0; both selections below are purely
# positional, so `.iloc` is the direct replacement.
# Features: every column except the last three.
X = df.iloc[:, :-3]
# Target: the third-from-last column, cast to integer class ids.
y = df.iloc[:, -3].astype('int')

In [4]:
# Candidate regularisation strengths, spanning 1e-5 up to 10.
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
# Parameter grid keyed by the estimator argument name; shares the list above.
ridge_params = dict(alpha=alpha)

In [5]:
# Ridge classifier that cross-validates (cv=5) over the candidate `alpha` values.
clf = RidgeClassifierCV(alphas=alpha, cv=5, fit_intercept=True)

# Steps run in order: scale the features, select features with SelectFdr
# (default settings), then classify.
pipe = Pipeline(steps=[
    ('standard_scalar', StandardScaler()),
    ('feature_selection', SelectFdr()),
    ('classification', clf),
])

# Every warning raised during fitting is suppressed; the previous warning
# filters are restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    pipe.fit(X, y)

In [6]:
def prob_mnlr(X_train, y_train, X_pred, Coef, Intercept=None):
    """Convert linear class scores into per-class probabilities with a softmax.

    The training data is used only to re-fit the preprocessing (a
    ``StandardScaler`` followed by ``SelectFdr``, both with default settings).
    ``X_pred`` is passed through that fitted preprocessing, multiplied by
    ``Coef`` and normalised row-wise so every row sums to 1.

    Parameters
    ----------
    X_train : array-like, shape [n_samples, m_features]
        Training features used to fit the scaler and the feature selector.
    y_train : array-like, shape [n_samples]
        Training labels, required by the feature selector.
    X_pred : array-like, shape [n_subregions, m_features]
        Data to be categorized.
    Coef : ndarray, shape [K_classes, m_selected_features]
        Regression coefficients, one row per class. Its columns must line up
        with the features kept by the selector.
    Intercept : array-like, shape [K_classes], optional
        Per-class intercept added to the linear scores. The default (None)
        adds nothing, i.e. the scores are ``X @ Coef.T`` only.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X_pred`` (default integer index) and five columns:
        background, blood_vessel, bronchiole, distal_acinar_tubule,
        proximal_acinar_tubule. Column names are assigned positionally, so the
        rows of ``Coef`` are assumed to follow that class order -- TODO confirm
        against the estimator's ``classes_``.
    """

    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_pred_scaled = scaler.transform(X_pred)

    # The selector only needs to be fitted on the training data; the selected
    # training matrix itself is never used.
    selector = SelectFdr()
    selector.fit(X_train_scaled, y_train)
    X_pred_selected = selector.transform(X_pred_scaled)

    scores = np.asarray(X_pred_selected @ Coef.T, dtype=float)
    if Intercept is not None:
        scores = scores + np.asarray(Intercept, dtype=float)

    # Subtracting each row's maximum leaves the softmax value unchanged but
    # keeps np.exp from overflowing to inf (which would yield inf/inf = nan).
    scores = scores - np.max(scores, axis=1, keepdims=True)
    numerator = np.exp(scores)
    denominator = np.sum(numerator, axis=1, keepdims=True)

    Out = pd.DataFrame(numerator / denominator)
    Out.columns = ['background','blood_vessel','bronchiole','distal_acinar_tubule','proximal_acinar_tubule']
    return Out

### Test it out in 10 randomly selected sub-regions

In [7]:
# Fixed seed so the same sub-regions are drawn on every run.
np.random.seed(110)
# 10 distinct row positions, sampled without replacement.
idx = np.random.choice(len(y), 10, replace=False)
# `idx` holds row positions, so both selections are positional. Plain `y[idx]`
# would be a label lookup that only matches when the index is the default 0..n-1.
x1 = X.iloc[idx]
y1 = y.iloc[idx]

In [8]:
# Show the class ids of the sampled sub-regions.
y1

134    3
387    5
116    3
138    3
286    4
223    4
333    4
184    4
201    4
376    5
Name: class_id, dtype: int64

In [9]:
# Fetch the fitted classifier through the public `named_steps` mapping instead
# of the private `_final_estimator` attribute.
ridge_coef = pipe.named_steps['classification'].coef_
# prob_mnlr re-fits its own scaler and feature selector on (X, y) before
# scoring the sampled rows with these coefficients.
prob_ridge = prob_mnlr(X_train=X, y_train=y, X_pred=x1, Coef=ridge_coef)
prob_ridge



Unnamed: 0,background,blood_vessel,bronchiole,distal_acinar_tubule,proximal_acinar_tubule
0,0.072749,0.103584,0.705041,0.03682,0.081806
1,0.179161,0.161631,0.146647,0.070436,0.442125
2,0.105217,0.174338,0.588601,0.047043,0.0848
3,0.194181,0.174678,0.401774,0.067347,0.162019
4,0.136237,0.200694,0.158248,0.381871,0.122949
5,0.180386,0.160397,0.173324,0.407464,0.078429
6,0.092644,0.149965,0.127839,0.505079,0.124473
7,0.125799,0.150763,0.138197,0.467942,0.117299
8,0.115664,0.148168,0.132254,0.447741,0.156174
9,0.141792,0.128423,0.129607,0.039663,0.560515


In [10]:
# Predict classes for the sampled rows with the fitted pipeline; all warnings
# are silenced for the duration of the call.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    y_pred = pipe.predict(x1)

In [11]:
# Show the classes predicted by the pipeline for the sampled sub-regions.
y_pred

array([3, 5, 3, 3, 4, 4, 4, 4, 4, 5])