# Logistic Regression

In [1]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from sklearn.svm import SVC
import sklearn as sk
import seaborn as sns 
import pandas as pd
import numpy as np
import warnings

In [2]:
#given a dataset it returns a list will all the values of the targets (hypo = 0 and Norm = 0)
def labels(df):
    target = []
    for c in df.columns:
        if "Hypoxia" in c.replace('\"', '').split("_") or "Hypo" in c.split("_"):
            target.append(0.0)
        elif "Normoxia" in c.replace('\"', '').split("_") or "Norm" in c.split("_"):
            target.append(1.0)
        else:
            
            print(c)
            raise ValueError("Cell cannot be categorized")
    return target
        

In [3]:
#-----SmartSeq-----#
HCC_s_Train = pd.read_csv("processed_data\HCC1806_SmartS_Filtered_Standardized-Normalised_3000_Data_train.txt", delimiter="\ ",engine='python',index_col=0)
MCF_s_Train = pd.read_csv("processed_data\MCF7_SmartS_Filtered_Standardized-Normalised_3000_Data_train.txt", delimiter="\ ",engine='python',index_col=0)
HCC_s_Test = pd.read_csv("processed_data\HCC1806_SmartS_Filtered_Standardized-Normalised_3000_Data_test.txt", delimiter="\ ",engine='python',index_col=0)
MCF_s_Test = pd.read_csv("processed_data\MCF7_SmartS_Filtered_Standardized-Normalised_3000_Data_test.txt", delimiter="\ ",engine='python',index_col=0)

#-----DropSeq-----#
HCC_d = pd.read_csv("raw_data_DropSeq\HCC1806_Filtered_Normalised_3000_Data_train.txt", delimiter="\ ",engine='python',index_col=0)
MCF_d = pd.read_csv("raw_data_DropSeq\MCF7_Filtered_Normalised_3000_Data_train.txt", delimiter="\ ",engine='python',index_col=0)


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def logi_model(df, y):
    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    model = pipe.fit(df.T, y)
    return model

#-----SmartSeq-----#
HCC_s_logi_model = logi_model(HCC_s_Train, labels(HCC_s_Train))
MCF_s_logi_model = logi_model(MCF_s_Train, labels(MCF_s_Train))

#-----DropSeq-----#
HCC_d_train, HCC_d_test, y_HCC_d_train, y_HCC_d_test = train_test_split(HCC_d.T, labels(HCC_d))
MCF_d_train, MCF_d_test, y_MCF_d_train, y_MCF_d_test = train_test_split(MCF_d.T, labels(MCF_d))
HCC_d_logi_model = logi_model(HCC_d_train.T, y_HCC_d_train)
MCF_d_logi_model = logi_model(MCF_d_train.T, y_MCF_d_train)


# Perfromance Measures

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
warnings.simplefilter('ignore')

def cv_score(df, model):
    scaler = StandardScaler()
    return cross_val_score(model, scaler.fit_transform(df.T), labels(df), cv=KFold(shuffle=True, n_splits=5), scoring="accuracy")

#-----SmartSeq-----#
print("Cross validation score: ", cv_score(HCC_s_Train, HCC_s_logi_model))
print("Cross validation score: ", cv_score(MCF_s_Train, MCF_s_logi_model))

#-----DropSeq-----#
print("Cross validation score: ", cv_score(HCC_d_train.T, HCC_d_logi_model))
print("Cross validation score: ", cv_score(MCF_d_train.T, MCF_d_logi_model))

Cross validation score:  [0.97297297 1.         0.94444444 0.91666667 0.94444444]
Cross validation score:  [1. 1. 1. 1. 1.]
Cross validation score:  [0.93508852 0.92506812 0.9191644  0.92098093 0.92506812]
Cross validation score:  [0.96393342 0.97102343 0.96023428 0.96146732 0.96546408]


In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def pred_accuracy(df, model, title=""):
    print(title)
    print("Score: ", model.score(df.T, y=labels(df)))
    #print("Accuracy Score: ", accuracy_score(labels(df),model.predict(df.T)))
    print("Confusion matrix: \n", confusion_matrix(model.predict(df.T), labels(df)), "\n")

#-----SmartSeq-----#
pred_accuracy(HCC_s_Test, HCC_s_logi_model, "SmartSeq HCC:")
pred_accuracy(MCF_s_Test, MCF_s_logi_model, "SmartSeq MCF:")

#-----DropSeq-----#
pred_accuracy(HCC_d_test.T, HCC_d_logi_model, "DropSeq HCC:")
pred_accuracy(MCF_d_test.T, MCF_d_logi_model, "DropSeq MCF:")


SmartSeq HCC:
Score:  0.9555555555555556
Accuracy Score:  0.9555555555555556
confusion_matrix: 
 [[18  2]
 [ 0 25]] 

SmartSeq MCF:
Score:  0.9838709677419355
Accuracy Score:  0.9838709677419355
confusion_matrix: 
 [[29  1]
 [ 0 32]] 

DropSeq HCC:
Score:  0.9245437210569327
Accuracy Score:  0.9245437210569327
confusion_matrix: 
 [[2106  109]
 [ 168 1288]] 

DropSeq MCF:
Score:  0.9644904753097836
Accuracy Score:  0.9644904753097836
confusion_matrix: 
 [[2125   97]
 [  95 3090]] 

