# SOMALogic Proteomic Analysis – Application of Machine Learning for Prediction
Implementation of non-parametric machine learning models to establish proteomic and cellular markers predictive of increased refracture rate.

Author: Ethan Dinh, University of Oregon 

Date Created: December 13, 2022

---

# Importing Libraries

Loading in all of the necessary libraries to initiate the data analysis:
1. Pandas – Builds the dataframe for the data analysis. Allows for simple data alterations and filtering.
2. Sklearn – Used for the preprocessing of data for the machine learning models.
3. Matplotlib – Enables graphing

In [2]:
# Dataframe Libraries
import pandas as pd
import numpy as np

# Matplotlib & Seaborn – Graphical Visualization Library
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning Preprocessing Library
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif

# Filters out the warning messages for model training and evaluation
import warnings
warnings.filterwarnings('ignore')

# Importing Javascript Libraries
from IPython.display import Javascript

---

# Data Preprocessing
Includes organizing, filtering, processing, and renaming.
* Pointed to the file path of the original data files: "Combined Data.xlsx" and "LK Clean.xlsx"
* Using pandas, created two datasets containing the original data
* To identify the treatment week, each sample ID was split into a week and a sample number
* Unnecessary columns were then removed, and the remaining columns were renamed to match their respective information

The cell below loads the combined, transformed file ("ED-Combined-Transformed.csv"), drops the sample ID column, and removes every row that contains a missing value.


In [26]:
# Location of the combined, transformed SOMA data file
filePath = "/Users/ethandinh/Knight Campus/Projects/SOMA Data Analysis/SOMA Proteomic Panel Analysis/Data/ED-Combined-Transformed.csv"
somaData = pd.read_csv(filePath)

# Build the working frame from the raw one without mutating anything in place:
#   1. drop the "SampleId" column
#   2. drop every ROW (axis = 0) that contains at least one missing value
df = somaData.drop(columns = "SampleId").dropna(axis = 0)

# Display the dataset
display(df)

Unnamed: 0,refracture,VDR-T1,NCF-2-T1,TERF1-T1,FOXM1-T1,FANCL-T1,CBS-T1,PIAS4-T1,IL-10 Ra-T1,STAT3-T1,...,THOC1-T5,CRKL-T5,XRCC4-T5,RBBP6-T5,AF1L1-T5,Tankyrase-1-T5,GABR2:CD-T5,TM59L-T5,CSMD2-T5,AMGO1:CD-T5
0,1,1784.9,821.3,842.0,730.9,811.2,2536.8,934.7,2481.8,8230.6,...,1878.9,3206.4,700.7,3439.1,401.5,1402.3,603.2,2527.8,7092.4,1743.6
1,0,1534.5,775.6,871.8,859.1,774.6,2157.7,866.7,2307.6,11718.8,...,1550.4,3557.9,835.7,3791.0,333.3,1189.1,686.6,2670.1,7601.2,1443.3
2,0,1465.4,954.9,872.1,809.8,824.4,2507.2,920.0,2626.2,9366.2,...,1934.4,2608.9,570.3,3181.2,409.8,1339.7,292.3,1965.2,7586.3,6748.4
3,1,1638.6,961.8,881.6,896.5,866.7,2205.2,771.0,2732.3,15796.1,...,1820.7,7838.2,941.7,3524.3,380.9,1405.8,626.1,2569.9,6126.3,1546.3
4,1,1696.0,912.3,877.9,850.0,864.7,2373.4,973.6,2702.0,10825.3,...,1754.3,4159.6,1106.6,3840.4,346.7,1294.5,758.2,2771.7,7443.5,1293.9
5,0,1435.7,878.0,873.8,856.7,845.1,2153.2,813.7,2813.7,13575.0,...,1723.1,6024.7,975.6,3887.3,303.9,1299.7,628.2,2801.1,7309.4,3057.8
10,0,1542.2,1079.1,937.4,896.5,904.5,1992.1,891.3,2296.2,13622.4,...,1619.8,1918.0,674.0,3695.2,315.2,1251.6,587.5,2604.5,5768.0,2045.8
11,0,1791.8,778.1,814.9,802.0,796.7,2336.5,968.7,2781.2,8581.4,...,1711.6,6409.7,713.8,3748.3,319.2,1295.2,620.0,2668.2,7880.9,1492.6
12,0,1720.7,888.7,819.9,1039.8,814.9,2192.3,892.6,2755.0,14620.8,...,1635.0,4518.0,1245.8,3664.8,332.0,1286.5,634.1,2678.7,7025.3,990.9
13,0,1414.0,949.5,1984.7,652.3,564.0,2098.8,606.0,2552.3,10834.2,...,1667.9,3645.7,1120.3,3840.3,298.0,1287.0,735.4,2720.6,6046.4,1653.2


In [60]:
from typing import Dict, List

def find_cols(df, name) -> List[str]:
    """Return the column labels of ``df`` whose base name contains ``name``.

    The last three characters of each label are ignored while matching, so a
    trailing suffix such as ``-T1`` never takes part in the comparison. The
    match is a plain substring test: ``"IL-10"`` also selects
    ``"IL-10 Ra-T1"``.

    Args:
        df: Object exposing a ``columns`` iterable of string labels.
        name: Substring to search for in each truncated label.

    Returns:
        The full (untruncated) matching labels, in their original order.
    """
    return [label for label in df.columns if name in label[:-3]]

def split_list(name_list) -> Dict[str, List[str]]:
    """Group column names by base name, anchored on the leading "T1" entries.

    The base name of a column is its label without the last three characters
    (e.g. ``"IL-10-T1"`` -> ``"IL-10"``). Only base names that occur in the
    leading run of labels ending in ``"T1"`` become groups; every label in
    ``name_list`` sharing such a base name is added to that group in its
    original order, and ``"refracture"`` is appended as the final element of
    each group.

    Args:
        name_list: Sequence of column labels, expected to start with the
            ``T1`` label of every marker.

    Returns:
        Dict mapping each base name to ``[matching labels..., "refracture"]``.
        An empty ``name_list`` yields an empty dict (and prints nothing).

    Side effects:
        Prints the number of leading ``T1`` labels found.
    """
    if not name_list:
        # Nothing to group; also avoids indexing name_list[0] below.
        return {}

    # Count the leading run of "T1" labels. Iterating over the list (instead
    # of an unbounded index loop) cannot run past the end when every label
    # ends in "T1".
    counter = 0
    for label in name_list:
        if label[-2:] != "T1":
            break
        counter += 1

    # NOTE(review): name_list[0][-2] is the second-to-last character of the
    # first label ("T" for a "-T1" suffix); kept as-is to preserve the output.
    print(f"There are {counter} types of {name_list[0][-2]} Markers.")

    # One (ordered) group per distinct base name seen in the T1 run.
    output: Dict[str, List[str]] = {label[:-3]: [] for label in name_list[:counter]}

    # Single pass over all labels instead of rescanning the list per group.
    for label in name_list:
        base = label[:-3]
        if base in output:
            output[base].append(label)

    # The outcome column closes every group.
    for columns in output.values():
        columns.append("refracture")
    return output

In [76]:
# Collect every column whose base name (label minus its last three characters)
# contains the substring "IL-10".
IL_10 = find_cols(df, "IL-10")
# Group those columns by base name; each group's list ends with "refracture".
IL_10_split = split_list(IL_10)
# Show the resulting {base name: [columns..., "refracture"]} mapping.
print(IL_10_split)

There are 5 types of T Markers.
{'IL-10 Ra': ['IL-10 Ra-T1', 'IL-10 Ra-T2', 'IL-10 Ra-T3', 'IL-10 Ra-T4', 'IL-10 Ra-T5', 'refracture'], 'IL-10 Rb': ['IL-10 Rb-T1', 'IL-10 Rb-T2', 'IL-10 Rb-T3', 'IL-10 Rb-T4', 'IL-10 Rb-T5', 'refracture'], 'IL-10': ['IL-10-T1', 'IL-10-T2', 'IL-10-T3', 'IL-10-T4', 'IL-10-T5', 'refracture'], 'IL-10 Ra:ECD': ['IL-10 Ra:ECD-T1', 'IL-10 Ra:ECD-T2', 'IL-10 Ra:ECD-T3', 'IL-10 Ra:ECD-T4', 'IL-10 Ra:ECD-T5', 'refracture'], 'IL-10 Ra:CD': ['IL-10 Ra:CD-T1', 'IL-10 Ra:CD-T2', 'IL-10 Ra:CD-T3', 'IL-10 Ra:CD-T4', 'IL-10 Ra:CD-T5', 'refracture']}


In [84]:
# Select the columns listed under the "IL-10" group (its time-point columns
# followed by "refracture"). The explicit .copy() makes IL_10_DF an independent
# frame, so the assignment below is not a chained assignment into a selection
# of `df` (which raises SettingWithCopyWarning; that warning is silenced by the
# global warnings filter set in the import cell).
IL_10_DF = df[IL_10_split["IL-10"]].copy()

# Normalization
# StandardScaler is already imported in the library cell at the top of the
# notebook, so it is not re-imported here.
# Scale every column except the "refracture" label, selected by name rather
# than by assuming the label is the last column.
feats = [col for col in IL_10_DF.columns if col != "refracture"]
IL_10_DF[feats] = StandardScaler().fit_transform(IL_10_DF[feats])

In [82]:
import plotly.express as px

# Parallel-coordinates plot of IL_10_DF, coloured by the "refracture" column
# with the diverging colour scale centred at 0.5.
fig = px.parallel_coordinates(
    IL_10_DF,
    color = "refracture",
    color_continuous_scale = px.colors.diverging.Tealrose,
    color_continuous_midpoint = 0.5,
)
fig.show()