# Proteomics data clean up

### Import packages

In [None]:
import pandas as pd

### Load data

In [3]:
raw_data = pd.read_csv("20211937_Dobes_NPX_2022-06-07.csv", delimiter=";")
raw_data

Unnamed: 0,SampleID,Index,OlinkID,UniProt,Assay,MissingFreq,Panel,Panel_Lot_Nr,PlateID,QC_Warning,LOD,NPX,Normalization,Assay_Warning
0,FR20611027,1,OID20823,Q9BXS1,IDI2,0.3760,Neurology,B14809,20211937_SS220285,PASS,0.1167,1.1241,Intensity,PASS
1,FR20610806,2,OID20823,Q9BXS1,IDI2,0.3760,Neurology,B14809,20211937_SS220285,PASS,0.1167,0.4233,Intensity,PASS
2,10583381,3,OID20823,Q9BXS1,IDI2,0.3760,Neurology,B14809,20211937_SS220285,PASS,0.1167,0.0942,Intensity,PASS
3,10405058,4,OID20823,Q9BXS1,IDI2,0.3760,Neurology,B14809,20211937_SS220285,PASS,0.1167,-0.0012,Intensity,PASS
4,FR20584877,5,OID20823,Q9BXS1,IDI2,0.3760,Neurology,B14809,20211937_SS220285,PASS,0.1167,1.2412,Intensity,PASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6765573,FR26871801,91,OID31472,Q8WXX5,DNAJC9,0.0027,Oncology_II,B20705,20211937RR_SS220361,PASS,-1.0872,4.7886,Intensity,PASS
6765574,FR20584010,92,OID31472,Q8WXX5,DNAJC9,0.0027,Oncology_II,B20705,20211937RR_SS220361,PASS,-1.0872,6.2869,Intensity,PASS
6765575,FR20876747,93,OID31472,Q8WXX5,DNAJC9,0.0027,Oncology_II,B20705,20211937RR_SS220361,PASS,-1.0872,4.1082,Intensity,PASS
6765576,10481231,94,OID31472,Q8WXX5,DNAJC9,0.0027,Oncology_II,B20705,20211937RR_SS220361,PASS,-1.0872,4.8411,Intensity,PASS


### Column Names cleanup

In [4]:
raw_data.columns = raw_data.columns.str.lower()
raw_data.rename(
    columns={
        "sampleid": "sample_id",
        "olinkid": "olink_id",
        "assay": "gene_name",
        "missingfreq": "missing_freq",
        "plateid": "plate_id",
    },
    inplace=True,
)

In [5]:
for col in ["qc_warning", "assay_warning"]:
    print(raw_data[col].value_counts())

PASS           6603008
WARN            162203
MANUAL_WARN        367
Name: count, dtype: int64
PASS    6742957
WARN      22621
Name: count, dtype: int64


### Sample cleanup

Drop samples that contain QC warnings in Assay or QC columns.

In [7]:
processed_data = raw_data.loc[
    ~raw_data.qc_warning.str.contains("WARN") | ~(raw_data.assay_warning == "WARN")
]
processed_data.sample_id.value_counts()

sample_id
FR20611027                      2939
FR22635603                      2939
10448894                        2939
FR23821523                      2939
FR17997213                      2939
                                ... 
CONTROL_SAMPLE_US_CS_AS_2-71    1101
CONTROL_SAMPLE_US_CS_AS_2-72    1101
CONTROL_SAMPLE_US_CS_AS_2-21    1101
CONTROL_SAMPLE_US_CS_AS_2-84     736
CONTROL_SAMPLE_US_CS_AS_2-83     736
Name: count, Length: 2354, dtype: int64

Drop samples that contain the word "CONTROL" in the `sample_id` column.

In [10]:
processed_data = processed_data.loc[~processed_data.sample_id.str.contains("CONTROL")]
processed_data.sample_id.value_counts()

sample_id
FR20611027    2939
FR22635603    2939
FR26875717    2939
FR20871305    2939
FR20616154    2939
              ... 
FR20876683    2927
FR21171317    2927
10494878      2927
FR15489758    2924
FR21148245    2924
Name: count, Length: 2250, dtype: int64

Write output to parquet file.

In [11]:
processed_data.to_parquet(
    "proteomics_processed.parquet",
)

{'paths': ['s3://enveda-data-dx/proteomics_prod/sample_id=10379136/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399347/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399372/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399469/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399501/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399508/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399639/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399701/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data-dx/proteomics_prod/sample_id=10399785/54c3043a65884447a5f5c1b4b95bbec9.snappy.parquet',
  's3://enveda-data