In [1]:
import os
import pathlib
import pandas as pd

def strip_path_component(path, component="synthetics"):
    """Return ``path`` with every component exactly equal to ``component`` removed.

    Works on whole path components rather than on the raw string, so a
    directory such as ``synthetics_v2`` is left untouched and the result does
    not depend on the platform's path separator.

    Parameters
    ----------
    path : pathlib.PurePath
        Path to clean up.
    component : str, default "synthetics"
        Directory name to drop wherever it occurs as a full component.

    Returns
    -------
    pathlib.PurePath
        A path of the same class as ``path`` without the dropped components.
    """
    kept_parts = [part for part in path.parts if part != component]
    return type(path)(*kept_parts)


# Project root: the working directory with the "synthetics" folder removed
# (presumably the notebook is launched from <root>/synthetics -- confirm).
base_path = strip_path_component(pathlib.Path.cwd())

# All inputs and GWAS outputs live under <root>/mice_data_set.
mice_root = base_path / 'mice_data_set'
data_path = mice_root / 'data'
real_gwas_path = mice_root / 'out'
synthetic_gwas_path = mice_root / 'out_synth_highassoc'
synthetic_gwas_path_lowassoc = mice_root / 'out_synth_lowassoc'

In [2]:
# Phenotype whose GWAS result files are compared throughout the notebook.
PHENOTYPE = 'abBMD'

# Load the GWAS results computed on the real data.
real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv')
real_snps = (
    real_snps
    # The first CSV column is renamed to 'index' whatever its header is.
    .rename(columns={real_snps.columns[0]: 'index'})
    .loc[:, ['index', 'snp', 'p']]
    # Flag SNPs with p <= 1e-8 (presumably the significance cut-off -- confirm).
    # A vectorised comparison replaces the row-wise lambda, and .assign builds
    # a new frame instead of writing a column onto a column subset.
    .assign(interest=lambda df: df['p'] <= 1e-8)
)
real_snps

Unnamed: 0,index,snp,p,interest
0,1,rs29477109,5.052317e-14,True
1,2,rs27071351,7.074181e-14,True
2,3,rs27024162,7.170582e-14,True
3,4,rs49423067,7.198661e-14,True
4,5,rs29470802,8.049849e-14,True
...,...,...,...,...
79640,79641,rs3162358,9.998911e-01,False
79641,79642,rs50509099,9.999012e-01,False
79642,79643,rs47505090,9.999041e-01,False
79643,79644,rs232293770,9.999351e-01,False


In [3]:
# Load the GWAS results computed on the synthetic data ("highassoc" output folder).
synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_{PHENOTYPE}_1_17.csv')
synthetic_snps = (
    synthetic_snps
    # The first CSV column is renamed to 'index' whatever its header is.
    .rename(columns={synthetic_snps.columns[0]: 'index'})
    .loc[:, ['index', 'snp', 'p']]
    # Same p <= 1e-8 flag as for the real results, computed as a vectorised
    # comparison rather than a per-row lambda.
    .assign(interest=lambda df: df['p'] <= 1e-8)
)
synthetic_snps

Unnamed: 0,index,snp,p,interest
0,1,rs49725879,5.032094e-47,True
1,2,rs49153109,5.653305e-45,True
2,3,rs6284806,1.9634690000000003e-43,True
3,4,rs27037853,3.768081e-43,True
4,5,rs27037855,8.046937e-43,True
5,6,rs50536616,9.357014e-43,True
6,7,rs240744127,1.512353e-41,True
7,8,rs6258876,4.196316e-28,True
8,9,rs27052855,6.2708140000000006e-27,True
9,10,rs29395706,2.131591e-26,True


In [4]:
# Keep only SNPs present in both result sets; columns other than 'snp' get a
# '_synthetic' / '_real' suffix according to the frame they came from.
combined = synthetic_snps.merge(
    real_snps,
    on='snp',
    how='inner',
    suffixes=('_synthetic', '_real'),
)
combined

Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,1,rs49725879,5.032094e-47,True,119,3.01468e-11,True
1,2,rs49153109,5.653305e-45,True,28,2.515312e-13,True
2,3,rs6284806,1.9634690000000003e-43,True,30,2.597932e-13,True
3,4,rs27037853,3.768081e-43,True,41,3.267574e-13,True
4,5,rs27037855,8.046937e-43,True,40,3.180409e-13,True
5,6,rs50536616,9.357014e-43,True,37,3.153627e-13,True
6,7,rs240744127,1.512353e-41,True,35,3.121702e-13,True
7,8,rs6258876,4.196316e-28,True,149,2.602931e-10,True
8,9,rs27052855,6.2708140000000006e-27,True,158,7.790997e-10,True
9,10,rs29395706,2.131591e-26,True,83,2.08605e-12,True


In [5]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Treat the real-data flag as ground truth and the synthetic-data flag as the
# prediction, then report per-class metrics and the binary F1 score.
y_true = combined['interest_real']
y_pred = combined['interest_synthetic']

print(classification_report(y_true, y_pred))
print(f"f1_score is: {f1_score(y_true, y_pred):.04f}")


              precision    recall  f1-score   support

       False       0.00      0.00      0.00         2
        True       0.88      1.00      0.93        14

    accuracy                           0.88        16
   macro avg       0.44      0.50      0.47        16
weighted avg       0.77      0.88      0.82        16

f1_score is: 0.9333


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Load the synthetic-data GWAS results from the "lowassoc" output folder.
# NOTE(review): the name `ssl` is also the name of a standard-library module;
# it is kept here because a later cell refers to it.
ssl = pd.read_csv(synthetic_gwas_path_lowassoc / f'lm_{PHENOTYPE}_1_17.csv')
ssl = (
    ssl
    # The first CSV column is renamed to 'index' whatever its header is.
    .rename(columns={ssl.columns[0]: 'index'})
    .loc[:, ['index', 'snp', 'p']]
    # Same p <= 1e-8 flag as for the other result sets, computed as a
    # vectorised comparison rather than a per-row lambda.
    .assign(interest=lambda df: df['p'] <= 1e-8)
)
ssl

Unnamed: 0,index,snp,p,interest
0,1,cfw-13-24187180,0.03804,False
1,2,rs29880636,0.04583,False
2,3,rs257562009,0.433289,False
3,4,rs29873068,0.747245,False
4,5,rs225065578,0.786949,False
5,6,rs29885281,0.867104,False
6,7,rs29883093,0.876417,False
7,8,rs232346591,0.908962,False
8,9,rs216070209,0.941498,False
9,10,rs29880124,0.961485,False


In [7]:
# Match the low-association synthetic results to the real results on 'snp';
# the remaining columns are suffixed '_synthetic' / '_real'.
ssl_c = ssl.merge(
    real_snps,
    on='snp',
    how='inner',
    suffixes=('_synthetic', '_real'),
)

# Real flag is the reference, synthetic flag is the prediction.
# NOTE(review): in the recorded run neither column contains a True value, so
# the binary F1 is reported as 0.0000 alongside a warning.
y_true = ssl_c['interest_real']
y_pred = ssl_c['interest_synthetic']

print(classification_report(y_true, y_pred))
print(f"f1_score is: {f1_score(y_true, y_pred):.04f}")



              precision    recall  f1-score   support

       False       1.00      1.00      1.00        16

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16

f1_score is: 0.0000


  average, "true nor predicted", 'F-score is', len(true_sum)


In [8]:
# Display the low-association merge (synthetic rows matched to real rows on 'snp').
ssl_c

Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,1,cfw-13-24187180,0.03804,False,53983,0.667277,False
1,2,rs29880636,0.04583,False,61337,0.761604,False
2,3,rs257562009,0.433289,False,22745,0.271553,False
3,4,rs29873068,0.747245,False,71750,0.897957,False
4,5,rs225065578,0.786949,False,42102,0.511948,False
5,6,rs29885281,0.867104,False,78865,0.98981,False
6,7,rs29883093,0.876417,False,70430,0.880104,False
7,8,rs232346591,0.908962,False,79119,0.99309,False
8,9,rs216070209,0.941498,False,73114,0.915426,False
9,10,rs29880124,0.961485,False,77893,0.977462,False


In [10]:
# Stack the high-association and low-association merges into one frame.
# Both inputs carry their own 0-based row labels, so ignore_index=True renumbers
# the result instead of leaving duplicate labels behind.
highlow = pd.concat([combined, ssl_c], ignore_index=True)
highlow


Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,1,rs49725879,5.032094e-47,True,119,3.01468e-11,True
1,2,rs49153109,5.653305e-45,True,28,2.515312e-13,True
2,3,rs6284806,1.9634690000000003e-43,True,30,2.597932e-13,True
3,4,rs27037853,3.768081e-43,True,41,3.267574e-13,True
4,5,rs27037855,8.046937e-43,True,40,3.180409e-13,True
5,6,rs50536616,9.357014e-43,True,37,3.153627e-13,True
6,7,rs240744127,1.512353e-41,True,35,3.121702e-13,True
7,8,rs6258876,4.196316e-28,True,149,2.602931e-10,True
8,9,rs27052855,6.2708140000000006e-27,True,158,7.790997e-10,True
9,10,rs29395706,2.131591e-26,True,83,2.08605e-12,True


In [11]:
# Score the stacked frame: real flag as reference, synthetic flag as prediction.
y_true = highlow['interest_real']
y_pred = highlow['interest_synthetic']

print(classification_report(y_true, y_pred))
print(f"f1_score is: {f1_score(y_true, y_pred):.04f}")



              precision    recall  f1-score   support

       False       1.00      0.89      0.94        18
        True       0.88      1.00      0.93        14

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.95      0.94      0.94        32

f1_score is: 0.9333


In [17]:
def load_gwas_table(csv_path, p_threshold=1e-8):
    """Read a GWAS result CSV and flag the SNPs at or below a p-value threshold.

    Parameters
    ----------
    csv_path : path-like
        CSV file with at least the columns 'snp' and 'p'. Its first column is
        renamed to 'index' whatever its header is.
    p_threshold : float, default 1e-8
        A SNP is flagged when its p-value is less than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'index', 'snp', 'p' and the boolean column 'interest'.
    """
    table = pd.read_csv(csv_path)
    return (
        table
        .rename(columns={table.columns[0]: 'index'})
        .loc[:, ['index', 'snp', 'p']]
        # Vectorised comparison instead of a per-row lambda.
        .assign(interest=lambda df: df['p'] <= p_threshold)
    )


orig_path = base_path / 'mice_data_set' / 'out_alex'

# Re-stated from an earlier cell; not used by the file names below.
PHENOTYPE = 'abBMD'

# Both tables go through the same loader; the original cell repeated the same
# four statements for each file.
orig = load_gwas_table(orig_path / 'orig.csv')
syn = load_gwas_table(orig_path / 'syn.csv')

# Keep SNPs present in both tables; non-key columns are suffixed by origin
# (syn -> '_synthetic', orig -> '_real').
origsyn = pd.merge(syn,
                   orig,
                   how='inner',
                   on=['snp'],
                   suffixes=['_synthetic', '_real'])
origsyn

Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,11,rs29477109,1.792518e-10,True,1,5.052317e-14,True
1,2,rs27037823,1.361446e-12,True,2,2.66696e-12,True
2,5,rs29435046,8.595568e-12,True,3,3.149144e-12,True
3,3,rs27037798,1.415458e-12,True,4,3.203628e-12,True
4,10,rs27070712,9.308289e-11,True,5,5.49426e-12,True
5,11,rs46637219,3.79485e-14,True,6,6.826306e-12,True
6,12,rs259190588,2.368376e-09,True,7,7.554476e-12,True
7,13,rs27045535,7.479687e-09,True,8,1.398552e-11,True
8,9,rs26992391,6.492795e-11,True,9,1.714527e-11,True
9,4,rs49725879,3.16934e-12,True,10,3.01468e-11,True


In [18]:
# Stack the orig/syn merge on top of the low-association merge.
# NOTE(review): this rebinds `highlow`, which an earlier cell built from
# different inputs; later cells see this version.
# ignore_index=True renumbers the rows so the two inputs' 0-based labels do
# not collide in the result.
highlow = pd.concat([origsyn, ssl_c], ignore_index=True)
highlow

Unnamed: 0,index_synthetic,snp,p_synthetic,interest_synthetic,index_real,p_real,interest_real
0,11,rs29477109,1.792518e-10,True,1,5.052317e-14,True
1,2,rs27037823,1.361446e-12,True,2,2.66696e-12,True
2,5,rs29435046,8.595568e-12,True,3,3.149144e-12,True
3,3,rs27037798,1.415458e-12,True,4,3.203628e-12,True
4,10,rs27070712,9.308289e-11,True,5,5.49426e-12,True
5,11,rs46637219,3.79485e-14,True,6,6.826306e-12,True
6,12,rs259190588,2.368376e-09,True,7,7.554476e-12,True
7,13,rs27045535,7.479687e-09,True,8,1.398552e-11,True
8,9,rs26992391,6.492795e-11,True,9,1.714527e-11,True
9,4,rs49725879,3.16934e-12,True,10,3.01468e-11,True


In [21]:
# Score the current `highlow` frame: real flag as reference, synthetic flag as
# prediction. The F1 value is printed with two decimals here.
y_true = highlow['interest_real']
y_pred = highlow['interest_synthetic']

print(classification_report(y_true, y_pred))
print(f"f1_score is: {f1_score(y_true, y_pred):.02f}")

              precision    recall  f1-score   support

       False       0.89      1.00      0.94        16
        True       1.00      0.87      0.93        15

    accuracy                           0.94        31
   macro avg       0.94      0.93      0.93        31
weighted avg       0.94      0.94      0.94        31

f1_score is: 0.93
