# This notebook handles data preparation for the MolData dataset, available at:
https://github.com/LumosBio/MolData/blob/main/Data/all_molecular_data.zip


# Data ingestion


In [None]:
# Basic imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create the SparkSession used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("JoinWindowingSQL")
    .getOrCreate()
)

In [3]:
# Directory holding the mapping CSVs, relative to this notebook (trailing slash required).
data_dir = '../Datasets/'
# AID -> disease-category flags
mol_data1 = f'{data_dir}aid_disease_mapping.csv'
# AID -> target-category flags
mol_data2 = f'{data_dir}aid_target_mapping.csv'

In [5]:
! head $mol_data1

AID,cancer,nervous System,immune system,cardiovascular,toxicity,obesity,virus,diabetes,metabolic disorders,bacteria,parasite,epigenetics_genetics,pulmonary,infection,aging,fungal
activity_1554,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
activity_2732,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0
activity_1085,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
activity_1236,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
activity_1274,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
activity_781,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
activity_422,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
activity_1224905,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
activity_624256,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Both files share the same layout: comma-separated, a header row, and column
# types inferred by Spark from the data.
csv_read_opts = {'header': True, 'sep': ',', 'inferSchema': True}
df_mol1 = spark.read.csv(mol_data1, **csv_read_opts)
df_mol2 = spark.read.csv(mol_data2, **csv_read_opts)

In [8]:
# Preview the first 10 rows of each input frame, disease mapping first.
for preview_df in (df_mol1, df_mol2):
    preview_df.show(10)

+----------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+
|             AID|cancer|nervous System|immune system|cardiovascular|toxicity|obesity|virus|diabetes|metabolic disorders|bacteria|parasite|epigenetics_genetics|pulmonary|infection|aging|fungal|
+----------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+
|   activity_1554|     1|             0|            0|             0|       0|      0|    0|       0|                  0|       0|       0|                   0|        0|        0|    0|     0|
|   activity_2732|     0|             1|            1|             0|       0|      1|    0|       1|                  0|       0|       0|                   0|        0|        0|    0|     0|
|   activity_1085|     1|     

In [16]:
# Combine the disease and target mappings on the shared AID key.
# An inner join keeps only the AIDs that appear in both files.
df_mol = df_mol1.join(other=df_mol2, on=['AID'], how='inner')

In [17]:
# Preview the first 15 rows of the joined frame.
df_mol.show(n=15)

+----------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+-----------------+--------------+----------------+---------+--------+--------------------+------+--------------------+-----------+-----------+--------------+-----------+------+-----------+
|             AID|cancer|nervous System|immune system|cardiovascular|toxicity|obesity|virus|diabetes|metabolic disorders|bacteria|parasite|epigenetics_genetics|pulmonary|infection|aging|fungal|Membrane receptor|Enzyme (other)|Nuclear receptor|Hydrolase|Protease|Transcription factor|Kinase|Epigenetic regulator|Ion channel|Transferase|Oxidoreductase|Transporter|NTPase|Phosphatase|
+----------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+-----------------+--------------+----------

In [26]:
# Derive a numeric ID from AID (e.g. "activity_1554" -> 1554) and sort by it.
# The pattern is anchored with ^ so that only a leading "activity_" prefix is
# stripped; an unanchored pattern would also remove the token mid-string.
# Any value that is not purely numeric after stripping does not cast cleanly
# (null with ANSI mode off, an error with ANSI mode on).
df_mol = (
    df_mol
    .withColumn("ID", F.regexp_replace("AID", "^activity_", "").cast("int"))
    .orderBy("ID")
)


In [27]:
# Preview the first 15 rows, now sorted by the numeric ID column.
df_mol.show(n=15)

+------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+-----------------+--------------+----------------+---------+--------+--------------------+------+--------------------+-----------+-----------+--------------+-----------+------+-----------+---+
|         AID|cancer|nervous System|immune system|cardiovascular|toxicity|obesity|virus|diabetes|metabolic disorders|bacteria|parasite|epigenetics_genetics|pulmonary|infection|aging|fungal|Membrane receptor|Enzyme (other)|Nuclear receptor|Hydrolase|Protease|Transcription factor|Kinase|Epigenetic regulator|Ion channel|Transferase|Oxidoreductase|Transporter|NTPase|Phosphatase| ID|
+------------+------+--------------+-------------+--------------+--------+-------+-----+--------+-------------------+--------+--------+--------------------+---------+---------+-----+------+-----------------+--------------+--------------

In [28]:
# Report the size of the joined frame, then its column names and inferred types.
row_count = df_mol.count()
print(f'df_mol - number of rows: {row_count}')
df_mol.printSchema()

df_mol - number of rows: 600
root
 |-- AID: string (nullable = true)
 |-- cancer: integer (nullable = true)
 |-- nervous System: integer (nullable = true)
 |-- immune system: integer (nullable = true)
 |-- cardiovascular: integer (nullable = true)
 |-- toxicity: integer (nullable = true)
 |-- obesity: integer (nullable = true)
 |-- virus: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- metabolic disorders: integer (nullable = true)
 |-- bacteria: integer (nullable = true)
 |-- parasite: integer (nullable = true)
 |-- epigenetics_genetics: integer (nullable = true)
 |-- pulmonary: integer (nullable = true)
 |-- infection: integer (nullable = true)
 |-- aging: integer (nullable = true)
 |-- fungal: integer (nullable = true)
 |-- Membrane receptor: integer (nullable = true)
 |-- Enzyme (other): integer (nullable = true)
 |-- Nuclear receptor: integer (nullable = true)
 |-- Hydrolase: integer (nullable = true)
 |-- Protease: integer (nullable = true)
 |-- Transcript

In [30]:
# Duplicate check: compare the row count before and after removing fully
# identical rows. Nothing is assigned back, so df_mol itself is unchanged.
total_rows = df_mol.count()
distinct_rows = df_mol.dropDuplicates().count()
print(f'df - number of rows is {total_rows}; after dropDuplicates() applied would be {distinct_rows}.')

df - number of rows is 600; after dropDuplicates() applied would be 600.


In [31]:
# Null check: count the rows that would survive dropping every row containing
# at least one null. Nothing is assigned back, so df_mol itself is unchanged.
complete_rows = df_mol.dropna(how='any').count()
print(f"df - number of rows after dropna(how='any') applied would be {complete_rows}.")

df - number of rows after dropna(how='any') applied would be 600.
