## Retrieve data from the Aiven database

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.dialects.mysql import insert

In [2]:
# Connection settings are read from the environment (load_dotenv() first loads a
# local .env file, if present) so credentials never appear in the notebook.
load_dotenv()
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASS')
db_port = 10184
database = "defaultdb"

# Fail fast with a clear message: os.getenv returns None for an unset variable,
# which would otherwise be interpolated into the URL as the literal text "None"
# and only surface later as a confusing connection error.
missing_vars = [
    name
    for name, value in (('DB_HOST', db_host), ('DB_USER', db_user), ('DB_PASS', db_password))
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL delimiters.
# quote(..., safe='') is used rather than quote_plus so spaces become %20, not '+'.
host_url = (
    f"mysql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{database}"
)

engine = create_engine(host_url)

In [56]:
# Fetch the table listing and drop the `annotations` table from it.
all_tables = pd.read_sql('SHOW TABLES', engine)
all_tables = all_tables.loc[all_tables['Tables_in_defaultdb'].ne('annotations')]
data_tables = all_tables['Tables_in_defaultdb'].tolist()

# Partition the remaining names by the 'CT' / 'ZT' marker embedded in each one
# (presumably the DD and LD conditions, respectively -- confirm against the dataset).
DD_tables = [name for name in data_tables if 'CT' in name]
LD_tables = [name for name in data_tables if 'ZT' in name]
data_tables

['GSM4768020_CT02_20190528_AR05',
 'GSM4768021_CT02_20190528_AR06',
 'GSM4768022_CT02_20190528_AR07',
 'GSM4768023_CT02_20190528_AR08',
 'GSM4768024_CT02_20190702_AR13',
 'GSM4768025_CT02_20190702_AR14',
 'GSM4768026_CT02_20190702_AR15',
 'GSM4768027_CT02_20190702_AR16',
 'GSM4768028_CT06_20190710_AR01',
 'GSM4768029_CT06_20190710_AR02',
 'GSM4768030_CT06_20190710_AR03',
 'GSM4768031_CT06_20190710_AR04',
 'GSM4768032_CT06_20190719_AR17',
 'GSM4768033_CT06_20190719_AR18',
 'GSM4768034_CT06_20190814_AR19',
 'GSM4768035_CT06_20190814_AR20',
 'GSM4768036_CT10_20190524_AR03',
 'GSM4768037_CT10_20190524_AR04',
 'GSM4768038_CT10_20190610_AR01',
 'GSM4768039_CT10_20190610_AR02',
 'GSM4768040_CT10_20190704_AR04',
 'GSM4768041_CT10_20190704_AR05',
 'GSM4768042_CT10_20190704_AR06',
 'GSM4768043_CT10_20190704_AR07',
 'GSM4768044_CT14_20190001_AR01',
 'GSM4768045_CT14_20190001_AR02',
 'GSM4768046_CT14_20190001_AR03',
 'GSM4768047_CT14_20190001_AR04',
 'GSM4768048_CT14_20190702_AR17',
 'GSM4768049_C

In [77]:
# Genes whose rows are pulled from every table listed in DD_tables.
gene_list = ['amon', 'brp', 'cac', 'Cadps', 'Fife', 'Liprin-alpha', 'Pdf']

# Hand-quoted form of the gene list. It is no longer used to build the query
# below, but stays defined in case other cells still reference it.
gene_list_str = ', '.join(f"'{gene}'" for gene in gene_list)

# Bind the gene names as parameters instead of pasting them into the SQL text,
# so a symbol containing a quote (e.g. beta'COP) cannot break the statement.
gene_params = {f'gene_{i}': gene for i, gene in enumerate(gene_list)}
gene_placeholders = ', '.join(f':{key}' for key in gene_params)

dfs = []
for table in DD_tables:
    # Table names cannot be bound as parameters; they originate from SHOW TABLES
    # and are backtick-quoted so any valid MySQL identifier is accepted.
    query = text(f'SELECT * FROM `{table}` WHERE gene IN ({gene_placeholders})')
    new_df = pd.read_sql(query, engine, params=gene_params, index_col='gene')
    # Transpose so genes become the columns and the table's other columns become rows.
    dfs.append(new_df.T)

# Stack the per-table frames; pd.concat aligns the columns by gene name.
full_df = pd.concat(dfs)
full_df

gene,Cadps,Fife,Liprin-alpha,Pdf,amon,brp,cac
20190528_CLK856_DD_CT02_AR05_ACAGAC,6,6,4,59,35,8,20
20190528_CLK856_DD_CT02_AR05_ACAGGA,6,14,3,50,6,8,7
20190528_CLK856_DD_CT02_AR05_ACAGTG,1,0,0,3,1,0,1
20190528_CLK856_DD_CT02_AR05_ACCAAC,0,0,0,1,2,0,0
20190528_CLK856_DD_CT02_AR05_ACCAGA,0,0,0,4,0,0,0
...,...,...,...,...,...,...,...
20190704_CLK856_DD_CT22_AR14_TGGTGA,3,4,0,3,7,1,2
20190704_CLK856_DD_CT22_AR14_TGGTTG,0,0,0,0,0,0,0
20190704_CLK856_DD_CT22_AR14_TGTCAC,0,0,0,0,0,0,0
20190704_CLK856_DD_CT22_AR14_TGTCGA,0,0,0,0,0,0,0
