# Export CSV to Aiven database

In [39]:
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

In [2]:
# Read the connection settings from the environment (python-dotenv's
# load_dotenv() is called first) so that no credential is written in the
# notebook itself.
load_dotenv()
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASS')
db_port = 10184
database = "defaultdb"

# os.getenv() returns None for an unset variable; fail here with a clear
# message instead of building a URL that contains the literal text "None".
_missing_vars = [
    var_name
    for var_name, var_value in (
        ('DB_HOST', db_host),
        ('DB_USER', db_user),
        ('DB_PASS', db_password),
    )
    if var_value is None
]
if _missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_vars)}"
    )

# Percent-encode user name and password: characters such as '@', ':' or '/'
# would otherwise be parsed as URL delimiters and corrupt the connection URL.
# NOTE(review): this assumes the values in the environment are stored raw
# (not already percent-encoded) - confirm against the .env file.
host_url = (
    f"mysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database}"
)

engine = create_engine(host_url)

Now, read the CSV into a pandas DataFrame

In [9]:
# Load the annotation CSV; its first column becomes the DataFrame index.
annot_df = pd.read_csv(
    'neuron_annotations.csv',
    index_col=0,
)
annot_df

Unnamed: 0,experiment,Repeats,condition,date,time,Idents
20181215_CLK856_LD_ZT14_AR07_ACAGGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181215_CLK856_LD_ZT14_AR07_CTTCTG,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR02_GTACCA,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR02_TCCTTC,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR08_AGACAG,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
...,...,...,...,...,...,...
20190710_CLK856_DD_CT06_AR04_GTCTTC,CLK856_DD,DD_1,DD,2019-07-10,CT06,1:DN1p_CNMa
20190710_CLK856_DD_CT06_AR04_TCCTTC,CLK856_DD,DD_1,DD,2019-07-10,CT06,1:DN1p_CNMa
20190719_CLK856_DD_CT06_AR18_GTCTTC,CLK856_DD,DD_2,DD,2019-07-19,CT06,1:DN1p_CNMa
20190814_CLK856_DD_CT06_AR19_TCTGCA,CLK856_DD,DD_2,DD,2019-08-14,CT06,1:DN1p_CNMa


First, create the `annotations` table

In [20]:
# Schema of the `annotations` table: one row per entry of the annotation
# DataFrame, keyed by `single_cell`.
# IF NOT EXISTS makes this cell safe to re-run once the table has been created;
# a plain CREATE TABLE would raise on every subsequent execution.
# NOTE: double-quoted identifiers are only accepted by MySQL when the server's
# sql_mode includes ANSI_QUOTES; otherwise they are parsed as string literals.
ddl = """CREATE TABLE IF NOT EXISTS "annotations" (
        "single_cell" VARCHAR(255) PRIMARY KEY,
        "experiment" VARCHAR(255),
        "Repeats" VARCHAR(255),
        "condition" VARCHAR(255),
        "date" DATE,
        "time" VARCHAR(255),
        "Idents" VARCHAR(255)
        )"""
print(ddl)

# engine.begin() opens a transaction that is committed when the block exits
# normally and rolled back if the statement raises.
with engine.begin() as connection:
    connection.execute(text(ddl))

CREATE TABLE "annotations" (
        "single_cell" VARCHAR(255) PRIMARY KEY,
        "experiment" VARCHAR(255),
        "Repeats" VARCHAR(255),
        "condition" VARCHAR(255),
        "date" DATE,
        "time" VARCHAR(255),
        "Idents" VARCHAR(255)
        )


In [21]:
# Bulk load of `annot_df` into the `annotations` table, writing the DataFrame
# index as the `single_cell` column and appending to the existing table.
# NOTE(review): the call is left commented out - presumably so that re-running
# the notebook does not try to append the same rows a second time; confirm
# before re-enabling.
# annot_df.to_sql(name='annotations',
#                 con=engine,
#                 index=True,
#                 if_exists='append',
#                 index_label='single_cell')

2615

In [3]:
# Read the whole `annotations` table back, using `single_cell` as the index.
df_result = pd.read_sql(
    "SELECT * FROM annotations",
    con=engine,
    index_col='single_cell',
)
df_result

Unnamed: 0_level_0,experiment,Repeats,condition,date,time,Idents
single_cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20181215_CLK856_LD_ZT14_AR07_ACAGAC,CLK856_LD,LD_2,LD,2018-12-15,ZT14,1:DN1p_CNMa
20181215_CLK856_LD_ZT14_AR07_ACAGGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181215_CLK856_LD_ZT14_AR07_ACCAAC,CLK856_LD,LD_2,LD,2018-12-15,ZT14,15:DN1p_CNMa
20181215_CLK856_LD_ZT14_AR07_ACCAGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,8:LN_ITP
20181215_CLK856_LD_ZT14_AR07_ACCATG,CLK856_LD,LD_2,LD,2018-12-15,ZT14,3:DN1a
...,...,...,...,...,...,...
20190814_CLK856_DD_CT06_AR20_TCACCA,CLK856_DD,DD_2,DD,2019-08-14,CT06,7:DN1p
20190814_CLK856_DD_CT06_AR20_TCCTTC,CLK856_DD,DD_2,DD,2019-08-14,CT06,9:LNd_NPF
20190814_CLK856_DD_CT06_AR20_TGAGAC,CLK856_DD,DD_2,DD,2019-08-14,CT06,3:DN1a
20190814_CLK856_DD_CT06_AR20_TGAGGA,CLK856_DD,DD_2,DD,2019-08-14,CT06,14:DN3


In [4]:
# List the tables of the connected database with a raw SQL statement.
query = "SHOW TABLES"
tables_df = pd.read_sql(sql=query, con=engine)
tables_df

Unnamed: 0,Tables_in_defaultdb
0,annotations
1,mytest


In [5]:
# Column definitions of the `annotations` table as reported by the server.
query = "DESCRIBE annotations"
columns_df = pd.read_sql(sql=query, con=engine)
columns_df

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,single_cell,varchar(255),NO,PRI,,
1,experiment,varchar(255),YES,,,
2,Repeats,varchar(255),YES,,,
3,condition,varchar(255),YES,,,
4,date,date,YES,,,
5,time,varchar(255),YES,,,
6,Idents,varchar(255),YES,,,


## Load experimental gene expression data

First, gather data from all files

In [28]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without line terminators."""
    with open(path, 'r') as handle:
        return handle.read().splitlines()


# One entry per line in each of the two gene list files.
genes_DD = _read_lines('DD_genes.txt')
genes_LD = _read_lines('LD_genes.txt')

print(len(genes_LD))
print(len(genes_DD))

15743
15650


In [44]:
DATA_PATH = r'../dataset/'


def mapper(x):
    """Drop the first character of a column label.

    Removes the leading 'x' char in idx strings.
    """
    return x[1:]


list_DD = []
list_LD = []
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated frames reproducible from run to run.
file_names = sorted(os.listdir(DATA_PATH))

for filename in file_names:
    print(f"filename: {filename}")
    # keep_default_na=False stops pandas from turning strings such as "NA" or
    # "nan" into missing values.
    # NOTE(review): presumably needed because a label in the first column can
    # read like a NA marker - confirm.
    new_df = pd.read_csv(os.path.join(DATA_PATH, filename),
                         index_col=0,
                         keep_default_na=False)
    # Strip the prefix char from the column labels, then transpose so that the
    # former columns become rows.
    new_df = new_df.rename(mapper, axis='columns').T
    # Route each file by the marker in its name: 'ZT' -> LD list, 'CT' -> DD list.
    if 'ZT' in filename:
        list_LD.append(new_df)
    elif 'CT' in filename:
        list_DD.append(new_df)
    else:
        raise ValueError(f"File {filename} has no ZT or CT data")

df_LD = pd.concat(list_LD)
df_DD = pd.concat(list_DD)

filename: GSM4768020_CT02_20190528_AR05.csv
filename: GSM4768021_CT02_20190528_AR06.csv
filename: GSM4768022_CT02_20190528_AR07.csv
filename: GSM4768023_CT02_20190528_AR08.csv
filename: GSM4768024_CT02_20190702_AR13.csv
filename: GSM4768025_CT02_20190702_AR14.csv
filename: GSM4768026_CT02_20190702_AR15.csv
filename: GSM4768027_CT02_20190702_AR16.csv
filename: GSM4768028_CT06_20190710_AR01.csv
filename: GSM4768029_CT06_20190710_AR02.csv
filename: GSM4768030_CT06_20190710_AR03.csv
filename: GSM4768031_CT06_20190710_AR04.csv
filename: GSM4768032_CT06_20190719_AR17.csv
filename: GSM4768033_CT06_20190719_AR18.csv
filename: GSM4768034_CT06_20190814_AR19.csv
filename: GSM4768035_CT06_20190814_AR20.csv
filename: GSM4768036_CT10_20190524_AR03.csv
filename: GSM4768037_CT10_20190524_AR04.csv
filename: GSM4768038_CT10_20190610_AR01.csv
filename: GSM4768039_CT10_20190610_AR02.csv
filename: GSM4768040_CT10_20190704_AR04.csv
filename: GSM4768041_CT10_20190704_AR05.csv
filename: GSM4768042_CT10_201907

In [45]:
# Sanity checks: column counts must match the gene lists, and the two frames
# together must hold the expected total number of rows.
expected_rows = 8060
print(df_LD.shape)
n_rows_LD, n_cols_LD = df_LD.shape
n_rows_DD, n_cols_DD = df_DD.shape
assert n_cols_LD == len(genes_LD)
assert n_cols_DD == len(genes_DD)
assert n_rows_LD + n_rows_DD == expected_rows
print('data size tests passed')

(3453, 15743)
data size tests passed


In [46]:
def _first_unknown(columns, known):
    """Return the first label in ``columns`` that is absent from ``known``.

    Returns ``None`` when every label is present. ``known`` is converted to a
    set once so each lookup is O(1) instead of a scan of the whole list.
    """
    known_set = set(known)
    for label in columns:
        if label not in known_set:
            return label
    return None


# Report only the first column missing from the LD gene list, if any.
unknown_LD = _first_unknown(df_LD.columns, genes_LD)
if unknown_LD is not None:
    print(f"{unknown_LD} not in zt_genes")
else:
    print('all genes found for LD')

# Same check for the DD frame against the DD gene list.
unknown_DD = _first_unknown(df_DD.columns, genes_DD)
if unknown_DD is not None:
    print(f"{unknown_DD} not in ct_genes")
else:
    print('all genes found for DD')

all genes found for LD
all genes found for DD


Now, try to write the expression data to database tables

In [41]:
# Try to store the DD frame as a single table, one SQL column per DataFrame
# column. The failure is the point of this cell, so the exception is caught and
# only the first line of its message is shown.
try:
    df_DD.to_sql(
        name='DD_experiments',
        con=engine,
        index=True,
        index_label='single_cell',
        if_exists='replace',
    )
except Exception as err:
    first_line, _, _ = str(err).partition('\n')
    print(first_line)

(MySQLdb.OperationalError) (1117, 'Too many columns')


This way of representing the data didn't work: the frame has 15,650 gene columns, far more than MySQL's hard limit of 4096 columns per table. Maybe transposing will do the trick

In [43]:
# Transposed copy of the DD frame: the former column labels (genes) become the
# index and the former rows become columns.
df_DD_T = df_DD.T.copy()
# The failure is the point of this cell, so the exception is caught and only
# the first line of its message is shown.
try:
    df_DD_T.to_sql(name='DD_experiments',
                   con=engine,
                   index=True,
                   if_exists='replace',
                   # After the transpose the index holds gene names, so the
                   # index column must not be labelled 'single_cell'.
                   index_label='gene')
except Exception as e:
    print(str(e).split('\n')[0])

(MySQLdb.OperationalError) (1117, 'Too many columns')


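This didn't work either: the transposed frame still has one column per row of the original frame (8060 - 3453 = 4607), which is above MySQL's limit of 4096 columns per table. We need to break the data down into smaller tables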

'GSM4768020_CT02_20190528_AR05.csv'