# Export csv to aiven database

In [39]:
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import os
import pandas as pd

In [2]:
load_dotenv()
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASS')
db_port = 10184
database = "defaultdb"
host_url = rf"mysql://{db_user}:{db_password}@{db_host}:{db_port}/{database}"

engine = create_engine(host_url)

Now, read csv to pandas dataframe

In [9]:
annot_df = pd.read_csv('neuron_annotations.csv', index_col=0)
annot_df

Unnamed: 0,experiment,Repeats,condition,date,time,Idents
20181215_CLK856_LD_ZT14_AR07_ACAGGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181215_CLK856_LD_ZT14_AR07_CTTCTG,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR02_GTACCA,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR02_TCCTTC,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
20181231_CLK856_LD_ZT14_AR08_AGACAG,CLK856_LD,LD_1,LD,2018-12-31,ZT14,29:LPN
...,...,...,...,...,...,...
20190710_CLK856_DD_CT06_AR04_GTCTTC,CLK856_DD,DD_1,DD,2019-07-10,CT06,1:DN1p_CNMa
20190710_CLK856_DD_CT06_AR04_TCCTTC,CLK856_DD,DD_1,DD,2019-07-10,CT06,1:DN1p_CNMa
20190719_CLK856_DD_CT06_AR18_GTCTTC,CLK856_DD,DD_2,DD,2019-07-19,CT06,1:DN1p_CNMa
20190814_CLK856_DD_CT06_AR19_TCTGCA,CLK856_DD,DD_2,DD,2019-08-14,CT06,1:DN1p_CNMa


First, create table

In [20]:
ddl = """CREATE TABLE "annotations" (
        "single_cell" VARCHAR(255) PRIMARY KEY,
        "experiment" VARCHAR(255),
        "Repeats" VARCHAR(255),
        "condition" VARCHAR(255),
        "date" DATE,
        "time" VARCHAR(255),
        "Idents" VARCHAR(255)
        )"""  
print(ddl)

with engine.connect() as connection:
    connection.execute(text(ddl))

CREATE TABLE "annotations" (
        "single_cell" VARCHAR(255) PRIMARY KEY,
        "experiment" VARCHAR(255),
        "Repeats" VARCHAR(255),
        "condition" VARCHAR(255),
        "date" DATE,
        "time" VARCHAR(255),
        "Idents" VARCHAR(255)
        )


In [21]:
# annot_df.to_sql(name='annotations',
#                 con=engine,
#                 index=True,
#                 if_exists='append',
#                 index_label='single_cell')

2615

In [3]:
df_result = pd.read_sql("SELECT * FROM annotations", con=engine, index_col='single_cell')
df_result

Unnamed: 0_level_0,experiment,Repeats,condition,date,time,Idents
single_cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20181215_CLK856_LD_ZT14_AR07_ACAGAC,CLK856_LD,LD_2,LD,2018-12-15,ZT14,1:DN1p_CNMa
20181215_CLK856_LD_ZT14_AR07_ACAGGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,29:LPN
20181215_CLK856_LD_ZT14_AR07_ACCAAC,CLK856_LD,LD_2,LD,2018-12-15,ZT14,15:DN1p_CNMa
20181215_CLK856_LD_ZT14_AR07_ACCAGA,CLK856_LD,LD_2,LD,2018-12-15,ZT14,8:LN_ITP
20181215_CLK856_LD_ZT14_AR07_ACCATG,CLK856_LD,LD_2,LD,2018-12-15,ZT14,3:DN1a
...,...,...,...,...,...,...
20190814_CLK856_DD_CT06_AR20_TCACCA,CLK856_DD,DD_2,DD,2019-08-14,CT06,7:DN1p
20190814_CLK856_DD_CT06_AR20_TCCTTC,CLK856_DD,DD_2,DD,2019-08-14,CT06,9:LNd_NPF
20190814_CLK856_DD_CT06_AR20_TGAGAC,CLK856_DD,DD_2,DD,2019-08-14,CT06,3:DN1a
20190814_CLK856_DD_CT06_AR20_TGAGGA,CLK856_DD,DD_2,DD,2019-08-14,CT06,14:DN3


In [4]:
# Use the raw SQL query to get all tables
query = "SHOW TABLES"
tables_df = pd.read_sql(query, con=engine)
tables_df

Unnamed: 0,Tables_in_defaultdb
0,annotations
1,mytest


In [5]:
query = f"DESCRIBE annotations"
columns_df = pd.read_sql(query, con=engine)
columns_df

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,single_cell,varchar(255),NO,PRI,,
1,experiment,varchar(255),YES,,,
2,Repeats,varchar(255),YES,,,
3,condition,varchar(255),YES,,,
4,date,date,YES,,,
5,time,varchar(255),YES,,,
6,Idents,varchar(255),YES,,,


## Load experimental gene expression data

First, gather data from all files

In [28]:
with open('DD_genes.txt', 'r') as f:
    genes_DD = f.read().splitlines()

with open('LD_genes.txt', 'r') as f:
    genes_LD = f.read().splitlines()

print(len(genes_LD))
print(len(genes_DD))

15743
15650


In [67]:
DATA_PATH = r'../dataset/'
mapper = lambda x : x[1:] # removes leading 'x' char in idx strings
list_DD = []
list_LD = []
file_names = os.listdir(DATA_PATH)

for filename in file_names:
    print(f"filename: {filename}")
    table_name = filename.split('.')[0]
    new_df = pd.read_csv(f"{DATA_PATH}/{filename}",  
                         # na_values=["", "NaN", "NULL"], 
                         keep_default_na=False)
    new_df = new_df.rename(mapper, axis='columns')
    new_df = new_df.rename({'nnamed: 0': 'gene'}, axis='columns')

    # make table
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        gene VARCHAR(255) PRIMARY KEY,
        {'\n'.join(ddl.split('\n')[2:])};
    """
    print(create_table_query)

    with engine.connect() as connection:
        connection.execute(text(create_table_query))

    new_df.to_sql(name=table_name,
                  con=engine,
                  index=False,
                  if_exists='append')
    break
    
# df_LD = pd.concat(list_LD)
# df_DD = pd.concat(list_DD)

filename: GSM4768020_CT02_20190528_AR05.csv

    CREATE TABLE IF NOT EXISTS GSM4768020_CT02_20190528_AR05 (
        gene VARCHAR(255) PRIMARY KEY,
          "20190528_CLK856_DD_CT02_AR05_ACAGAC" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACAGGA" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACAGTG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACCAAC" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACCAGA" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACCATG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACGTAC" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACGTGA" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACGTTG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACTCAC" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACTCGA" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_ACTCTG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_AGACAG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_AGACCA" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_AGACTC" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_AGCTAG" INTEGER,
  "20190528_CLK856_DD_CT02_AR05_AGCTCA" INTEGER,
  "20190528_

IntegrityError: (MySQLdb.IntegrityError) (1062, "Duplicate entry 'Nos' for key 'GSM4768020_CT02_20190528_AR05.PRIMARY'")
[SQL: INSERT INTO "GSM4768020_CT02_20190528_AR05" (gene, "20190528_CLK856_DD_CT02_AR05_ACAGAC", "20190528_CLK856_DD_CT02_AR05_ACAGGA", "20190528_CLK856_DD_CT02_AR05_ACAGTG", "20190528_CLK856_DD_CT02_AR05_ACCAAC", "20190528_CLK856_DD_CT02_AR05_ACCAGA", "20190528_CLK856_DD_CT02_AR05_ACCATG", "20190528_CLK856_DD_CT02_AR05_ACGTAC", "20190528_CLK856_DD_CT02_AR05_ACGTGA", "20190528_CLK856_DD_CT02_AR05_ACGTTG", "20190528_CLK856_DD_CT02_AR05_ACTCAC", "20190528_CLK856_DD_CT02_AR05_ACTCGA", "20190528_CLK856_DD_CT02_AR05_ACTCTG", "20190528_CLK856_DD_CT02_AR05_AGACAG", "20190528_CLK856_DD_CT02_AR05_AGACCA", "20190528_CLK856_DD_CT02_AR05_AGACTC", "20190528_CLK856_DD_CT02_AR05_AGCTAG", "20190528_CLK856_DD_CT02_AR05_AGCTCA", "20190528_CLK856_DD_CT02_AR05_AGCTTC", "20190528_CLK856_DD_CT02_AR05_AGGAAG", "20190528_CLK856_DD_CT02_AR05_AGGACA", "20190528_CLK856_DD_CT02_AR05_AGGATC", "20190528_CLK856_DD_CT02_AR05_AGTGAG", "20190528_CLK856_DD_CT02_AR05_AGTGCA", "20190528_CLK856_DD_CT02_AR05_AGTGTC", "20190528_CLK856_DD_CT02_AR05_CAACAG", "20190528_CLK856_DD_CT02_AR05_CAACCA", "20190528_CLK856_DD_CT02_AR05_CAACTC", "20190528_CLK856_DD_CT02_AR05_CACTAG", "20190528_CLK856_DD_CT02_AR05_CACTCA", "20190528_CLK856_DD_CT02_AR05_CACTTC", "20190528_CLK856_DD_CT02_AR05_CAGAAG", "20190528_CLK856_DD_CT02_AR05_CAGACA", "20190528_CLK856_DD_CT02_AR05_CAGATC", "20190528_CLK856_DD_CT02_AR05_CATGAG", "20190528_CLK856_DD_CT02_AR05_CATGCA", "20190528_CLK856_DD_CT02_AR05_CATGTC", "20190528_CLK856_DD_CT02_AR05_CTAGAC", "20190528_CLK856_DD_CT02_AR05_CTAGGA", "20190528_CLK856_DD_CT02_AR05_CTAGTG", "20190528_CLK856_DD_CT02_AR05_CTCAAC", "20190528_CLK856_DD_CT02_AR05_CTCAGA", "20190528_CLK856_DD_CT02_AR05_CTCATG", "20190528_CLK856_DD_CT02_AR05_CTGTAC", "20190528_CLK856_DD_CT02_AR05_CTGTGA", "20190528_CLK856_DD_CT02_AR05_CTGTTG", "20190528_CLK856_DD_CT02_AR05_CTTCAC", "20190528_CLK856_DD_CT02_AR05_CTTCGA", "20190528_CLK856_DD_CT02_AR05_CTTCTG", "20190528_CLK856_DD_CT02_AR05_GAAGAC", "20190528_CLK856_DD_CT02_AR05_GAAGGA", "20190528_CLK856_DD_CT02_AR05_GAAGTG", "20190528_CLK856_DD_CT02_AR05_GACAAC", "20190528_CLK856_DD_CT02_AR05_GACAGA", "20190528_CLK856_DD_CT02_AR05_GACATG", "20190528_CLK856_DD_CT02_AR05_GAGTAC", "20190528_CLK856_DD_CT02_AR05_GAGTGA", "20190528_CLK856_DD_CT02_AR05_GAGTTG", "20190528_CLK856_DD_CT02_AR05_GATCAC", "20190528_CLK856_DD_CT02_AR05_GATCGA", "20190528_CLK856_DD_CT02_AR05_GATCTG", "20190528_CLK856_DD_CT02_AR05_GTACAG", "20190528_CLK856_DD_CT02_AR05_GTACCA", "20190528_CLK856_DD_CT02_AR05_GTACTC", "20190528_CLK856_DD_CT02_AR05_GTCTAG", "20190528_CLK856_DD_CT02_AR05_GTCTCA", "20190528_CLK856_DD_CT02_AR05_GTCTTC", "20190528_CLK856_DD_CT02_AR05_GTGAAG", "20190528_CLK856_DD_CT02_AR05_GTGACA", "20190528_CLK856_DD_CT02_AR05_GTGATC", "20190528_CLK856_DD_CT02_AR05_GTTGAG", "20190528_CLK856_DD_CT02_AR05_GTTGCA", "20190528_CLK856_DD_CT02_AR05_GTTGTC", "20190528_CLK856_DD_CT02_AR05_TCACAG", "20190528_CLK856_DD_CT02_AR05_TCACCA", "20190528_CLK856_DD_CT02_AR05_TCACTC", "20190528_CLK856_DD_CT02_AR05_TCCTAG", "20190528_CLK856_DD_CT02_AR05_TCCTCA", "20190528_CLK856_DD_CT02_AR05_TCCTTC", "20190528_CLK856_DD_CT02_AR05_TCGAAG", "20190528_CLK856_DD_CT02_AR05_TCGACA", "20190528_CLK856_DD_CT02_AR05_TCGATC", "20190528_CLK856_DD_CT02_AR05_TCTGAG", "20190528_CLK856_DD_CT02_AR05_TCTGCA", "20190528_CLK856_DD_CT02_AR05_TCTGTC", "20190528_CLK856_DD_CT02_AR05_TGAGAC", "20190528_CLK856_DD_CT02_AR05_TGAGGA", "20190528_CLK856_DD_CT02_AR05_TGAGTG", "20190528_CLK856_DD_CT02_AR05_TGCAAC", "20190528_CLK856_DD_CT02_AR05_TGCAGA", "20190528_CLK856_DD_CT02_AR05_TGCATG", "20190528_CLK856_DD_CT02_AR05_TGGTAC", "20190528_CLK856_DD_CT02_AR05_TGGTGA", "20190528_CLK856_DD_CT02_AR05_TGGTTG", "20190528_CLK856_DD_CT02_AR05_TGTCAC", "20190528_CLK856_DD_CT02_AR05_TGTCGA", "20190528_CLK856_DD_CT02_AR05_TGTCTG") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)]
[parameters: [('EGFP', 17, 0, 1, 0, 0, 2, 0, 0, 0, 5, 10, 0, 2, 36, 0, 3, 0, 0, 20, 0, 0, 0, 0, 35, 0, 0, 10, 0, 178, 0, 0, 0, 0, 1, 30, 0, 1, 0, 2, 2, 1, 0, 0, 0, 0, 0, 1, 1, 4, 0, 13, 33, 9, 0, 7, 1, 14, 0, 0, 0, 0, 0, 14, 2, 0, 144, 2, 1, 0, 0, 2, 0, 1, 0, 3, 151, 0, 1, 0, 0, 0, 0, 0, 6, 1, 3, 16, 0, 1, 0, 0, 0, 3, 21, 1, 32), ('ERCC-00002', 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), ('ERCC-00004', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1), ('ERCC-00022', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ('ERCC-00044', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ('ERCC-00046', 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ('ERCC-00060', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ('ERCC-00074', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)  ... displaying 10 of 15650 total bound parameter sets ...  ('CR45929', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ('CR46186', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [59]:
pd.read_sql("SHOW TABLES", con=engine)

Unnamed: 0,Tables_in_defaultdb
0,GSM4768020_CT02_20190528_AR05
1,GSM4768021_CT02_20190528_AR06
2,GSM4768022_CT02_20190528_AR07
3,GSM4768023_CT02_20190528_AR08
4,GSM4768024_CT02_20190702_AR13
5,GSM4768025_CT02_20190702_AR14
6,GSM4768026_CT02_20190702_AR15
7,GSM4768027_CT02_20190702_AR16
8,GSM4768028_CT06_20190710_AR01
9,GSM4768029_CT06_20190710_AR02
