In [None]:
import numpy as np
import pandas as pd
import glob
import sqlite3
import re
import os
import resource

In [None]:
def read_fitness(csv_path,positive_string="positive",negative_string="negative"):
    """Read one mean-fitness CSV and rescale it against its control PPIs.

    The file is reduced to the columns ``PPI``, ``Mean_fitness``, ``sd`` and
    ``Positive``. Rows whose ``PPI`` matches ``positive_string`` /
    ``negative_string`` are treated as the positive / negative controls, and
    every ``Mean_fitness`` is rescaled so that the negative-control mean maps
    to 0 and the positive-control mean maps to 1.

    Parameters
    ----------
    csv_path : str or path-like
        Path of the CSV to read. A trailing ``_mean_fitness_positive.csv`` is
        stripped from the file name to obtain the experiment label.
    positive_string : str, default "positive"
        Pattern (interpreted by ``Series.str.contains``, i.e. as a regular
        expression) identifying positive-control rows in ``PPI``.
    negative_string : str, default "negative"
        Pattern identifying negative-control rows in ``PPI``.

    Returns
    -------
    pandas.DataFrame
        The four input columns followed by ``Normalized_Fitness``, ``ORF1``
        and ``ORF2`` (``PPI`` split at its first underscore; ``ORF2`` is
        missing when there is no underscore) and ``Experiment``.

    Raises
    ------
    KeyError
        If any of the four required columns is absent from the CSV.
    """
    required_columns = ["PPI","Mean_fitness","sd","Positive"]

    # Escape the dot and anchor at the end so that only a literal, trailing
    # "_mean_fitness_positive.csv" is removed from the file name.
    base_name = re.sub(r"_mean_fitness_positive\.csv$","",os.path.basename(csv_path))
    print("Beginning to read and process fitnesses of ",base_name)
    raw_fitness = pd.read_csv(csv_path)[required_columns]
    print("Shape {}".format(raw_fitness.shape))

    # na=False keeps the masks strictly boolean even if a PPI cell is missing;
    # a NaN in the mask would make the .loc selection below fail.
    is_positive_control = raw_fitness["PPI"].str.contains(positive_string,na=False)
    is_negative_control = raw_fitness["PPI"].str.contains(negative_string,na=False)

    # Select the column by label and take a scalar mean directly, rather than
    # indexing a one-element Series by integer position.
    positive_mean_fitness = raw_fitness.loc[is_positive_control,"Mean_fitness"].mean()
    negative_mean_fitness = raw_fitness.loc[is_negative_control,"Mean_fitness"].mean()
    print("Calculate negative fitness of",negative_mean_fitness,\
        "and positive of",positive_mean_fitness)
    print("Mean mean fitness before is",raw_fitness["Mean_fitness"].mean())

    # Vectorized rescaling: negative-control mean -> 0, positive-control mean -> 1.
    control_span = positive_mean_fitness - negative_mean_fitness
    normalized_fitness = (raw_fitness["Mean_fitness"] - negative_mean_fitness) / control_span
    print("Mean mean fitness after is",normalized_fitness.mean())

    # Split on the first underscore only. expand=True yields one column per
    # piece; reindex guarantees both columns exist even when no PPI contains
    # an underscore (or the frame is empty).
    orf_parts = raw_fitness["PPI"].str.split("_",n=1,expand=True).reindex(columns=[0,1])

    # assign() builds a new frame instead of mutating the column selection.
    return raw_fitness.assign(
        Normalized_Fitness=normalized_fitness,
        ORF1=orf_parts[0],
        ORF2=orf_parts[1],
        Experiment=base_name,
    )

In [None]:
# Connect to the on-disk SQLite file that every later cell reads and writes
# through the `db_fitness` handle.
print("Opening a SQLite database at `fitness.sqlite` ")
db_fitness = sqlite3.connect(database="fitness.sqlite")

In [None]:
# Load every matching CSV in the working directory into the `fitness` table.
# Each file is written inside its own `with db_fitness:` transaction block.
# sorted(): glob.glob() returns matches in arbitrary order, so sorting keeps
# the row insertion order (and the LIMIT-based previews) reproducible.
# if_exists="append" adds to an existing table, so running this cell again
# against the same database file inserts the same rows a second time.
for this_csv in sorted(glob.glob('*positive.csv')):
    with db_fitness:
        read_fitness(this_csv,positive_string="positive",negative_string="negative")\
            .to_sql("fitness",db_fitness,if_exists="append")

In [None]:
# Sanity check: read the first five rows of `fitness` back from the database.
print("Database on disk reports these heads for fitness:")
print(pd.read_sql_query(sql="SELECT * FROM fitness LIMIT 5",con=db_fitness))

In [None]:
# Build the `n_positive` table from `fitness`:
#   1. innermost SELECT: keep rows whose ORF1 and ORF2 both match LIKE 'Y%'
#      and whose Experiment is neither 'SD2' nor 'SD';
#   2. GROUP BY PPI and sum the Positive column into `npos`;
#   3. keep only PPIs with npos > 0;
#   4. outermost SELECT DISTINCT: emit PPI and npos, and recompute ORF1/ORF2
#      from PPI as the text before / after its first underscore (the ORF1 and
#      ORF2 columns carried up from the inner queries are not selected).
# NOTE(review): there is no IF NOT EXISTS clause, so re-running this cell on a
# database that already holds `n_positive` is expected to fail — confirm that
# a fresh `fitness.sqlite` is intended for each run.
db_fitness.execute(" \
        CREATE TABLE n_positive AS \
        SELECT DISTINCT PPI,npos,\
                        SUBSTR(PPI,1,INSTR(PPI,'_')-1) AS ORF1,\
                        SUBSTR(PPI,INSTR(PPI,'_')+1,LENGTH(PPI)) AS ORF2 \
                        FROM \
            (SELECT PPI,ORF1,ORF2,npos FROM \
                (SELECT PPI,ORF1,ORF2,SUM(Positive) AS npos FROM \
                    ( SELECT PPI,ORF1,ORF2,Positive FROM \
                        fitness \
                        WHERE ORF1 like 'Y%' \
                        AND   ORF2 like 'Y%' \
                        AND   Experiment != 'SD2' \
                        AND   Experiment != 'SD' \
                        ) \
                    GROUP BY PPI \
                    ) \
                WHERE npos > 0 \
                ) \
    ")

In [None]:
# Sanity check: read the first five rows of `n_positive` back from the database.
print("Database on disk reports these heads for n_positive:")
print(pd.read_sql_query(sql="SELECT * FROM n_positive LIMIT 5",con=db_fitness))

In [None]:
# Build the `yorf_list` table: a single column `YORF` holding every value that
# appears in either ORF1 or ORF2 of `fitness` and matches LIKE 'Y%'.
# The UNION already merges duplicate values from the two columns; the outer
# SELECT DISTINCT is applied on top of that result.
# NOTE(review): there is no IF NOT EXISTS clause, so re-running this cell on a
# database that already holds `yorf_list` is expected to fail — confirm.
db_fitness.execute("\
        CREATE TABLE yorf_list AS \
            SELECT DISTINCT * \
            FROM ( \
                SELECT ORF1 AS YORF FROM fitness \
                UNION \
                SELECT ORF2 AS YORF FROM fitness \
                ) \
            WHERE YORF LIKE 'Y%' \
    ")

In [None]:
# Sanity check: read the first five rows of `yorf_list` back from the database.
print("Database on disk reports these heads for yorf_list:")
print(pd.read_sql_query(sql="SELECT * FROM yorf_list LIMIT 5",con=db_fitness))

In [None]:
# Index the PPI column of both `n_positive` and `fitness`.
# NOTE(review): presumably to speed up later lookups/joins on PPI — no such
# query is visible in this part of the notebook, so confirm against later cells.
# NOTE(review): neither statement uses IF NOT EXISTS, so re-running this cell
# against a database that already has these indexes is expected to fail.
db_fitness.execute("CREATE INDEX n_positive_PPI_index ON n_positive (PPI)")
db_fitness.execute("CREATE INDEX fitness_PPI_index ON fitness (PPI)")