# RQ3 - Filter Sequential Patterns

Uses:

- resources/seq_patterns/substitutions.csv (from prepare/historical_seqpatterns_format.ipynb **manual**)
- resources/historical_join_db.xlsx (HISTORICAL_FILE_JOIN_DB from prepare/historic_count_models.ipynb)

Generates:

- patterns.tex (tab:patterns)
- substitutions.tex (tab:substitutions)
- substitutes.tex (tab:substitutes)

Variables:

- rq3_replacements
- rq3_min_support
- rq3_max_support
- rq3_support_values
- rq3_total_replacements
- rq3_total_replacements_projects
- rq3_relational_replacements
- rq3_relational_replacements_projects
- rq3_nonrelational_replacements
- rq3_nonrelational_replacements_projects
- rq3_both_replacements
- rq3_both_replacements_projects
- rq3_same_replacements
- rq3_same_replacements_projects
- rq3_only_same_replacements_projects
- rq3_non_relational_only_replacements_projects
- rq3_repl_hypersql
- rq3_repl_hypersql_projects
- rq3_repl_hypersql_dbms
- rq3_repl_hypersql_redis
- rq3_repl_hypersql_redis_ext
- rq3_repl_hypersql_mysql
- rq3_repl_hypersql_postgresql
- rq3_repl_mysql
- rq3_repl_mysql_projects 
- rq3_repl_mysql_redis
- rq3_repl_hbase
- rq3_repl_hbasel_projects
- rq3_repl_hbase_hypersql
- rq3_apache_camel_hbase
- rq3_replby_redis
- rq3_replby_redis_projects
- rq3_replby_oracle
- rq3_replby_oracle_projects
- rq3_replby_postgresql
- rq3_replby_postgres_projects
- rq3_replby_hypersql
- rq3_replby_hypersql_projects
- rq3_replby_mysql
- rq3_replby_mysql_projects
- rq3_replby_hbase
- rq3_replby_hbase_projects
- rq3_replby_etcd
- rq3_replby_etcd_projects

In [1]:
import sys  
sys.path.insert(1, '..')

In [2]:
import os
import functools
import re

import pandas as pd
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from num2words import num2words

from util import SEQ_PATTERNS_DIR, IMAGES_DIR, HISTORICAL_FILE_JOIN_DB
from analysis_helpers import var, relative_var, load_vars
from analysis_helpers import (
    RELATIONAL_ONLY_DBS, NONRELATIONAL_ONLY_DBS,
    MULTIMODEL_RELATIONAL, MULTIMODEL_NONRELATIONAL,
    RELATIONAL_DBS, NONRELATIONAL_DBS, MULTIMODEL_DBS
)

def union_sets(series):
    return functools.reduce(lambda x,y: x | y, series, set())

def union_values(series):
    return functools.reduce(lambda x,y: x | (y if isinstance(y, set) else {y}), series, set())

In [3]:
df = pd.read_csv(SEQ_PATTERNS_DIR + os.sep + "substitutions.csv")
# Remove meta-rules
df.drop(df[
    (df["Name"] == "X -> InY & OutX -> Y")
    | (df["Name"] == "X -> InY -> OutX & Y") 
].index, inplace=True)
# Identify DBMSs

replacements = {
    "Join_Ignite_NoSql": "Ignite-NoSql",
    "_": " ",
    "GoogleCloudDatastore": "Google Cloud Datastore",
    "SAPAdaptiveServer": "SAP Adaptive Server",
    "SAPSQLAnywhere": "SAP SQL Anywhere",
    "MicrosoftAzureCosmosDB": "Microsoft Azure CosmosDB",
    "FirebaseRealtime": "Firebase Realtime",
}
df["X"] = df["Bindings"].str.extract(r'X = (.+)?; Y =', expand=False).replace(replacements, regex=True)
df["Y"] = df["Bindings"].str.extract(r'Y = (.+)?', expand=False).replace(replacements, regex=True)
df["LinesSet"] = df["Lines"].apply(lambda x: set(x.split("; ")))
df.head()

Unnamed: 0,Name,Support,Line Count,Lines,Bindings,Source,X,Y,LinesSet
1,PostgreSQL -> InOracle & OutPostgreSQL -> Oracle,0.012876,3,76; 13; 23,X = PostgreSQL; Y = Oracle,X -> InY & OutX -> Y -v X -v Y,PostgreSQL,Oracle,"{13, 23, 76}"
2,IBM_DB2 -> InHyperSQL & OutIBM_DB2 -> HyperSQL,0.004292,1,68,X = IBM_DB2; Y = HyperSQL,X -> InY & OutX -> Y -v X -v Y,IBM DB2,HyperSQL,{68}
3,MySQL -> InPostgreSQL & OutMySQL -> PostgreSQL,0.004292,1,56,X = MySQL; Y = PostgreSQL,X -> InY & OutX -> Y -v X -v Y,MySQL,PostgreSQL,{56}
4,MySQL -> InIBM_DB2 & OutMySQL -> IBM_DB2,0.004292,1,86,X = MySQL; Y = IBM_DB2,X -> InY & OutX -> Y -v X -v Y,MySQL,IBM DB2,{86}
5,MySQL -> InOracle & OutMySQL -> Oracle,0.004292,1,23,X = MySQL; Y = Oracle,X -> InY & OutX -> Y -v X -v Y,MySQL,Oracle,{23}


In [4]:
def categorize_model(dbms):
    if dbms in RELATIONAL_DBS:
        return "Relational"
    elif dbms in NONRELATIONAL_DBS:
        return "Non Relational"
    else:
        print("Check: ", dbms)
    
df["X Model"] = df["X"].apply(categorize_model)
df["Y Model"] = df["Y"].apply(categorize_model)

### DBMSs that were replaced by others

In [5]:
df3 = pd.read_excel(HISTORICAL_FILE_JOIN_DB, keep_default_na=False)
renomeacoes = {
    'MySQL_Maria DB': 'MySQL',
    'PostgreSQL_ CockroachDB': 'PostgreSQL',
    'MS SQL Server_Microsoft Azure SQL Database': 'MS SQL Server',
    'MS_SQL_Server': 'MS SQL Server',
    'IBM_DB2': 'IBM DB2',
    'Virtuoso_NoSql': 'Virtuoso-NoSql',
    'Virtuoso_Sql': 'Virtuoso-Sql',
    'Ignite_NoSql': 'Ignite-NoSql',
    'Ignite_Sql': 'Ignite-Sql',
    'GoogleCloudFilestore': 'GoogleCloudFirestore',
    'MS_Access': 'MS Access'
}

# Renomeando os valores na coluna 'banco_de_dados'
df3['Databases'] = df3['Databases'].replace(renomeacoes)
df3 = df3.set_index("Databases")
dbms_counts = (df3.iloc[:,3:].T > 0).sum().to_dict()

In [6]:
df4 = df.groupby(["X"], as_index=False).agg(replacements=("Y", union_values), projects=("LinesSet", union_sets))
df4["# Replacer DBMSs"] = df4["replacements"].apply(len)
df4.rename(columns={"X": "Replaced DBMS"}, inplace=True)

#df4.rename(columns={"X": "DBMS", "replacements": "# Replacements"}, inplace=True)
df4["# Projects"] = df4["projects"].apply(len)
df4["% Projects"] = df4.apply(lambda row: row["# Projects"] / dbms_counts[row["Replaced DBMS"]], axis=1)
df4.sort_values(["% Projects", "# Replacer DBMSs", "# Projects"], inplace=True, ascending=False)
dbms_was_replaced_count = df4.set_index("Replaced DBMS")["# Replacer DBMSs"].to_dict()
dbms_was_replaced_in_projects = df4.set_index("Replaced DBMS")["# Projects"].to_dict()
df4

Unnamed: 0,Replaced DBMS,replacements,projects,# Replacer DBMSs,# Projects,% Projects
2,Couchbase,"{Neo4j, Snowflake, Cassandra, Influx DB, Hazel...","{208, 76, 59}",13,3,0.333333
27,SAP Adaptive Server,"{Etcd, Cassandra, HyperSQL, HBase, DynamoDB, I...","{68, 59, 151, 62}",10,4,0.333333
14,Informix,"{Neo4j, Etcd, Cassandra, Redis, HyperSQL, Orac...","{68, 27, 115, 159, 62}",9,5,0.333333
1,CouchDB,"{GoogleCloudFirestore, H2, Influx DB, Microsof...",{76},5,1,0.333333
15,Ingres,"{MongoDB, HBase, PostGIS, MS SQL Server, Terad...","{113, 151}",8,2,0.285714
12,Ignite-NoSql,"{MongoDB, Neo4j, Snowflake, Influx DB, Hazelca...","{74, 59}",10,2,0.25
25,Realm,"{MongoDB, MySQL, MarkLogic, PostgreSQL}",{147},4,1,0.25
10,HyperSQL,"{Neo4j, PostGIS, Redis, HBase, MS SQL Server, ...","{38, 27, 48, 86, 3, 112, 219, 44, 210, 188, 10...",18,19,0.234568
5,Firebird,"{Neo4j, Cassandra, Etcd, IBM DB2}","{151, 62, 159}",4,3,0.214286
8,HBase,"{PostGIS, Redis, Influx DB, SAP SQL Anywhere, ...","{68, 48, 44, 151, 39}",18,5,0.208333


In [7]:
assert max(dbms_was_replaced_count.items(), key=lambda x: x[1]) == ('HyperSQL', 18), "Paper: 'In summary, we have identified that HyperSQL is the DBMS mostly susceptible to replacements during the projects' history.'"
assert list(df4.set_index("Replaced DBMS")["# Replacer DBMSs"].nlargest(2).index) == ["HyperSQL", "HBase"], "Paper: 'In a more comprehensive analysis, considering only the replaced DBMS, including the patterns that occurred in only 1 project, we found HyperSQL and HBase among the most susceptible to be replaced.'"
assert list(df4.set_index("Replaced DBMS")["# Replacer DBMSs"].nlargest(2).index) == ["HyperSQL", "HBase"], r"Paper: 'Comparing Tables~\ref{tab:substitutions} and \ref{tab:substitutes}, we observe that both HyperSQL and HBase were replaced by other \nvarc{rq3_repl_hypersql}{18} DBMSs and replaced other \nvarc{rq3_replby_hypersql}{11} DBMSs, indicating a decrease in their adoption.'"

assert (df4["# Replacer DBMSs"] > df4["# Projects"]).sum()/len(df4) > 0.8, "Paper: 'Additionally, we observed that most replaced DBMSs have experienced more than one replacement in certain projects.'"

In [8]:
columns = ["Replaced DBMS", "# Replacer DBMSs", "# Projects", "% Projects"]

result = df4[columns].to_latex(index=False, float_format="{:.1%}".format)
result = result.replace("%", "\\%")
result = re.sub(r"\\toprule\n.*\n\\midrule", r"""\\hline
\\multicolumn{1}{l}{\\textbf{Replaced DBMS}} & \\multicolumn{1}{r}{\\textbf{\\# Replacer DBMSs}} & \\multicolumn{1}{c}{\\textbf{\\# Projects}} & \\textbf{\\% Projects} \\\\
\\hline""", result, flags=re.MULTILINE).replace(r'\bottomrule', r'\hline')
#result = result.replace(r'<\#i\#>', r'\textbf{').replace(r'<\#f\#>', r'}').replace(r'\toprule', r'\hline').replace(r'\midrule', r'\hline').replace(r'\bottomrule', r'\hline')
print(result)

#result = df4[columns].to_latex(index=False, float_format="{:.1%}".format, header=[f"<#i#>{column}<#f#>" for column in columns])
#result = result.replace(r'<\#i\#>', r'\textbf{').replace(r'<\#f\#>', r'}').replace(r'\toprule', r'\hline').replace(r'\midrule', r'\hline').replace(r'\bottomrule', r'\hline')
with open(IMAGES_DIR + os.sep + "substitutions.tex", "w") as f:
    f.write(result)

\begin{tabular}{lrrr}
\hline
\multicolumn{1}{l}{\textbf{Replaced DBMS}} & \multicolumn{1}{r}{\textbf{\# Replacer DBMSs}} & \multicolumn{1}{c}{\textbf{\# Projects}} & \textbf{\% Projects} \\
\hline
Couchbase & 13 & 3 & 33.3\% \\
SAP Adaptive Server & 10 & 4 & 33.3\% \\
Informix & 9 & 5 & 33.3\% \\
CouchDB & 5 & 1 & 33.3\% \\
Ingres & 8 & 2 & 28.6\% \\
Ignite-NoSql & 10 & 2 & 25.0\% \\
Realm & 4 & 1 & 25.0\% \\
HyperSQL & 18 & 19 & 23.5\% \\
Firebird & 4 & 3 & 21.4\% \\
HBase & 18 & 5 & 20.8\% \\
Hazelcast & 8 & 7 & 20.6\% \\
SQLite & 15 & 8 & 20.0\% \\
Teradata & 10 & 2 & 18.2\% \\
SapHana & 4 & 2 & 18.2\% \\
Neo4j & 4 & 3 & 17.6\% \\
IBM DB2 & 8 & 7 & 17.1\% \\
MS SQL Server & 13 & 11 & 16.4\% \\
Cassandra & 14 & 5 & 16.1\% \\
MS Access & 5 & 2 & 15.4\% \\
Influx DB & 1 & 1 & 14.3\% \\
Firebase Realtime & 2 & 1 & 12.5\% \\
Google Cloud Datastore & 12 & 5 & 11.6\% \\
Snowflake & 1 & 1 & 11.1\% \\
PostgreSQL & 14 & 11 & 10.8\% \\
H2 & 14 & 12 & 10.5\% \\
MySQL & 13 & 13 & 9.5\% \\
PostGI

### DBMS that replaced others

In [9]:
df5 = df.groupby(["Y"], as_index=False).agg(replacements=("X", union_values), projects=("LinesSet", union_sets))
df5["# Replaced DBMSs"] = df5["replacements"].apply(len)
df5.rename(columns={"Y": "Replacer DBMS"}, inplace=True)
df5["# Projects"] = df5["projects"].apply(len)
df5["% Projects"] = df5.apply(lambda row: row["# Projects"] / dbms_counts[row["Replacer DBMS"]], axis=1)
df5.sort_values(["% Projects", "# Replaced DBMSs", "# Projects"], inplace=True, ascending=False)
dbms_replaced_count = df5.set_index("Replacer DBMS")["# Replaced DBMSs"].to_dict()
dbms_replaced_in_projects = df5.set_index("Replacer DBMS")["# Projects"].to_dict()
df5

Unnamed: 0,Replacer DBMS,replacements,projects,# Replaced DBMSs,# Projects,% Projects
20,Microsoft Azure CosmosDB,"{SQLite, HBase, Ignite-NoSql, CouchDB, Cassand...","{76, 59, 44, 51}",7,4,0.571429
29,SAP SQL Anywhere,"{SQLite, HBase}",{39},2,1,0.5
15,Influx DB,"{HBase, CouchDB, Ignite-NoSql, Couchbase, Haze...","{74, 44, 76}",5,3,0.428571
0,ArangoDB,"{H2, HBase}","{44, 72}",2,2,0.4
5,Etcd,"{Snowflake, Firebird, Cassandra, HyperSQL, HBa...","{68, 44, 62}",11,3,0.375
1,Cassandra,"{Neo4j, Firebird, Redis, HBase, MS SQL Server,...","{83, 76, 30, 44, 47, 188, 72, 59, 159, 57, 69}",18,11,0.354839
3,Couchbase,"{Oracle, HBase, PostgreSQL, SAP Adaptive Server}","{44, 76, 59}",4,3,0.333333
8,GoogleCloudFirestore,"{Couchbase, CouchDB}",{76},2,1,0.333333
23,Neo4j,"{Firebird, Ignite-NoSql, Cassandra, HyperSQL, ...","{72, 159, 59, 74, 44}",12,5,0.294118
17,Ingres,"{SAP Adaptive Server, SapHana, MS SQL Server}","{154, 151}",3,2,0.285714


In [10]:
assert set(df5.set_index("Replacer DBMS")["# Replaced DBMSs"].nlargest(2).index) == {'Cassandra', 'MS SQL Server'}, "Paper: Some DBMSs replaced many distinct DBMSs. For instance, MS SQL Server and Cassandra replaced, each, 18 distinct DBMSs in 19 and 11 projects, respectively. However, from the viewpoint of the number of projects where the replacement occurred,"
assert list(df5.set_index("Replacer DBMS")["# Replaced DBMSs"].nlargest(2).unique()) == [18], "Paper: Some DBMSs replaced many distinct DBMSs. For instance, MS SQL Server and Cassandra replaced, each, 18 distinct DBMSs in 19 and 11 projects, respectively. However, from the viewpoint of the number of projects where the replacement occurred,"
assert dbms_replaced_in_projects["Cassandra"] == 11, "Paper: Some DBMSs replaced many distinct DBMSs. For instance, MS SQL Server and Cassandra replaced, each, 18 distinct DBMSs in 19 and 11 projects, respectively. However, from the viewpoint of the number of projects where the replacement occurred,"
assert dbms_replaced_in_projects["MS SQL Server"] == 19, "Paper: Some DBMSs replaced many distinct DBMSs. For instance, MS SQL Server and Cassandra replaced, each, 18 distinct DBMSs in 19 and 11 projects, respectively. However, from the viewpoint of the number of projects where the replacement occurred,"

In [11]:
assert dbms_replaced_in_projects["Redis"] == 22, r"Paper: Redis \new{(which replaced 15 DBMSs in 22 projects)}"
assert dbms_replaced_count["Redis"] == 15, r"Paper: Redis \new{(which replaced 15 DBMSs in 22 projects)}"
assert dbms_replaced_in_projects["PostgreSQL"] == 24, r"Paper: PostgreSQL \new{(which replaced 17 DBMSs in 24 projects)}"
assert dbms_replaced_count["PostgreSQL"] == 17, r"Paper: PostgreSQL \new{(which replaced 17 DBMSs in 24 projects)}"
assert list(df5.set_index("Replacer DBMS")["# Projects"].nlargest(2).index) == ["PostgreSQL", "Redis"], r"Paper: However, from the viewpoint of the number of projects where the replacement occurred,} the DBMSs most chosen to replace others with are Redis \new{(which replaced 15 DBMSs in 22 projects)} and PostgreSQL \new{(which replaced 17 DBMSs in 24 projects)}."
assert list(df5.set_index("Replacer DBMS")["# Projects"].nlargest(2).index) == ["PostgreSQL", "Redis"], r"Paper: Also, considering just the \new{number of projects}, we found Redis and PostgreSQL among the most used as a replacement"

In [12]:
columns = ["Replacer DBMS", "# Replaced DBMSs", "# Projects", "% Projects"]
result = df5[columns].to_latex(index=False, float_format="{:.1%}".format)
result = result.replace("%", "\\%")
result = re.sub(r"\\toprule\n.*\n\\midrule", r"""\\hline
\\multicolumn{1}{l}{\\textbf{Replacer DBMS}} & \\multicolumn{1}{r}{\\textbf{\\# Replaced DBMSs}} & \\multicolumn{1}{c}{\\textbf{\\# Projects}} & \\textbf{\\% Projects} \\\\
\\hline""", result, flags=re.MULTILINE).replace(r'\bottomrule', r'\hline')
#result = result.replace(r'<\#i\#>', r'\textbf{').replace(r'<\#f\#>', r'}').replace(r'\toprule', r'\hline').replace(r'\midrule', r'\hline').replace(r'\bottomrule', r'\hline')
print(result)
with open(IMAGES_DIR + os.sep + "substitutes.tex", "w") as f:
    f.write(result)

\begin{tabular}{lrrr}
\hline
\multicolumn{1}{l}{\textbf{Replacer DBMS}} & \multicolumn{1}{r}{\textbf{\# Replaced DBMSs}} & \multicolumn{1}{c}{\textbf{\# Projects}} & \textbf{\% Projects} \\
\hline
Microsoft Azure CosmosDB & 7 & 4 & 57.1\% \\
SAP SQL Anywhere & 2 & 1 & 50.0\% \\
Influx DB & 5 & 3 & 42.9\% \\
ArangoDB & 2 & 2 & 40.0\% \\
Etcd & 11 & 3 & 37.5\% \\
Cassandra & 18 & 11 & 35.5\% \\
Couchbase & 4 & 3 & 33.3\% \\
GoogleCloudFirestore & 2 & 1 & 33.3\% \\
Neo4j & 12 & 5 & 29.4\% \\
Ingres & 3 & 2 & 28.6\% \\
MS SQL Server & 18 & 19 & 28.4\% \\
MongoDB & 15 & 14 & 27.5\% \\
PostGIS & 5 & 3 & 27.3\% \\
HBase & 11 & 6 & 25.0\% \\
Ignite-NoSql & 2 & 2 & 25.0\% \\
Redis & 15 & 22 & 24.4\% \\
PostgreSQL & 17 & 24 & 23.5\% \\
MarkLogic & 5 & 3 & 23.1\% \\
Oracle & 17 & 19 & 20.2\% \\
DynamoDB & 10 & 6 & 18.2\% \\
ClickHouse & 5 & 2 & 18.2\% \\
Teradata & 2 & 2 & 18.2\% \\
H2 & 15 & 19 & 16.7\% \\
Hazelcast & 10 & 5 & 14.7\% \\
Google Cloud Datastore & 9 & 6 & 14.0\% \\
HyperSQL & 11 & 

### Replacement patterns table

Remove rules with support < 3

In [13]:
df2 = df.drop(df[df["Line Count"] < 3].index).copy()
df2 = df2.sort_values(by=["X", "Line Count"], ascending=[True, False])
df2

Unnamed: 0,Name,Support,Line Count,Lines,Bindings,Source,X,Y,LinesSet,X Model,Y Model
93,Cassandra -> InPostgreSQL -> OutCassandra & Po...,0.012876,3,59; 5; 47,X = Cassandra; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,Cassandra,PostgreSQL,"{47, 5, 59}",Non Relational,Relational
187,Cassandra -> InMS_SQL_Server -> OutCassandra &...,0.012876,3,68; 69; 47,X = Cassandra; Y = MS_SQL_Server,X -> InY -> OutX & Y -v X -v Y,Cassandra,MS SQL Server,"{68, 47, 69}",Non Relational,Relational
219,Couchbase -> InPostgreSQL -> OutCouchbase & Po...,0.012876,3,208; 59; 76,X = Couchbase; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,Couchbase,PostgreSQL,"{208, 76, 59}",Non Relational,Relational
182,HBase -> InHyperSQL -> OutHBase & HyperSQL,0.017167,4,48; 44; 68; 39,X = HBase; Y = HyperSQL,X -> InY -> OutX & Y -v X -v Y,HBase,HyperSQL,"{68, 44, 48, 39}",Non Relational,Relational
120,HBase -> InH2 -> OutHBase & H2,0.012876,3,44; 68; 39,X = HBase; Y = H2,X -> InY -> OutX & Y -v X -v Y,HBase,H2,"{68, 44, 39}",Non Relational,Relational
181,HBase -> InMS_SQL_Server -> OutHBase & MS_SQL_...,0.012876,3,151; 44; 68,X = HBase; Y = MS_SQL_Server,X -> InY -> OutX & Y -v X -v Y,HBase,MS SQL Server,"{68, 44, 151}",Non Relational,Relational
183,HBase -> InRedis -> OutHBase & Redis,0.012876,3,151; 44; 68,X = HBase; Y = Redis,X -> InY -> OutX & Y -v X -v Y,HBase,Redis,"{68, 44, 151}",Non Relational,Non Relational
78,Hazelcast -> InHyperSQL -> OutHazelcast & Hype...,0.012876,3,0; 219; 109,X = Hazelcast; Y = HyperSQL,X -> InY -> OutX & Y -v X -v Y,Hazelcast,HyperSQL,"{219, 109, 0}",Non Relational,Relational
83,HyperSQL -> InRedis -> OutHyperSQL & Redis,0.034335,8,3; 38; 44; 112; 210; 115; 20; 219,X = HyperSQL; Y = Redis,X -> InY -> OutX & Y -v X -v Y,HyperSQL,Redis,"{38, 3, 112, 219, 44, 210, 20, 115}",Relational,Non Relational
62,HyperSQL -> InPostgreSQL -> OutHyperSQL & Post...,0.021459,5,0; 100; 148; 86; 109,X = HyperSQL; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,HyperSQL,PostgreSQL,"{86, 148, 109, 0, 100}",Relational,Relational


In [14]:
print("Number of replacements:", var("rq3_replacements", len(df2)))

Number of replacements: 18


In [15]:
latex_table = [
   # r"\begin{tabular}{lrr}",
    r"\begin{tabular}{lr}",
    r"\hline",
   # r"\textbf{Pattern} & \textbf{Support} & \textbf{Total}\\",
    r"\textbf{Pattern} & \textbf{Support} \\",
    r"\hline"
]
for group, subdf in df2.groupby("X"):
    subdf["Name"].replace({
        "->" : r"$\\rightarrow$",
        "([^{])In(.+?) ": r"\1\2$_{In}$ ",
        "([^{])In(.+?)$": r"\1\2$_{In}$",
        "([^{])Out(.+?) ": r"\1\2$_{Out}$ ",
        "([^{])Out(.+?)$": r"\1\2$_{Out}$",
        "([^$])_": r"\1 ",
        "&": r"  ",
    }, regex=True, inplace=True)
    for i, (_, row) in enumerate(subdf.iterrows()):
        values = [row["Name"], str(row["Line Count"])]
        #if i == 0:
        #    total = dbms_was_replaced_count[row["X"]]
        #    # total = subdf['Line Count'].sum() # Sum only filtered
        #    values.append(f"\\multirow{{{len(subdf)}}}{{*}}{{{total}}}")
        #else:
        #    values.append("")
        latex_table.append(" & ".join(values) + r" \\")
    latex_table.append(r"\hline")
latex_table.append(r"\end{tabular}")

with open(IMAGES_DIR + os.sep + "patterns.tex", "w") as f:
    f.write("\n".join(latex_table) + "\n")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  subdf["Name"].replace({


In [16]:
assert df[df["Name"] == "PostgreSQL -> InOracle & OutPostgreSQL -> Oracle"]["Line Count"].to_list() == [3], "Paper: 'An example is the pattern \textbf{$PostgreSQL \rightarrow Oracle_{In}$ $PostgreSQL_{Out} \rightarrow$} \textbf{$Oracle$} with support = 3. This indicates that three projects used PostgreSQL, then in a later slice started using Oracle and stopped using PostgreSQL, maintaining  Oracle in a later slice.'"

In [17]:
assert df[df["Name"] == "HyperSQL -> InRedis -> OutHyperSQL & Redis"]["Line Count"].to_list() == [8], "Paper: 'The pattern \textbf{$HyperSQL \rightarrow Redis_{In} \rightarrow HyperSQL_{Out}$ $Redis$} with support = 8 indicates that eight projects replaced HyperSQL by Redis.'"

In [18]:
print("Min support:", var("rq3_min_support", df2["Line Count"].min()))
print("Max support:", var("rq3_max_support", df2["Line Count"].max()))
print("Support values:", var("rq3_support_values", ', '.join(map(str, sorted(list(set(df2["Line Count"].to_list())))))))

Min support: 3
Max support: 8
Support values: 3, 4, 5, 8


### Data

In [19]:
total_replacements = replacements_among_relational = df.groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
total_replacements_count = len(total_replacements)
total_replacements_projects_count = len(union_sets(total_replacements['lines']))

print("Total number of replacements", var("rq3_total_replacements", total_replacements_count))
print("Projects with replacements", var("rq3_total_replacements_projects",  total_replacements_projects_count))

Total number of replacements 296
Projects with replacements 67


In [20]:
replacements_among_relational = df[
    (df["X Model"] == "Relational") & (df["Y Model"] == "Relational")
].groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
print("Replacements among relational:", relative_var("rq3_relational_replacements", len(replacements_among_relational), total_replacements_count))
print("Projects with replacements among relational:", relative_var("rq3_relational_replacements_projects", len(union_sets(replacements_among_relational['lines'])), total_replacements_projects_count))


Replacements among relational: 89 (30.1%)
Projects with replacements among relational: 44 (65.7%)


In [21]:
replacements_among_nonrelational = df[
    (df["X Model"] == "Non Relational") & (df["Y Model"] == "Non Relational")
].groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
print("Replacements among non relational:", relative_var("rq3_nonrelational_replacements", len(replacements_among_nonrelational), total_replacements_count))
print("Projects with replacements among non relational:", relative_var("rq3_nonrelational_replacements_projects", len(union_sets(replacements_among_nonrelational['lines'])), total_replacements_projects_count))


Replacements among non relational: 66 (22.3%)
Projects with replacements among non relational: 21 (31.3%)


In [22]:
replacements_among_both = df[
    ((df["X Model"] == "Relational") & (df["Y Model"] == "Non Relational"))
    | ((df["X Model"] == "Non Relational") & (df["Y Model"] == "Relational"))
].groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
print("Replacements among both:", relative_var("rq3_both_replacements", len(replacements_among_both), total_replacements_count))
print("Projects with replacements among both:", relative_var("rq3_both_replacements_projects", len(union_sets(replacements_among_both['lines'])), total_replacements_projects_count))


Replacements among both: 141 (47.6%)
Projects with replacements among both: 43 (64.2%)


In [23]:
replacements_among_same = df[
    ((df["X Model"] == "Relational") & (df["Y Model"] == "Relational"))
    | ((df["X Model"] == "Non Relational") & (df["Y Model"] == "Non Relational"))
].groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
print("Replacements among same:", relative_var("rq3_same_replacements", len(replacements_among_same), total_replacements_count))
print("Projects with replacements among same:", relative_var("rq3_same_replacements_projects", len(union_sets(replacements_among_same['lines'])), total_replacements_projects_count))


Replacements among same: 155 (52.4%)
Projects with replacements among same: 56 (83.6%)


In [24]:
projects_with_only_replacements_of_the_same_model = union_sets(total_replacements['lines']) - union_sets(replacements_among_both['lines'])
print("Projects with replacements only among same model:", relative_var("rq3_only_same_replacements_projects", len(projects_with_only_replacements_of_the_same_model), total_replacements_projects_count))


Projects with replacements only among same model: 24 (35.8%)


In [25]:
nonrelational_only_replacements = projects_with_only_replacements_of_the_same_model & union_sets(replacements_among_nonrelational['lines'])
print("Projects with replacements only among non relational:", relative_var("rq3_non_relational_only_replacements_projects", len(nonrelational_only_replacements), len(projects_with_only_replacements_of_the_same_model)))


Projects with replacements only among non relational: 3 (12.5%)


In [26]:
assert df[df["Name"] == "Cassandra -> InPostgreSQL -> OutCassandra & PostgreSQL"]["Line Count"].to_list() == [3], "Paper: 'the replacements occur among distinct data models such as Cassandra replacing PostgreSQL in 3 projects'"

In [27]:
print("HyperSQL replacements:", var("rq3_repl_hypersql", dbms_was_replaced_count["HyperSQL"]))
print("HyperSQL replacements in projects:", var("rq3_repl_hypersql_projects", dbms_was_replaced_in_projects["HyperSQL"]))

HyperSQL replacements: 18
HyperSQL replacements in projects: 19


In [28]:
print("HyperSQL replacements in projects:", var("rq3_repl_hypersql_dbms", df[df["X"] == "HyperSQL"].groupby("X")["Y"].agg(other=set)["other"].apply(len).to_list()[0]))

HyperSQL replacements in projects: 18


In [29]:
print("HyperSQL replaced by Redis:", var("rq3_repl_hypersql_redis", df[(df["X"] == "HyperSQL") & (df["Y"] == "Redis")]["Line Count"].sum()))
print("HyperSQL replaced by Redis (ext):", var("rq3_repl_hypersql_redis_ext", num2words(df[(df["X"] == "HyperSQL") & (df["Y"] == "Redis")]["Line Count"].sum())))

HyperSQL replaced by Redis: 8
HyperSQL replaced by Redis (ext): eight


In [30]:
print("HyperSQL replaced by MySQL:", var("rq3_repl_hypersql_mysql", df[(df["X"] == "HyperSQL") & (df["Y"] == "MySQL")]["Line Count"].sum()))

HyperSQL replaced by MySQL: 6


In [31]:
print("HyperSQL replaced by PostgreSQL:", var("rq3_repl_hypersql_postgresql", df[(df["X"] == "HyperSQL") & (df["Y"] == "PostgreSQL")]["Line Count"].sum()))

HyperSQL replaced by PostgreSQL: 7


In [32]:
assert len(df[(df["X"] == "Redis") & (df["Y"] == "HyperSQL")]) == 0, "Paper: 'In the opposite direction, we did not find a pattern indicating HyperSQL replacing Redis'"

In [33]:
print("MySQL replacements:", var("rq3_repl_mysql", dbms_was_replaced_count["MySQL"]))
print("MySQL replacements in projects:", var("rq3_repl_mysql_projects", dbms_was_replaced_in_projects["MySQL"]))

MySQL replacements: 13
MySQL replacements in projects: 13


In [34]:
print("MySQL replaced by Redis:", var("rq3_repl_mysql_redis", num2words(df[(df["X"] == "MySQL") & (df["Y"] == "Redis")]["Line Count"].sum())))

MySQL replaced by Redis: four


In [35]:
assert len(df[(df["X"] == "Redis") & (df["Y"] == "MySQL")]) == 0, "Paper: 'The only MySQL replacement pattern that occurred frequently enough was its replacement by Redis (3 times), with no replacements in the opposite direction.'"

In [36]:
print("HBase replacements:", var("rq3_repl_hbase", dbms_was_replaced_count["HBase"]))
print("HBase replacements in projects:", var("rq3_repl_hbasel_projects", dbms_was_replaced_in_projects["HBase"]))

HBase replacements: 18
HBase replacements in projects: 5


In [37]:
print("HBase replaced by HyperSQL:", var("rq3_repl_hbase_hypersql", num2words(df[(df["X"] == "HBase") & (df["Y"] == "HyperSQL")]["Line Count"].sum())))

HBase replaced by HyperSQL: four


In [38]:
assert "Etcd" not in dbms_was_replaced_count, "Paper: 'On the other hand, Etcd was not replaced but substituted another DBMS'"

In [39]:
SEQUENTIAL_PROJECTS = SEQ_PATTERNS_DIR + os.sep + 'sequencial_project.txt'
with open(SEQUENTIAL_PROJECTS, "r") as f:
    projects = eval(f.read())
index = str(projects.index(('apache', 'camel')))

In [40]:
hbase_replacement_in_apache_camel = df[(df["X"] == "HBase") & (df["LinesSet"].apply(lambda x: index in x))]

In [41]:
assert len(hbase_replacement_in_apache_camel) == 14, "Paper: 'For example, in the Apache/Camel project, we discovered 14 distinct patterns of HBase substitutions for different DBMSs'"

In [42]:
result = list(hbase_replacement_in_apache_camel.groupby("Y")["LinesSet"].agg(lines=union_sets).index)
text = ', '.join(result[:-1]) + f", and {result[-1]}" 
print("DBMS with no removals:", var("rq3_apache_camel_hbase", text))

DBMS with no removals: ArangoDB, Cassandra, Couchbase, Etcd, H2, HyperSQL, Ignite-NoSql, Influx DB, MS SQL Server, Microsoft Azure CosmosDB, MySQL, PostGIS, PostgreSQL, and Redis


In [43]:
print("Redis replaces:", var("rq3_replby_redis", dbms_replaced_count["Redis"]))
print("Redis replaces in projects:", var("rq3_replby_redis_projects", dbms_replaced_in_projects["Redis"]))

Redis replaces: 15
Redis replaces in projects: 22


In [44]:
print("Oracle replaces:", var("rq3_replby_oracle", dbms_replaced_count["Oracle"]))
print("Oracle replaces in projects:", var("rq3_replby_oracle_projects", dbms_replaced_in_projects["Oracle"]))

Oracle replaces: 17
Oracle replaces in projects: 19


In [45]:
print("PostgreSQL replaces:", var("rq3_replby_postgresql", dbms_replaced_count["PostgreSQL"]))
print("PostgreSQL replaces in projects:", var("rq3_replby_postgres_projects", dbms_replaced_in_projects["PostgreSQL"]))

PostgreSQL replaces: 17
PostgreSQL replaces in projects: 24


In [46]:
print("HyperSQL replaces:", var("rq3_replby_hypersql", dbms_replaced_count["HyperSQL"]))
print("HyperSQL replaces in projects:", var("rq3_replby_hypersql_projects", dbms_replaced_in_projects["HyperSQL"]))

HyperSQL replaces: 11
HyperSQL replaces in projects: 11


In [47]:
print("MySQL replaces:", var("rq3_replby_mysql", dbms_replaced_count["MySQL"]))
print("MySQL replaces in projects:", var("rq3_replby_mysql_projects", dbms_replaced_in_projects["MySQL"]))

MySQL replaces: 13
MySQL replaces in projects: 18


In [48]:
print("HBase replaces:", var("rq3_replby_hbase", dbms_replaced_count["HBase"]))
print("HBase replaces in projects:", var("rq3_replby_hbase_projects", dbms_replaced_in_projects["HBase"]))

HBase replaces: 11
HBase replaces in projects: 6


In [49]:
print("Etcd replaces:", var("rq3_replby_etcd", dbms_replaced_count["Etcd"]))
print("Etcd replaces in projects:", var("rq3_replby_etcd_projects", dbms_replaced_in_projects["Etcd"]))

Etcd replaces: 11
Etcd replaces in projects: 3
