# RQ3 - Filter Sequential Patterns

Uses:

- resources/seq_patterns/substitutions.csv (from prepare/historical_seqpatterns_format.ipynb **manual**)
- resources/historical_join_db.xlsx (HISTORICAL_FILE_JOIN_DB from prepare/historic_count_models.ipynb)

Generates:

- patterns.tex (tab:patterns)
- substitutions.tex (tab:substitutions)
- substitutes.tex (tab:substitutes)

Variables:

- rq3_replacements
- rq3_min_support
- rq3_max_support
- rq3_support_values
- rq3_total_replacements
- rq3_total_replacements_projects
- rq3_relational_replacements
- rq3_relational_replacements_projects
- rq3_nonrelational_replacements
- rq3_nonrelational_replacements_projects
- rq3_both_replacements
- rq3_both_replacements_projects
- rq3_same_replacements
- rq3_same_replacements_projects
- rq3_only_same_replacements_projects
- rq3_non_relational_only_replacements_projects
- rq3_repl_hypersql
- rq3_repl_hypersql_projects
- rq3_repl_hypersql_dbms
- rq3_repl_hypersql_redis
- rq3_repl_hypersql_redis_ext
- rq3_repl_hypersql_mysql
- rq3_repl_hypersql_postgresql
- rq3_repl_mysql
- rq3_repl_mysql_projects 
- rq3_repl_mysql_redis
- rq3_repl_hbase
- rq3_repl_hbasel_projects
- rq3_repl_hbase_hypersql
- rq3_apache_camel_hbase
- rq3_replby_redis
- rq3_replby_redis_projects
- rq3_replby_oracle
- rq3_replby_oracle_projects
- rq3_replby_postgresql
- rq3_replby_postgres_projects
- rq3_replby_hypersql
- rq3_replby_hypersql_projects
- rq3_replby_mysql
- rq3_replby_mysql_projects
- rq3_replby_hbase
- rq3_replby_hbase_projects
- rq3_replby_etcd
- rq3_replby_etcd_projects

In [1]:
import sys  
sys.path.insert(1, '..')

In [2]:
import os
import functools
import re

import pandas as pd
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from num2words import num2words

from util import SEQ_PATTERNS_DIR, PAPEROUT_DIR, HISTORICAL_FILE_JOIN_DB
from analysis_helpers import var, relative_var, load_vars
from analysis_helpers import (
    RELATIONAL_ONLY_DBS, NONRELATIONAL_ONLY_DBS,
    MULTIMODEL_RELATIONAL, MULTIMODEL_NONRELATIONAL,
    RELATIONAL_DBS, NONRELATIONAL_DBS, MULTIMODEL_DBS
)

def union_sets(series):
    """Return the union of every set contained in ``series``.

    An empty iterable yields an empty set; the input sets are not modified.
    """
    merged = set()
    for members in series:
        # Build a new set on each step so no input set is mutated.
        merged = merged | members
    return merged

In [3]:
# Load the mined substitution patterns.
df = pd.read_csv(SEQ_PATTERNS_DIR + os.sep + "substitutions.csv")

# Remove meta-rules
meta_rule_names = ["X -> InY & OutX -> Y", "X -> InY -> OutX & Y"]
df = df[~df["Name"].isin(meta_rule_names)].copy()

# Identify DBMSs: bindings look like "X = <dbms>; Y = <dbms>".
# The entries below are regex substitutions used to prettify the raw names.
replacements = {
    "Join_Ignite_NoSql": "Ignite-NoSql",
    "_": " ",
    "GoogleCloudDatastore": "Google Cloud Datastore",
    "SAPAdaptiveServer": "SAP Adaptive Server",
    "SAPSQLAnywhere": "SAP SQL Anywhere",
    "MicrosoftAzureCosmosDB": "Microsoft Azure CosmosDB",
    "FirebaseRealtime": "Firebase Realtime",
}
raw_x = df["Bindings"].str.extract(r'X = (.+)?; Y =', expand=False)
raw_y = df["Bindings"].str.extract(r'Y = (.+)?', expand=False)
df["X"] = raw_x.replace(replacements, regex=True)
df["Y"] = raw_y.replace(replacements, regex=True)

# "Lines" is a "; "-separated list; keep it as a set of (string) identifiers.
df["LinesSet"] = df["Lines"].apply(lambda lines: set(lines.split("; ")))
df.head()

Unnamed: 0,Name,Support,Line Count,Lines,Bindings,Source,X,Y,LinesSet
1,MS_SQL_Server -> InHyperSQL & OutMS_SQL_Server...,0.004292,1,68,X = MS_SQL_Server; Y = HyperSQL,X -> InY & OutX -> Y -v X -v Y,MS SQL Server,HyperSQL,{68}
2,MS_SQL_Server -> InIBM_DB2 & OutMS_SQL_Server ...,0.004292,1,151,X = MS_SQL_Server; Y = IBM_DB2,X -> InY & OutX -> Y -v X -v Y,MS SQL Server,IBM DB2,{151}
3,HyperSQL -> InPostgreSQL & OutHyperSQL -> Post...,0.008584,2,115; 20,X = HyperSQL; Y = PostgreSQL,X -> InY & OutX -> Y -v X -v Y,HyperSQL,PostgreSQL,"{115, 20}"
4,HyperSQL -> InH2 & OutHyperSQL -> H2,0.008584,2,3; 20,X = HyperSQL; Y = H2,X -> InY & OutX -> Y -v X -v Y,HyperSQL,H2,"{20, 3}"
5,HyperSQL -> InIBM_DB2 & OutHyperSQL -> IBM_DB2,0.004292,1,48,X = HyperSQL; Y = IBM_DB2,X -> InY & OutX -> Y -v X -v Y,HyperSQL,IBM DB2,{48}


In [4]:
def categorize_model(dbms):
    """Classify ``dbms`` as "Relational" or "Non Relational".

    A name found in neither RELATIONAL_DBS nor NONRELATIONAL_DBS is printed
    for manual checking and classified as None.
    """
    if dbms in RELATIONAL_DBS:
        return "Relational"
    if dbms in NONRELATIONAL_DBS:
        return "Non Relational"
    print("Check: ", dbms)
    return None
    
# Tag both sides of every replacement (replaced X, replacing Y) with its data model.
for side in ("X", "Y"):
    df[f"{side} Model"] = df[side].apply(categorize_model)

### DBMSs that were replaced by others

In [5]:
df3 = pd.read_excel(HISTORICAL_FILE_JOIN_DB, keep_default_na=False)

# Map the spreadsheet's labels onto the DBMS names used in the patterns.
renomeacoes = {
    'MySQL_Maria DB': 'MySQL',
    'PostgreSQL_ CockroachDB': 'PostgreSQL',
    'MS SQL Server_Microsoft Azure SQL Database': 'MS SQL Server',
    'MS_SQL_Server': 'MS SQL Server',
    'IBM_DB2': 'IBM DB2',
    'Virtuoso_NoSql': 'Virtuoso-NoSql',
    'Virtuoso_Sql': 'Virtuoso-Sql',
    'Ignite_NoSql': 'Ignite-NoSql',
    'Ignite_Sql': 'Ignite-Sql',
    'GoogleCloudFilestore': 'GoogleCloudFirestore',
    'MS_Access': 'MS Access'
}

# Rename the values of the 'Databases' column, then index the sheet by DBMS.
df3 = df3.assign(Databases=df3['Databases'].replace(renomeacoes)).set_index("Databases")

# Transpose so each DBMS becomes a column, then count per DBMS how many of the
# columns from the 4th onwards hold a positive value.
# NOTE(review): presumably those columns are one per project - confirm.
has_positive_value = df3.iloc[:, 3:].T > 0
dbms_counts = has_positive_value.sum().to_dict()

In [6]:
# Per replaced DBMS (X): number of patterns and the union of the lines they cover.
df4 = (
    df.groupby(["X"], as_index=False)
    .agg(replacements=("Y", "count"), projects=("LinesSet", union_sets))
    .rename(columns={"X": "DBMS", "replacements": "# Replacements"})
)
df4["# Projects"] = df4["projects"].apply(len)
# Share relative to the per-DBMS count taken from the historical spreadsheet.
df4["% Projects"] = df4.apply(
    lambda row: row["# Projects"] / dbms_counts[row["DBMS"]],
    axis=1,
)
df4 = df4.sort_values(["# Replacements", "# Projects"], ascending=False)

# Lookup tables reused by later cells.
replaced_by_dbms = df4.set_index("DBMS")
dbms_was_replaced_count = replaced_by_dbms["# Replacements"].to_dict()
dbms_was_replaced_in_projects = replaced_by_dbms["# Projects"].to_dict()
df4

Unnamed: 0,DBMS,# Replacements,projects,# Projects,% Projects
10,HyperSQL,23,"{148, 86, 219, 210, 133, 188, 27, 62, 48, 159,...",19,0.234568
8,HBase,18,"{151, 44, 48, 39, 68}",5,0.208333
20,MySQL,17,"{86, 208, 74, 136, 205, 6, 56, 27, 44, 23, 171...",13,0.094891
7,H2,17,"{169, 219, 55, 205, 160, 62, 194, 44, 100, 171...",12,0.105263
24,PostgreSQL,17,"{169, 220, 74, 148, 59, 159, 13, 133, 76, 23, 33}",11,0.107843
28,SQLite,17,"{171, 163, 39, 51, 112, 156, 27, 33}",8,0.2
17,MS SQL Server,16,"{151, 74, 114, 62, 159, 6, 133, 0, 100, 47, 68}",11,0.164179
0,Cassandra,16,"{5, 69, 59, 47, 68}",5,0.16129
6,Google Cloud Datastore,14,"{133, 201, 44, 59, 212}",5,0.116279
2,Couchbase,13,"{208, 76, 59}",3,0.333333


In [7]:
# Guard the claims made in the paper against the computed table.
most_replaced = max(dbms_was_replaced_count.items(), key=lambda item: item[1])
assert most_replaced == ('HyperSQL', 23), "Paper: 'In summary, we have identified that HyperSQL is the DBMS mostly susceptible to replacements during the projects' history.'"

top_two_replaced = list(df4.set_index("DBMS")["# Replacements"].nlargest(2).index)
assert top_two_replaced == ["HyperSQL", "HBase"], "Paper: 'In a more comprehensive analysis, considering only the replaced DBMS, including the patterns that occurred in only 1 project, we found HyperSQL and HBase among the most susceptible to be replaced.'"

# Fraction of replaced DBMSs with more replacement patterns than projects.
multi_replacement_share = (df4["# Replacements"] > df4["# Projects"]).sum()/len(df4)
assert multi_replacement_share > 0.8, "Paper: 'Additionally, we observed that most replaced DBMSs have experienced more than one replacement in certain projects.'"

In [8]:
columns = ["DBMS", "# Replacements", "# Projects", "% Projects"]
# Headers carry placeholder markers that are swapped for \textbf{...} after rendering.
marked_headers = [f"<#i#>{column}<#f#>" for column in columns]
result = df4[columns].to_latex(index=False, float_format="{:.1%}".format, header=marked_headers)

# Applied in order: bold markers first, then booktabs rules -> \hline.
latex_fixups = [
    (r'<\#i\#>', r'\textbf{'),
    (r'<\#f\#>', r'}'),
    (r'\toprule', r'\hline'),
    (r'\midrule', r'\hline'),
    (r'\bottomrule', r'\hline'),
]
for rendered, wanted in latex_fixups:
    result = result.replace(rendered, wanted)

with open(PAPEROUT_DIR + os.sep + "substitutions.tex", "w") as f:
    f.write(result)

### DBMSs that replaced others

In [9]:
# Per replacing DBMS (Y): number of patterns and the union of the lines they cover.
df5 = (
    df.groupby(["Y"], as_index=False)
    .agg(replacements=("X", "count"), projects=("LinesSet", union_sets))
    .rename(columns={"Y": "DBMS", "replacements": "# Times it Replaced a DBMS"})
)
df5["# Projects"] = df5["projects"].apply(len)
# Share relative to the per-DBMS count taken from the historical spreadsheet.
df5["% Projects"] = df5.apply(
    lambda row: row["# Projects"] / dbms_counts[row["DBMS"]],
    axis=1,
)
df5 = df5.sort_values(["# Times it Replaced a DBMS", "# Projects"], ascending=False)

# Lookup tables reused by later cells.
replacing_by_dbms = df5.set_index("DBMS")
dbms_replaced_count = replacing_by_dbms["# Times it Replaced a DBMS"].to_dict()
dbms_replaced_in_projects = replacing_by_dbms["# Projects"].to_dict()
df5

Unnamed: 0,DBMS,# Times it Replaced a DBMS,projects,# Projects,% Projects
27,Redis,21,"{74, 219, 210, 205, 138, 132, 76, 188, 57, 68,...",22,0.244444
24,Oracle,21,"{148, 201, 208, 138, 13, 147, 76, 39, 136, 48,...",19,0.202128
26,PostgreSQL,20,"{148, 86, 219, 208, 59, 20, 138, 147, 56, 76, ...",24,0.235294
18,MS SQL Server,19,"{169, 220, 208, 147, 132, 76, 27, 39, 33, 92, ...",19,0.283582
21,MongoDB,18,"{74, 210, 138, 147, 133, 69, 112, 76, 44, 113,...",14,0.27451
1,Cassandra,18,"{83, 59, 159, 69, 30, 76, 44, 188, 47, 57, 72}",11,0.354839
9,H2,17,"{73, 208, 74, 210, 163, 76, 100, 72, 188, 39, ...",19,0.166667
22,MySQL,14,"{148, 86, 169, 74, 231, 147, 52, 100, 188, 33,...",18,0.131387
12,HyperSQL,14,"{219, 160, 48, 109, 69, 112, 44, 0, 171, 39, 68}",11,0.135802
23,Neo4j,12,"{74, 44, 59, 159, 72}",5,0.294118


In [10]:
# Both paper statements rest on the same top-3 ranking, so compute it once.
top_three_substitutes = list(df5.set_index("DBMS")["# Times it Replaced a DBMS"].nlargest(3).index)
expected_top_three = ["Redis", "Oracle", "PostgreSQL"]
assert top_three_substitutes == expected_top_three, "Paper: 'From this viewpoint, we discovered the DBMSs most chosen to replace others with are Redis, Oracle, and PostgreSQL.'"
assert top_three_substitutes == expected_top_three, "Paper: 'Also, considering just the DBMS that replaced another one, we found Redis, Oracle, and PostgreSQL among the most used as a replacement.'"

In [11]:
columns = ["DBMS", "# Times it Replaced a DBMS", "# Projects", "% Projects"]

# Hand-written two-row bold header; it replaces everything from \toprule to
# \midrule in the rendered table (backslashes are doubled for re.sub).
header_rows = r"""\\hline
\\multicolumn{1}{l}{\\textbf{}} & \\multicolumn{1}{r}{\\textbf{\\# Times it Replaced}} & \\multicolumn{1}{c}{\\textbf{}} & \\textbf{} \\\\
\\multicolumn{1}{l}{\\textbf{DBMS}} & \\multicolumn{1}{r}{\\textbf{ a DBMS}} & \\multicolumn{1}{c}{\\textbf{\\# Projects}} & \\textbf{\\% Projects} \\\\
\\hline"""

result = df5[columns].to_latex(index=False, float_format="{:.1%}".format)
result = re.sub(r"\\toprule\n.*\n\\midrule", header_rows, result, flags=re.MULTILINE)
result = result.replace(r'\bottomrule', r'\hline')
print(result)
with open(PAPEROUT_DIR + os.sep + "substitutes.tex", "w") as f:
    f.write(result)

\begin{tabular}{lrrr}
\hline
\multicolumn{1}{l}{\textbf{}} & \multicolumn{1}{r}{\textbf{\# Times it Replaced}} & \multicolumn{1}{c}{\textbf{}} & \textbf{} \\
\multicolumn{1}{l}{\textbf{DBMS}} & \multicolumn{1}{r}{\textbf{ a DBMS}} & \multicolumn{1}{c}{\textbf{\# Projects}} & \textbf{\% Projects} \\
\hline
                   Redis &                          21 &          22 &       24.4\% \\
                  Oracle &                          21 &          19 &       20.2\% \\
              PostgreSQL &                          20 &          24 &       23.5\% \\
           MS SQL Server &                          19 &          19 &       28.4\% \\
                 MongoDB &                          18 &          14 &       27.5\% \\
               Cassandra &                          18 &          11 &       35.5\% \\
                      H2 &                          17 &          19 &       16.7\% \\
                   MySQL &                          14 &          18 &       13.1\% 

### Replacement patterns table

Keep only the patterns with support ≥ 3 (i.e., remove rules with support < 3).

In [12]:
# Discard patterns whose "Line Count" (support) is below 3, then order them by
# replaced DBMS and, within each DBMS, by descending support.
below_min_support = df["Line Count"] < 3
df2 = df[~below_min_support].copy()
df2 = df2.sort_values(by=["X", "Line Count"], ascending=[True, False])
df2

Unnamed: 0,Name,Support,Line Count,Lines,Bindings,Source,X,Y,LinesSet,X Model,Y Model
97,Cassandra -> InPostgreSQL -> OutCassandra & Po...,0.012876,3,59; 5; 47,X = Cassandra; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,Cassandra,PostgreSQL,"{5, 47, 59}",Non Relational,Relational
190,Cassandra -> InMS_SQL_Server -> OutCassandra &...,0.012876,3,68; 69; 47,X = Cassandra; Y = MS_SQL_Server,X -> InY -> OutX & Y -v X -v Y,Cassandra,MS SQL Server,"{69, 47, 68}",Non Relational,Relational
214,Couchbase -> InPostgreSQL -> OutCouchbase & Po...,0.012876,3,208; 59; 76,X = Couchbase; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,Couchbase,PostgreSQL,"{208, 76, 59}",Non Relational,Relational
177,HBase -> InHyperSQL -> OutHBase & HyperSQL,0.017167,4,48; 44; 68; 39,X = HBase; Y = HyperSQL,X -> InY -> OutX & Y -v X -v Y,HBase,HyperSQL,"{48, 39, 44, 68}",Non Relational,Relational
120,HBase -> InH2 -> OutHBase & H2,0.012876,3,44; 68; 39,X = HBase; Y = H2,X -> InY -> OutX & Y -v X -v Y,HBase,H2,"{39, 44, 68}",Non Relational,Relational
176,HBase -> InMS_SQL_Server -> OutHBase & MS_SQL_...,0.012876,3,151; 44; 68,X = HBase; Y = MS_SQL_Server,X -> InY -> OutX & Y -v X -v Y,HBase,MS SQL Server,"{151, 44, 68}",Non Relational,Relational
179,HBase -> InRedis -> OutHBase & Redis,0.012876,3,151; 44; 68,X = HBase; Y = Redis,X -> InY -> OutX & Y -v X -v Y,HBase,Redis,"{151, 44, 68}",Non Relational,Non Relational
72,Hazelcast -> InHyperSQL -> OutHazelcast & Hype...,0.012876,3,0; 219; 109,X = Hazelcast; Y = HyperSQL,X -> InY -> OutX & Y -v X -v Y,Hazelcast,HyperSQL,"{219, 109, 0}",Non Relational,Relational
82,HyperSQL -> InRedis -> OutHyperSQL & Redis,0.034335,8,3; 38; 44; 112; 210; 115; 20; 219,X = HyperSQL; Y = Redis,X -> InY -> OutX & Y -v X -v Y,HyperSQL,Redis,"{219, 210, 38, 112, 115, 44, 20, 3}",Relational,Non Relational
48,HyperSQL -> InPostgreSQL -> OutHyperSQL & Post...,0.021459,5,0; 100; 148; 86; 109,X = HyperSQL; Y = PostgreSQL,X -> InY -> OutX & Y -v X -v Y,HyperSQL,PostgreSQL,"{148, 86, 109, 0, 100}",Relational,Relational


In [13]:
# Number of patterns that survived the support filter.
frequent_pattern_count = len(df2)
print("Number of replacements:", var("rq3_replacements", frequent_pattern_count))

Number of replacements: 18


In [14]:
# Build the LaTeX table of frequent replacement patterns: one \hline-delimited
# group of rows per replaced DBMS (X). Only Pattern and Support are emitted
# (the former per-DBMS "Total" column is disabled).
latex_table = [
    r"\begin{tabular}{lr}",
    r"\hline",
    r"\textbf{Pattern} & \textbf{Support} \\",
    r"\hline",
]

# Regex rewrites, applied in this order, that turn a rule name into LaTeX.
name_to_latex = {
    "->" : r"$\\rightarrow$",
    "([^{])In(.+?) ": r"\1\2$_{In}$ ",
    "([^{])In(.+?)$": r"\1\2$_{In}$",
    "([^{])Out(.+?) ": r"\1\2$_{Out}$ ",
    "([^{])Out(.+?)$": r"\1\2$_{Out}$",
    "([^$])_": r"\1 ",
    "&": r"  ",
}

for group, subdf in df2.groupby("X"):
    # Build a new Series instead of `subdf["Name"].replace(..., inplace=True)`:
    # an in-place replace on a selected column is a chained mutation that is
    # silently lost when pandas Copy-on-Write is active.
    latex_names = subdf["Name"].replace(name_to_latex, regex=True)
    for pattern_name, line_count in zip(latex_names, subdf["Line Count"]):
        latex_table.append(" & ".join([pattern_name, str(line_count)]) + r" \\")
    # Raw strings: "\h" and "\e" are invalid escape sequences in normal literals.
    latex_table.append(r"\hline")
latex_table.append(r"\end{tabular}")

with open(PAPEROUT_DIR + os.sep + "patterns.tex", "w") as f:
    f.write("\n".join(latex_table) + "\n")

In [15]:
# The message is a raw string: it quotes LaTeX containing \textbf and \rightarrow,
# whose "\t" and "\r" would otherwise be turned into a tab and a carriage return.
postgresql_to_oracle = df[df["Name"] == "PostgreSQL -> InOracle & OutPostgreSQL -> Oracle"]
assert postgresql_to_oracle["Line Count"].to_list() == [3], r"Paper: 'An example is the pattern \textbf{$PostgreSQL \rightarrow Oracle_{In}$ $PostgreSQL_{Out} \rightarrow$} \textbf{$Oracle$} with support = 3. This indicates that three projects used PostgreSQL, then in a later slice started using Oracle and stopped using PostgreSQL, maintaining  Oracle in a later slice.'"

In [16]:
# The message is a raw string: it quotes LaTeX containing \textbf and \rightarrow,
# whose "\t" and "\r" would otherwise be turned into a tab and a carriage return.
hypersql_to_redis = df[df["Name"] == "HyperSQL -> InRedis -> OutHyperSQL & Redis"]
assert hypersql_to_redis["Line Count"].to_list() == [8], r"Paper: 'The pattern \textbf{$HyperSQL \rightarrow Redis_{In} \rightarrow HyperSQL_{Out}$ $Redis$} with support = 8 indicates that eight projects replaced HyperSQL by Redis.'"

In [17]:
# Support statistics over the filtered patterns.
support = df2["Line Count"]
print("Min support:", var("rq3_min_support", support.min()))
print("Max support:", var("rq3_max_support", support.max()))
distinct_supports = sorted(set(support.to_list()))
print("Support values:", var("rq3_support_values", ', '.join(str(value) for value in distinct_supports)))

Min support: 3
Max support: 8
Support values: 3, 4, 5, 8


### Data

In [18]:
# One row per distinct (X, Y, Source) combination, carrying the union of the
# lines (reported below as projects) in which it was observed.
# The original also bound this frame to `replacements_among_relational`, a
# copy-paste leftover that aliased the grand total under the relational-only name.
total_replacements = df.groupby(["X", "Y", "Source"])["LinesSet"].agg(lines=union_sets)
total_replacements_count = len(total_replacements)
total_replacements_projects_count = len(union_sets(total_replacements['lines']))

print("Total number of replacements", var("rq3_total_replacements", total_replacements_count))
print("Projects with replacements", var("rq3_total_replacements_projects",  total_replacements_projects_count))

Total number of replacements 296
Projects with replacements 67


In [19]:
# Replacements where both the replaced and the replacing DBMS are relational.
both_relational = df["X Model"].eq("Relational") & df["Y Model"].eq("Relational")
replacements_among_relational = (
    df[both_relational]
    .groupby(["X", "Y", "Source"])["LinesSet"]
    .agg(lines=union_sets)
)
print("Replacements among relational:", relative_var("rq3_relational_replacements", len(replacements_among_relational), total_replacements_count))
relational_projects = union_sets(replacements_among_relational['lines'])
print("Projects with replacements among relational:", relative_var("rq3_relational_replacements_projects", len(relational_projects), total_replacements_projects_count))


Replacements among relational: 89 (30.1%)
Projects with replacements among relational: 44 (65.7%)


In [20]:
# Replacements where both the replaced and the replacing DBMS are non relational.
both_nonrelational = df["X Model"].eq("Non Relational") & df["Y Model"].eq("Non Relational")
replacements_among_nonrelational = (
    df[both_nonrelational]
    .groupby(["X", "Y", "Source"])["LinesSet"]
    .agg(lines=union_sets)
)
print("Replacements among non relational:", relative_var("rq3_nonrelational_replacements", len(replacements_among_nonrelational), total_replacements_count))
nonrelational_projects = union_sets(replacements_among_nonrelational['lines'])
print("Projects with replacements among non relational:", relative_var("rq3_nonrelational_replacements_projects", len(nonrelational_projects), total_replacements_projects_count))


Replacements among non relational: 66 (22.3%)
Projects with replacements among non relational: 21 (31.3%)


In [21]:
# Replacements that cross data models, in either direction.
relational_to_nonrelational = df["X Model"].eq("Relational") & df["Y Model"].eq("Non Relational")
nonrelational_to_relational = df["X Model"].eq("Non Relational") & df["Y Model"].eq("Relational")
replacements_among_both = (
    df[relational_to_nonrelational | nonrelational_to_relational]
    .groupby(["X", "Y", "Source"])["LinesSet"]
    .agg(lines=union_sets)
)
print("Replacements among both:", relative_var("rq3_both_replacements", len(replacements_among_both), total_replacements_count))
cross_model_projects = union_sets(replacements_among_both['lines'])
print("Projects with replacements among both:", relative_var("rq3_both_replacements_projects", len(cross_model_projects), total_replacements_projects_count))


Replacements among both: 141 (47.6%)
Projects with replacements among both: 43 (64.2%)


In [22]:
# Replacements that stay within one data model (relational or non relational).
stays_relational = df["X Model"].eq("Relational") & df["Y Model"].eq("Relational")
stays_nonrelational = df["X Model"].eq("Non Relational") & df["Y Model"].eq("Non Relational")
replacements_among_same = (
    df[stays_relational | stays_nonrelational]
    .groupby(["X", "Y", "Source"])["LinesSet"]
    .agg(lines=union_sets)
)
print("Replacements among same:", relative_var("rq3_same_replacements", len(replacements_among_same), total_replacements_count))
same_model_projects = union_sets(replacements_among_same['lines'])
print("Projects with replacements among same:", relative_var("rq3_same_replacements_projects", len(same_model_projects), total_replacements_projects_count))


Replacements among same: 155 (52.4%)
Projects with replacements among same: 56 (83.6%)


In [23]:
# Projects with at least one replacement but none that crosses data models.
all_replacement_projects = union_sets(total_replacements['lines'])
cross_model_replacement_projects = union_sets(replacements_among_both['lines'])
projects_with_only_replacements_of_the_same_model = all_replacement_projects - cross_model_replacement_projects
print("Projects with replacements only among same model:", relative_var("rq3_only_same_replacements_projects", len(projects_with_only_replacements_of_the_same_model), total_replacements_projects_count))


Projects with replacements only among same model: 24 (35.8%)


In [24]:
# Of the same-model-only projects, those that appear in a non relational replacement;
# the percentage is relative to the same-model-only projects.
nonrelational_replacement_projects = union_sets(replacements_among_nonrelational['lines'])
nonrelational_only_replacements = projects_with_only_replacements_of_the_same_model & nonrelational_replacement_projects
same_model_only_count = len(projects_with_only_replacements_of_the_same_model)
print("Projects with replacements only among non relational:", relative_var("rq3_non_relational_only_replacements_projects", len(nonrelational_only_replacements), same_model_only_count))


Projects with replacements only among non relational: 3 (12.5%)


In [25]:
# Support of the Cassandra/PostgreSQL pattern quoted in the paper.
cassandra_postgresql = df[df["Name"] == "Cassandra -> InPostgreSQL -> OutCassandra & PostgreSQL"]
assert cassandra_postgresql["Line Count"].to_list() == [3], "Paper: 'the replacements occur among distinct data models such as Cassandra replacing PostgreSQL in 3 projects'"

In [26]:
# How often, and in how many projects, HyperSQL was replaced.
hypersql_replaced_count = dbms_was_replaced_count["HyperSQL"]
print("HyperSQL replacements:", var("rq3_repl_hypersql", hypersql_replaced_count))
hypersql_replaced_projects = dbms_was_replaced_in_projects["HyperSQL"]
print("HyperSQL replacements in projects:", var("rq3_repl_hypersql_projects", hypersql_replaced_projects))

HyperSQL replacements: 23
HyperSQL replacements in projects: 19


In [27]:
# Number of distinct DBMSs (Y) that replaced HyperSQL. The previous label
# ("HyperSQL replacements in projects") was copied from the cell above and did
# not describe this value; an empty selection now yields 0 instead of IndexError.
hypersql_substitutes = set(df.loc[df["X"] == "HyperSQL", "Y"])
print("Distinct DBMSs that replaced HyperSQL:", var("rq3_repl_hypersql_dbms", len(hypersql_substitutes)))

HyperSQL replacements in projects: 18


In [28]:
# Total support of the HyperSQL -> Redis patterns, as a number and spelled out.
hypersql_to_redis_rows = df[(df["X"] == "HyperSQL") & (df["Y"] == "Redis")]
print("HyperSQL replaced by Redis:", var("rq3_repl_hypersql_redis", hypersql_to_redis_rows["Line Count"].sum()))
print("HyperSQL replaced by Redis (ext):", var("rq3_repl_hypersql_redis_ext", num2words(hypersql_to_redis_rows["Line Count"].sum())))

HyperSQL replaced by Redis: 8
HyperSQL replaced by Redis (ext): eight


In [29]:
# Total support of the HyperSQL -> MySQL patterns.
is_hypersql_to_mysql = (df["X"] == "HyperSQL") & (df["Y"] == "MySQL")
hypersql_to_mysql_support = df[is_hypersql_to_mysql]["Line Count"].sum()
print("HyperSQL replaced by MySQL:", var("rq3_repl_hypersql_mysql", hypersql_to_mysql_support))

HyperSQL replaced by MySQL: 6


In [30]:
# Total support of the HyperSQL -> PostgreSQL patterns.
is_hypersql_to_postgresql = (df["X"] == "HyperSQL") & (df["Y"] == "PostgreSQL")
hypersql_to_postgresql_support = df[is_hypersql_to_postgresql]["Line Count"].sum()
print("HyperSQL replaced by PostgreSQL:", var("rq3_repl_hypersql_postgresql", hypersql_to_postgresql_support))

HyperSQL replaced by PostgreSQL: 7


In [31]:
# There must be no pattern in which Redis (X) is replaced by HyperSQL (Y).
redis_to_hypersql_rows = df[(df["X"] == "Redis") & (df["Y"] == "HyperSQL")]
assert len(redis_to_hypersql_rows) == 0, "Paper: 'In the opposite direction, we did not find a pattern indicating HyperSQL replacing Redis'"

In [32]:
# How often, and in how many projects, MySQL was replaced.
mysql_replaced_count = dbms_was_replaced_count["MySQL"]
print("MySQL replacements:", var("rq3_repl_mysql", mysql_replaced_count))
mysql_replaced_projects = dbms_was_replaced_in_projects["MySQL"]
print("MySQL replacements in projects:", var("rq3_repl_mysql_projects", mysql_replaced_projects))

MySQL replacements: 17
MySQL replacements in projects: 13


In [33]:
# Total support of the MySQL -> Redis patterns, spelled out in words.
is_mysql_to_redis = (df["X"] == "MySQL") & (df["Y"] == "Redis")
mysql_to_redis_support = df[is_mysql_to_redis]["Line Count"].sum()
print("MySQL replaced by Redis:", var("rq3_repl_mysql_redis", num2words(mysql_to_redis_support)))

MySQL replaced by Redis: four


In [34]:
# There must be no pattern in which Redis (X) is replaced by MySQL (Y).
redis_to_mysql_rows = df[(df["X"] == "Redis") & (df["Y"] == "MySQL")]
assert len(redis_to_mysql_rows) == 0, "Paper: 'The only MySQL replacement pattern that occurred frequently enough was its replacement by Redis (3 times), with no replacements in the opposite direction.'"

In [35]:
# How often, and in how many projects, HBase was replaced.
hbase_replaced_count = dbms_was_replaced_count["HBase"]
print("HBase replacements:", var("rq3_repl_hbase", hbase_replaced_count))
hbase_replaced_projects = dbms_was_replaced_in_projects["HBase"]
print("HBase replacements in projects:", var("rq3_repl_hbasel_projects", hbase_replaced_projects))

HBase replacements: 18
HBase replacements in projects: 5


In [36]:
# Total support of the HBase -> HyperSQL patterns, spelled out in words.
is_hbase_to_hypersql = (df["X"] == "HBase") & (df["Y"] == "HyperSQL")
hbase_to_hypersql_support = df[is_hbase_to_hypersql]["Line Count"].sum()
print("HBase replaced by HyperSQL:", var("rq3_repl_hbase_hypersql", num2words(hbase_to_hypersql_support)))

HBase replaced by HyperSQL: four


In [37]:
# Etcd must never appear on the replaced (X) side of a pattern.
etcd_was_replaced = "Etcd" in dbms_was_replaced_count
assert not etcd_was_replaced, "Paper: 'On the other hand, Etcd was not replaced but substituted another DBMS'"

In [38]:
# Locate apache/camel in the project list stored next to the patterns.
SEQUENTIAL_PROJECTS = SEQ_PATTERNS_DIR + os.sep + 'sequencial_project.txt'
with open(SEQUENTIAL_PROJECTS, "r") as projects_file:
    projects_source = projects_file.read()
# NOTE(review): eval() executes whatever the file contains; if the file is a
# plain Python literal, ast.literal_eval would be the safer choice - confirm.
projects = eval(projects_source)
apache_camel_position = projects.index(('apache', 'camel'))
# Kept as a string because the entries of "LinesSet" are strings.
index = str(apache_camel_position)

In [39]:
# Patterns in which HBase is the replaced DBMS and whose lines include apache/camel.
replaces_hbase = df["X"] == "HBase"
covers_apache_camel = df["LinesSet"].apply(lambda lines: index in lines)
hbase_replacement_in_apache_camel = df[replaces_hbase & covers_apache_camel]

In [40]:
# The paper reports 14 HBase substitution patterns for apache/camel.
hbase_pattern_count = len(hbase_replacement_in_apache_camel)
assert hbase_pattern_count == 14, "Paper: 'For example, in the Apache/Camel project, we discovered 14 distinct patterns of HBase substitutions for different DBMSs'"

In [41]:
# Names of the DBMSs (Y) that replaced HBase in apache/camel, joined into an
# English enumeration ("A, B, and C"). groupby returns its keys sorted.
result = list(hbase_replacement_in_apache_camel.groupby("Y")["LinesSet"].agg(lines=union_sets).index)
if len(result) > 1:
    text = ', '.join(result[:-1]) + f", and {result[-1]}"
else:
    # Zero or one name: avoid the IndexError / dangling ", and X" of the plain join.
    text = ''.join(result)
# The previous label ("DBMS with no removals") did not describe this value.
print("DBMSs that replaced HBase in apache/camel:", var("rq3_apache_camel_hbase", text))

DBMS with no removals: ArangoDB, Cassandra, Couchbase, Etcd, H2, HyperSQL, Ignite-NoSql, Influx DB, MS SQL Server, Microsoft Azure CosmosDB, MySQL, PostGIS, PostgreSQL, and Redis


In [42]:
# How often, and in how many projects, Redis replaced another DBMS.
redis_replacing_count = dbms_replaced_count["Redis"]
print("Redis replaces:", var("rq3_replby_redis", redis_replacing_count))
redis_replacing_projects = dbms_replaced_in_projects["Redis"]
print("Redis replaces in projects:", var("rq3_replby_redis_projects", redis_replacing_projects))

Redis replaces: 21
Redis replaces in projects: 22


In [43]:
# How often, and in how many projects, Oracle replaced another DBMS.
oracle_replacing_count = dbms_replaced_count["Oracle"]
print("Oracle replaces:", var("rq3_replby_oracle", oracle_replacing_count))
oracle_replacing_projects = dbms_replaced_in_projects["Oracle"]
print("Oracle replaces in projects:", var("rq3_replby_oracle_projects", oracle_replacing_projects))

Oracle replaces: 21
Oracle replaces in projects: 19


In [44]:
# How often, and in how many projects, PostgreSQL replaced another DBMS.
postgresql_replacing_count = dbms_replaced_count["PostgreSQL"]
print("PostgreSQL replaces:", var("rq3_replby_postgresql", postgresql_replacing_count))
postgresql_replacing_projects = dbms_replaced_in_projects["PostgreSQL"]
print("PostgreSQL replaces in projects:", var("rq3_replby_postgres_projects", postgresql_replacing_projects))

PostgreSQL replaces: 20
PostgreSQL replaces in projects: 24


In [45]:
# How often, and in how many projects, HyperSQL replaced another DBMS.
hypersql_replacing_count = dbms_replaced_count["HyperSQL"]
print("HyperSQL replaces:", var("rq3_replby_hypersql", hypersql_replacing_count))
hypersql_replacing_projects = dbms_replaced_in_projects["HyperSQL"]
print("HyperSQL replaces in projects:", var("rq3_replby_hypersql_projects", hypersql_replacing_projects))

HyperSQL replaces: 14
HyperSQL replaces in projects: 11


In [46]:
# How often, and in how many projects, MySQL replaced another DBMS.
mysql_replacing_count = dbms_replaced_count["MySQL"]
print("MySQL replaces:", var("rq3_replby_mysql", mysql_replacing_count))
mysql_replacing_projects = dbms_replaced_in_projects["MySQL"]
print("MySQL replaces in projects:", var("rq3_replby_mysql_projects", mysql_replacing_projects))

MySQL replaces: 14
MySQL replaces in projects: 18


In [47]:
# How often, and in how many projects, HBase replaced another DBMS.
hbase_replacing_count = dbms_replaced_count["HBase"]
print("HBase replaces:", var("rq3_replby_hbase", hbase_replacing_count))
hbase_replacing_projects = dbms_replaced_in_projects["HBase"]
print("HBase replaces in projects:", var("rq3_replby_hbase_projects", hbase_replacing_projects))

HBase replaces: 11
HBase replaces in projects: 6


In [48]:
# How often, and in how many projects, Etcd replaced another DBMS.
etcd_replacing_count = dbms_replaced_count["Etcd"]
print("Etcd replaces:", var("rq3_replby_etcd", etcd_replacing_count))
etcd_replacing_projects = dbms_replaced_in_projects["Etcd"]
print("Etcd replaces in projects:", var("rq3_replby_etcd_projects", etcd_replacing_projects))

Etcd replaces: 11
Etcd replaces in projects: 3
