# RQ3 - Filter Sequential Patterns (SPMF)

Old code to filter sequential patterns using SPMF results

Uses:

- resources/seq_patterns/output_tam1.txt (from prepare/historical_seqpatterns_format.ipynb **manual**)
- resources/seq_patterns/output_tam3.txt (from prepare/historical_seqpatterns_format.ipynb **manual**)
- resources/seq_patterns/output_tam4_sid.txt (from prepare/historical_seqpatterns_format.ipynb **manual**)
- resources/seq_patterns/sequencial_project.txt (from prepare/historical_seqpatterns_format.ipynb)
- variable rq2_projects_with_dbms_history_total (from rq2/distribution.ipynb)





In [3]:
import sys  
sys.path.insert(1, '..')

In [4]:
import os
import functools

import pandas as pd
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from util import SEQ_PATTERNS_DIR, IMAGES_DIR
from analysis_helpers import var, relative_var, load_vars
from analysis_helpers import (
    RELATIONAL_ONLY_DBS, NONRELATIONAL_ONLY_DBS,
    MULTIMODEL_RELATIONAL, MULTIMODEL_NONRELATIONAL,
    RELATIONAL_DBS, NONRELATIONAL_DBS, MULTIMODEL_DBS
)

In [3]:
# Corpus size N: the variable `rq2_projects_with_dbms_history_total` exported by
# rq2/distribution.ipynb. Used further down to turn absolute supports into
# relative supports when computing the lift measures.
size_corpus = int(load_vars()['rq2_projects_with_dbms_history_total'])

In [4]:
# Size-4 SPMF output with sequence ids. The file is split on '#' only, which
# yields three raw columns (pattern, "SUP: n", "SID: ids"); a multi-character
# regex separator (sep=" | , #" with engine="python") did not work here.
HISTORICAL_OUTPUT_SEQUENCIAL_IN_OUT = os.sep.join([SEQ_PATTERNS_DIR, 'output_tam4_sid.txt'])
df = pd.read_csv(
    HISTORICAL_OUTPUT_SEQUENCIAL_IN_OUT,
    sep='#',
    header=None,
)
df

Unnamed: 0,0,1,2
0,INITNeo4j |,SUP: 4,SID: 72 124 164 204
1,INITNeo4j Neo4j |,SUP: 4,SID: 72 124 164 204
2,INITNeo4j Neo4j | OUTNeo4j |,SUP: 3,SID: 72 124 204
3,INITNeo4j Neo4j | Neo4j |,SUP: 3,SID: 72 124 164
4,INITNeo4j | OUTNeo4j |,SUP: 3,SID: 72 124 204
...,...,...,...
3146550,OUTH2 | INMS_SQL_Server | Redis | Oracle |,SUP: 3,SID: 44 144 168
3146551,OUTH2 | INMS_SQL_Server | Redis | MySQL |,SUP: 3,SID: 44 144 168
3146552,OUTH2 | INMS_SQL_Server | Redis | MS_SQL_Serve...,SUP: 3,SID: 44 144 168
3146553,OUTH2 | INMS_SQL_Server | Redis | PostgreSQL |,SUP: 3,SID: 44 144 168


In [5]:
# Name the three raw columns, then drop the literal "SUP:" / "SID:" prefixes
# that SPMF writes in front of the support count and the sequence-id list.
df.columns = ['Pattern', 'SUP_ABCD', 'SID']
for prefix in ('SUP:', 'SID:'):
    df.replace({prefix: ''}, regex=True, inplace=True)
df

Unnamed: 0,Pattern,SUP_ABCD,SID
0,INITNeo4j |,4,72 124 164 204
1,INITNeo4j Neo4j |,4,72 124 164 204
2,INITNeo4j Neo4j | OUTNeo4j |,3,72 124 204
3,INITNeo4j Neo4j | Neo4j |,3,72 124 164
4,INITNeo4j | OUTNeo4j |,3,72 124 204
...,...,...,...
3146550,OUTH2 | INMS_SQL_Server | Redis | Oracle |,3,44 144 168
3146551,OUTH2 | INMS_SQL_Server | Redis | MySQL |,3,44 144 168
3146552,OUTH2 | INMS_SQL_Server | Redis | MS_SQL_Serve...,3,44 144 168
3146553,OUTH2 | INMS_SQL_Server | Redis | PostgreSQL |,3,44 144 168


In [6]:
# Work on a copy so the parsed SPMF table in `df` is not modified by later cells.
df2 = df.copy()

## Replacement Patterns

Find patterns that have both IN and OUT

In [7]:
# Keep only patterns that mention both an entering (IN...) and a leaving (OUT...) item.
# Note: the plain substring test for 'IN' also matches the 'INIT' prefix.
pattern_has_in = df2['Pattern'].str.contains('IN')
pattern_has_out = df2['Pattern'].str.contains('OUT')
select_pattern = df2[pattern_has_in & pattern_has_out]
select_pattern.head()

Unnamed: 0,Pattern,SUP_ABCD,SID
2,INITNeo4j Neo4j | OUTNeo4j |,3,72 124 204
4,INITNeo4j | OUTNeo4j |,3,72 124 204
64,OUTSQLite OUTRedis | INRedis |,3,39 51 144
65,OUTSQLite OUTRedis | INRedis Redis |,3,39 51 144
67,OUTSQLite OUTRedis | MS_SQL_Server INRedis |,3,39 51 144


Explode Pattern column into A, AB, B, BC, C, CD, D, DE, where:
- A = First DBMS
- AB = Separator
- B = Second DBMS
- BC = Separator
- C = Third DBMS
- CD = Separator
- D = Fourth DBMS
- DE = Separator

The separators can start with:
- `/` = previous and next DBMSs are in the same slice
- `|` = previous and next DBMSs are in different slices
-- if the separator is `|#` or the second DBMS is None, it is the end of the rule


In [8]:
# Split every pattern into alternating item / separator columns.
df3 = select_pattern.copy()

# Step 1: whitespace between two word characters separates items of the SAME
#         slice -> mark it as '#/#'.
# Step 2: every remaining whitespace run (the blanks around '|' and the trailing
#         blank) becomes a plain '#' delimiter.
# The result is assigned back to the column instead of calling an inplace method
# on `df3['Pattern']`: inplace updates through a column selection are not
# propagated to the frame when pandas Copy-on-Write is active.
# The previous second regex, r"(?<=|)\s+(?=|)", had unescaped '|' inside the
# lookarounds (i.e. empty alternatives), so it was already equivalent to r"\s+".
df3['Pattern'] = (
    df3['Pattern']
    .replace({r"(?<=\w)\s+(?=\w)": '#/#'}, regex=True)
    .replace({r"\s+": '#'}, regex=True)
)

# At most 8 pieces: A, AB, B, BC, C, CD, D, DE (n=7 splits).
df3[['A', 'AB', 'B', 'BC', 'C', 'CD', 'D', 'DE']] = df3['Pattern'].str.split("[#]", expand=True, n=7)
df3

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
2,INITNeo4j#/#Neo4j#|#OUTNeo4j#|#,3,72 124 204,INITNeo4j,/,Neo4j,|,OUTNeo4j,|,,
4,INITNeo4j#|#OUTNeo4j#|#,3,72 124 204,INITNeo4j,|,OUTNeo4j,|,,,,
64,OUTSQLite#/#OUTRedis#|#INRedis#|#,3,39 51 144,OUTSQLite,/,OUTRedis,|,INRedis,|,,
65,OUTSQLite#/#OUTRedis#|#INRedis#/#Redis#|#,3,39 51 144,OUTSQLite,/,OUTRedis,|,INRedis,/,Redis,|#
67,OUTSQLite#/#OUTRedis#|#MS_SQL_Server#/#INRedis#|#,3,39 51 144,OUTSQLite,/,OUTRedis,|,MS_SQL_Server,/,INRedis,|#
...,...,...,...,...,...,...,...,...,...,...,...
3146550,OUTH2#|#INMS_SQL_Server#|#Redis#|#Oracle#|#,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,Redis,|,Oracle,|#
3146551,OUTH2#|#INMS_SQL_Server#|#Redis#|#MySQL#|#,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,Redis,|,MySQL,|#
3146552,OUTH2#|#INMS_SQL_Server#|#Redis#|#MS_SQL_Serve...,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,Redis,|,MS_SQL_Server,|#
3146553,OUTH2#|#INMS_SQL_Server#|#Redis#|#PostgreSQL#|#,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,Redis,|,PostgreSQL,|#


Find patterns:
- A | B / C | D
- A | B | C / D

In [9]:
# Keep the two slice layouts of interest:
#   A | B / C | D   (B and C share a slice)
#   A | B | C / D   (C and D share a slice)
a_then_new_slice = df3['AB'].str.startswith('|')
layout_b_with_c = a_then_new_slice & df3['BC'].str.startswith('/') & df3['CD'].str.startswith('|')
layout_c_with_d = a_then_new_slice & df3['BC'].str.startswith('|') & df3['CD'].str.startswith('/')
df4 = df3[layout_b_with_c | layout_c_with_d]
df4

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
129,OUTSQLite#|#HyperSQL#/#MySQL#|#INRedis#|#,3,39 51 144,OUTSQLite,|,HyperSQL,/,MySQL,|,INRedis,|#
133,OUTSQLite#|#HyperSQL#/#INRedis#|#,3,39 51 144,OUTSQLite,|,HyperSQL,/,INRedis,|,,
135,OUTSQLite#|#HyperSQL#/#INRedis#|#MS_SQL_Server#|#,3,39 51 144,OUTSQLite,|,HyperSQL,/,INRedis,|,MS_SQL_Server,|#
136,OUTSQLite#|#HyperSQL#/#INRedis#|#HyperSQL#|#,3,39 51 144,OUTSQLite,|,HyperSQL,/,INRedis,|,HyperSQL,|#
137,OUTSQLite#|#HyperSQL#/#INRedis#|#Redis#|#,3,39 51 144,OUTSQLite,|,HyperSQL,/,INRedis,|,Redis,|#
...,...,...,...,...,...,...,...,...,...,...,...
3146523,OUTH2#|#INMS_SQL_Server#|#MySQL#/#PostgreSQL#|#,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,MySQL,/,PostgreSQL,|#
3146524,OUTH2#|#INMS_SQL_Server#|#MySQL#/#Redis#|#,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,MySQL,/,Redis,|#
3146532,OUTH2#|#INMS_SQL_Server#|#MS_SQL_Server#/#Post...,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,MS_SQL_Server,/,PostgreSQL,|#
3146533,OUTH2#|#INMS_SQL_Server#|#MS_SQL_Server#/#Redi...,3,44 144 168,OUTH2,|,INMS_SQL_Server,|,MS_SQL_Server,/,Redis,|#


Filtering the patterns (X-> INY OUTX -> Y)

In [10]:
# Rule shape  X | INY / OUTX | Y :
#   - A is a plain item (no IN/OUT prefix),
#   - B enters (IN...), C leaves (OUT...), and they share a slice,
#   - A's name is contained in C, D's name is contained in B (substring test).
sp2_layout = (df4['BC'] == '/') & (df4['CD'] == '|') & (df4['D'] != '')
sp2_a_plain = ~(df4['A'].str.startswith('IN') | df4['A'].str.startswith('OUT'))
sp2_roles = df4['B'].str.startswith('IN') & df4['C'].str.startswith('OUT')
sp2_a_in_c = df4.apply(lambda rule: str(rule['A']) in str(rule['C']), axis=1)
sp2_d_in_b = df4.apply(lambda rule: str(rule['D']) in str(rule['B']), axis=1)
select_pattern2 = df4[sp2_layout & sp2_a_plain & sp2_roles & sp2_a_in_c & sp2_d_in_b]
select_pattern2

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
1693062,PostgreSQL#|#INOracle#/#OUTPostgreSQL#|#Oracle#|#,3,13 23 76,PostgreSQL,|,INOracle,/,OUTPostgreSQL,|,Oracle,|#


Filtering the patterns (X-> OUTX INY -> Y)

In [11]:
# Rule shape  X | OUTX / INY | Y :
#   - A is a plain item (no IN/OUT prefix),
#   - B leaves (OUT...), C enters (IN...), and they share a slice,
#   - A's name is contained in B, D's name is contained in C (substring test).
sp3_layout = (df4['BC'] == '/') & (df4['CD'] == '|') & (df4['D'] != '')
sp3_a_plain = ~(df4['A'].str.startswith('IN') | df4['A'].str.startswith('OUT'))
sp3_roles = df4['B'].str.startswith('OUT') & df4['C'].str.startswith('IN')
sp3_a_in_b = df4.apply(lambda rule: str(rule['A']) in str(rule['B']), axis=1)
sp3_d_in_c = df4.apply(lambda rule: str(rule['D']) in str(rule['C']), axis=1)
select_pattern3 = df4[sp3_layout & sp3_a_plain & sp3_roles & sp3_a_in_b & sp3_d_in_c]
select_pattern3

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE


Filtering the patterns (X-> INY -> OUTX Y)

In [12]:
# Rule shape  X | INY | OUTX / Y :
#   - A is a plain item (no IN/OUT prefix),
#   - B enters (IN...) in its own slice, C leaves (OUT...) in the same slice as D,
#   - A's name is contained in C, D's name is contained in B (substring test).
sp4_layout = (df4['BC'] == '|') & (df4['CD'] == '/') & (df4['D'] != '')
sp4_a_plain = ~(df4['A'].str.startswith('IN') | df4['A'].str.startswith('OUT'))
sp4_roles = df4['B'].str.startswith('IN') & df4['C'].str.startswith('OUT')
sp4_a_in_c = df4.apply(lambda rule: str(rule['A']) in str(rule['C']), axis=1)
sp4_d_in_b = df4.apply(lambda rule: str(rule['D']) in str(rule['B']), axis=1)
select_pattern4 = df4[sp4_layout & sp4_a_plain & sp4_roles & sp4_a_in_c & sp4_d_in_b]
select_pattern4

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
180395,HyperSQL#|#INPostgreSQL#|#OUTHyperSQL#/#Postgr...,5,0 86 100 109 148,HyperSQL,|,INPostgreSQL,|,OUTHyperSQL,/,PostgreSQL,|#
203070,HyperSQL#|#INRedis#|#OUTHyperSQL#/#Redis#|#,8,3 20 38 44 112 115 210 219,HyperSQL,|,INRedis,|,OUTHyperSQL,/,Redis,|#
231384,HyperSQL#|#INMySQL#|#OUTHyperSQL#/#MySQL#|#,5,48 86 100 148 188,HyperSQL,|,INMySQL,|,OUTHyperSQL,/,MySQL,|#
239703,HyperSQL#|#INMongoDB#|#OUTHyperSQL#/#MongoDB#|#,4,44 112 133 210,HyperSQL,|,INMongoDB,|,OUTHyperSQL,/,MongoDB,|#
245867,HyperSQL#|#INH2#|#OUTHyperSQL#/#H2#|#,3,100 188 210,HyperSQL,|,INH2,|,OUTHyperSQL,/,H2,|#
795513,Oracle#|#INMySQL#|#OUTOracle#/#MySQL#|#,3,23 52 231,Oracle,|,INMySQL,|,OUTOracle,/,MySQL,|#
935925,MySQL#|#INRedis#|#OUTMySQL#/#Redis#|#,3,57 136 212,MySQL,|,INRedis,|,OUTMySQL,/,Redis,|#
1755103,HBase#|#INRedis#|#OUTHBase#/#Redis#|#,3,44 68 151,HBase,|,INRedis,|,OUTHBase,/,Redis,|#


Filtering the patterns (X-> INY -> Y OUTX)

In [13]:
# Rule shape  X | INY | Y / OUTX :
#   - A is a plain item (no IN/OUT prefix),
#   - B enters (IN...) in its own slice, D leaves (OUT...) in the same slice as C,
#   - C differs from B but its name is contained in B, A's name is contained in D.
#
# Every mask is built from `df4`, the frame that is actually being filtered.
# The masks used to be computed on `df3` (a superset of `df4`), which made pandas
# realign the boolean index and ran the two row-wise `apply` calls over the whole
# of `df3`; the selected rows are the same.
select_pattern5 = df4[((df4['BC'] == '|') & (df4['CD'] == '/') & (df4['D'] != '')
& (~df4['A'].str.startswith('IN') & ~df4['A'].str.startswith('OUT'))
& (df4['B'].str.startswith('IN') & df4['D'].str.startswith('OUT'))
& df4.apply(lambda x: str(x['B']) != str(x['C']), axis=1)
& df4.apply(lambda x: str(x['C']) in str(x['B']), axis=1)
& df4.apply(lambda x: str(x['A']) in str(x['D']), axis=1))]
select_pattern5

  


Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
440952,Hazelcast#|#INHyperSQL#|#HyperSQL#/#OUTHazelca...,3,0 109 219,Hazelcast,|,INHyperSQL,|,HyperSQL,/,OUTHazelcast,|#
1584378,SQLite#|#INMS_SQL_Server#|#MS_SQL_Server#/#OUT...,3,27 39 112,SQLite,|,INMS_SQL_Server,|,MS_SQL_Server,/,OUTSQLite,|#
1586699,SQLite#|#INH2#|#H2#/#OUTSQLite#|#,4,39 156 163 171,SQLite,|,INH2,|,H2,/,OUTSQLite,|#
1767968,HBase#|#INHyperSQL#|#HyperSQL#/#OUTHBase#|#,4,39 44 48 68,HBase,|,INHyperSQL,|,HyperSQL,/,OUTHBase,|#
1770664,HBase#|#INH2#|#H2#/#OUTHBase#|#,3,39 44 68,HBase,|,INH2,|,H2,/,OUTHBase,|#
1773561,HBase#|#INMS_SQL_Server#|#MS_SQL_Server#/#OUTH...,3,44 68 151,HBase,|,INMS_SQL_Server,|,MS_SQL_Server,/,OUTHBase,|#
1811054,Cassandra#|#INPostgreSQL#|#PostgreSQL#/#OUTCas...,3,5 47 59,Cassandra,|,INPostgreSQL,|,PostgreSQL,/,OUTCassandra,|#
1853119,Cassandra#|#INMS_SQL_Server#|#MS_SQL_Server#/#...,3,47 68 69,Cassandra,|,INMS_SQL_Server,|,MS_SQL_Server,/,OUTCassandra,|#
1860042,Couchbase#|#INPostgreSQL#|#PostgreSQL#/#OUTCou...,3,59 76 208,Couchbase,|,INPostgreSQL,|,PostgreSQL,/,OUTCouchbase,|#


Join all select patterns

In [14]:
# Stack the four rule selections into one table and drop rows selected more than once.
selected_rule_frames = [select_pattern2, select_pattern3, select_pattern4, select_pattern5]
df_concat = pd.concat(selected_rule_frames)
df_concat = df_concat.drop_duplicates()
df_concat

Unnamed: 0,Pattern,SUP_ABCD,SID,A,AB,B,BC,C,CD,D,DE
1693062,PostgreSQL#|#INOracle#/#OUTPostgreSQL#|#Oracle#|#,3,13 23 76,PostgreSQL,|,INOracle,/,OUTPostgreSQL,|,Oracle,|#
180395,HyperSQL#|#INPostgreSQL#|#OUTHyperSQL#/#Postgr...,5,0 86 100 109 148,HyperSQL,|,INPostgreSQL,|,OUTHyperSQL,/,PostgreSQL,|#
203070,HyperSQL#|#INRedis#|#OUTHyperSQL#/#Redis#|#,8,3 20 38 44 112 115 210 219,HyperSQL,|,INRedis,|,OUTHyperSQL,/,Redis,|#
231384,HyperSQL#|#INMySQL#|#OUTHyperSQL#/#MySQL#|#,5,48 86 100 148 188,HyperSQL,|,INMySQL,|,OUTHyperSQL,/,MySQL,|#
239703,HyperSQL#|#INMongoDB#|#OUTHyperSQL#/#MongoDB#|#,4,44 112 133 210,HyperSQL,|,INMongoDB,|,OUTHyperSQL,/,MongoDB,|#
245867,HyperSQL#|#INH2#|#OUTHyperSQL#/#H2#|#,3,100 188 210,HyperSQL,|,INH2,|,OUTHyperSQL,/,H2,|#
795513,Oracle#|#INMySQL#|#OUTOracle#/#MySQL#|#,3,23 52 231,Oracle,|,INMySQL,|,OUTOracle,/,MySQL,|#
935925,MySQL#|#INRedis#|#OUTMySQL#/#Redis#|#,3,57 136 212,MySQL,|,INRedis,|,OUTMySQL,/,Redis,|#
1755103,HBase#|#INRedis#|#OUTHBase#/#Redis#|#,3,44 68 151,HBase,|,INRedis,|,OUTHBase,/,Redis,|#
440952,Hazelcast#|#INHyperSQL#|#HyperSQL#/#OUTHazelca...,3,0 109 219,Hazelcast,|,INHyperSQL,|,HyperSQL,/,OUTHazelcast,|#


In [15]:
len(df_concat)

18

Load the size-1 patterns to look up the individual support of each item

In [16]:
# Size-1 SPMF output: one item per line with its support ("<item> | #SUP: n").
df5 = pd.read_csv(SEQ_PATTERNS_DIR + os.sep + 'output_tam1.txt', sep='#', header=None)

# Name the columns and drop the literal "SUP:" prefix from the support column.
df5.columns = ['Pattern_v1', 'SUP_v1']
df5 = df5.replace({'SUP:': ''}, regex=True)

# Strip the blanks and the trailing '|' slice terminator so that the item name
# can be compared with columns A-D of the rule table.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, in which case the character class
# would match nothing and every later support lookup would miss.
df5['Pattern_v1'] = df5['Pattern_v1'].str.replace(r'[ |]', '', regex=True)
df5

  


Unnamed: 0,Pattern_v1,SUP_v1
0,INITNeo4j,4
1,OUTSQLite,14
2,SapHana,11
3,Realm,4
4,INGoogleCloudFirestore,3
...,...,...
116,INITFirebird,4
117,OUTHBase,10
118,INArangoDB,5
119,INSAPAdaptiveServer,11


Inserting individual support

In [17]:
# Add a '0' placeholder support column right after each of the item columns A-D.
# The positions account for the columns inserted before them.
df8 = df_concat.copy()
for position, support_column in ((4, "SUP_A"), (7, "SUP_B"), (10, "SUP_C"), (13, "SUP_D")):
    df8.insert(position, support_column, '0', False)
df8

Unnamed: 0,Pattern,SUP_ABCD,SID,A,SUP_A,AB,B,SUP_B,BC,C,SUP_C,CD,D,SUP_D,DE
1693062,PostgreSQL#|#INOracle#/#OUTPostgreSQL#|#Oracle#|#,3,13 23 76,PostgreSQL,0,|,INOracle,0,/,OUTPostgreSQL,0,|,Oracle,0,|#
180395,HyperSQL#|#INPostgreSQL#|#OUTHyperSQL#/#Postgr...,5,0 86 100 109 148,HyperSQL,0,|,INPostgreSQL,0,|,OUTHyperSQL,0,/,PostgreSQL,0,|#
203070,HyperSQL#|#INRedis#|#OUTHyperSQL#/#Redis#|#,8,3 20 38 44 112 115 210 219,HyperSQL,0,|,INRedis,0,|,OUTHyperSQL,0,/,Redis,0,|#
231384,HyperSQL#|#INMySQL#|#OUTHyperSQL#/#MySQL#|#,5,48 86 100 148 188,HyperSQL,0,|,INMySQL,0,|,OUTHyperSQL,0,/,MySQL,0,|#
239703,HyperSQL#|#INMongoDB#|#OUTHyperSQL#/#MongoDB#|#,4,44 112 133 210,HyperSQL,0,|,INMongoDB,0,|,OUTHyperSQL,0,/,MongoDB,0,|#
245867,HyperSQL#|#INH2#|#OUTHyperSQL#/#H2#|#,3,100 188 210,HyperSQL,0,|,INH2,0,|,OUTHyperSQL,0,/,H2,0,|#
795513,Oracle#|#INMySQL#|#OUTOracle#/#MySQL#|#,3,23 52 231,Oracle,0,|,INMySQL,0,|,OUTOracle,0,/,MySQL,0,|#
935925,MySQL#|#INRedis#|#OUTMySQL#/#Redis#|#,3,57 136 212,MySQL,0,|,INRedis,0,|,OUTMySQL,0,/,Redis,0,|#
1755103,HBase#|#INRedis#|#OUTHBase#/#Redis#|#,3,44 68 151,HBase,0,|,INRedis,0,|,OUTHBase,0,/,Redis,0,|#
440952,Hazelcast#|#INHyperSQL#|#HyperSQL#/#OUTHazelca...,3,0 109 219,Hazelcast,0,|,INHyperSQL,0,|,HyperSQL,0,/,OUTHazelcast,0,|#


Look up the individual support of items A, B, C and D and write it to the columns SUP_A, SUP_B, SUP_C and SUP_D

In [18]:
# Fill SUP_A..SUP_D with the support of the matching size-1 pattern.
#
# A dictionary lookup replaces the former nested `iterrows` loops (one full pass
# over `df5` per rule). The old guards of the form `x != ' ' or x != ''` were
# always true, so they are dropped without changing the result.
# `dict(zip(...))` keeps the LAST occurrence of a repeated item, which is the
# value the old loop ended up with as well.
support_by_item = dict(zip(df5['Pattern_v1'], df5['SUP_v1']))

for item_column, support_column in (('A', 'SUP_A'), ('B', 'SUP_B'), ('C', 'SUP_C'), ('D', 'SUP_D')):
    looked_up = df8[item_column].map(support_by_item)
    # Items without a size-1 entry keep their current value (the '0' placeholder).
    df8[support_column] = looked_up.where(looked_up.notna(), df8[support_column])

In [19]:
df8

Unnamed: 0,Pattern,SUP_ABCD,SID,A,SUP_A,AB,B,SUP_B,BC,C,SUP_C,CD,D,SUP_D,DE
1693062,PostgreSQL#|#INOracle#/#OUTPostgreSQL#|#Oracle#|#,3,13 23 76,PostgreSQL,102,|,INOracle,76,/,OUTPostgreSQL,25,|,Oracle,94,|#
180395,HyperSQL#|#INPostgreSQL#|#OUTHyperSQL#/#Postgr...,5,0 86 100 109 148,HyperSQL,81,|,INPostgreSQL,84,|,OUTHyperSQL,33,/,PostgreSQL,102,|#
203070,HyperSQL#|#INRedis#|#OUTHyperSQL#/#Redis#|#,8,3 20 38 44 112 115 210 219,HyperSQL,81,|,INRedis,83,|,OUTHyperSQL,33,/,Redis,90,|#
231384,HyperSQL#|#INMySQL#|#OUTHyperSQL#/#MySQL#|#,5,48 86 100 148 188,HyperSQL,81,|,INMySQL,89,|,OUTHyperSQL,33,/,MySQL,137,|#
239703,HyperSQL#|#INMongoDB#|#OUTHyperSQL#/#MongoDB#|#,4,44 112 133 210,HyperSQL,81,|,INMongoDB,43,|,OUTHyperSQL,33,/,MongoDB,51,|#
245867,HyperSQL#|#INH2#|#OUTHyperSQL#/#H2#|#,3,100 188 210,HyperSQL,81,|,INH2,88,|,OUTHyperSQL,33,/,H2,114,|#
795513,Oracle#|#INMySQL#|#OUTOracle#/#MySQL#|#,3,23 52 231,Oracle,94,|,INMySQL,89,|,OUTOracle,23,/,MySQL,137,|#
935925,MySQL#|#INRedis#|#OUTMySQL#/#Redis#|#,3,57 136 212,MySQL,137,|,INRedis,83,|,OUTMySQL,41,/,Redis,90,|#
1755103,HBase#|#INRedis#|#OUTHBase#/#Redis#|#,3,44 68 151,HBase,24,|,INRedis,83,|,OUTHBase,10,/,Redis,90,|#
440952,Hazelcast#|#INHyperSQL#|#HyperSQL#/#OUTHazelca...,3,0 109 219,Hazelcast,34,|,INHyperSQL,52,|,HyperSQL,81,/,OUTHazelcast,17,|#


In [20]:
# Remove the temporary '#' delimiters from the displayed pattern text.
# Assigned back to the column rather than using an inplace method on
# `df8['Pattern']`: inplace updates through a column selection are not
# propagated to the frame when pandas Copy-on-Write is active.
df8['Pattern'] = df8['Pattern'].replace({'#': ''}, regex=True)
df8

Unnamed: 0,Pattern,SUP_ABCD,SID,A,SUP_A,AB,B,SUP_B,BC,C,SUP_C,CD,D,SUP_D,DE
1693062,PostgreSQL|INOracle/OUTPostgreSQL|Oracle|,3,13 23 76,PostgreSQL,102,|,INOracle,76,/,OUTPostgreSQL,25,|,Oracle,94,|#
180395,HyperSQL|INPostgreSQL|OUTHyperSQL/PostgreSQL|,5,0 86 100 109 148,HyperSQL,81,|,INPostgreSQL,84,|,OUTHyperSQL,33,/,PostgreSQL,102,|#
203070,HyperSQL|INRedis|OUTHyperSQL/Redis|,8,3 20 38 44 112 115 210 219,HyperSQL,81,|,INRedis,83,|,OUTHyperSQL,33,/,Redis,90,|#
231384,HyperSQL|INMySQL|OUTHyperSQL/MySQL|,5,48 86 100 148 188,HyperSQL,81,|,INMySQL,89,|,OUTHyperSQL,33,/,MySQL,137,|#
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,HyperSQL,81,|,INMongoDB,43,|,OUTHyperSQL,33,/,MongoDB,51,|#
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,HyperSQL,81,|,INH2,88,|,OUTHyperSQL,33,/,H2,114,|#
795513,Oracle|INMySQL|OUTOracle/MySQL|,3,23 52 231,Oracle,94,|,INMySQL,89,|,OUTOracle,23,/,MySQL,137,|#
935925,MySQL|INRedis|OUTMySQL/Redis|,3,57 136 212,MySQL,137,|,INRedis,83,|,OUTMySQL,41,/,Redis,90,|#
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,HBase,24,|,INRedis,83,|,OUTHBase,10,/,Redis,90,|#
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,Hazelcast,34,|,INHyperSQL,52,|,HyperSQL,81,/,OUTHazelcast,17,|#


In [21]:
#Generating excel with all INIT patterns
#df8.to_excel(SEQ_PATTERNS_DIR + os.sep +"testes/pattern_supABCD_general_selection10.xlsx",index=True)

Open patterns size 3 to research combination supports

In [22]:
# Patterns of up to three items, used for the combined supports of B,C and B,C,D.
# The size-3 SPMF output has to be generated beforehand.
size3_output_path = os.sep.join([SEQ_PATTERNS_DIR, 'output_tam3.txt'])
df10 = pd.read_csv(size3_output_path, sep='#', header=None)

# Name the columns and drop the literal "SUP:" prefix from the support column.
df10.columns = ['Pattern_v2', 'SUP_v2']
df10.replace({'SUP:': ''}, regex=True, inplace=True)
df10

Unnamed: 0,Pattern_v2,SUP_v2
0,INITNeo4j |,4
1,INITNeo4j Neo4j |,4
2,INITNeo4j Neo4j | OUTNeo4j |,3
3,INITNeo4j Neo4j | Neo4j |,3
4,INITNeo4j | OUTNeo4j |,3
...,...,...
150576,OUTH2 | INMS_SQL_Server | Oracle |,3
150577,OUTH2 | INMS_SQL_Server | MySQL |,3
150578,OUTH2 | INMS_SQL_Server | MS_SQL_Server |,3
150579,OUTH2 | INMS_SQL_Server | PostgreSQL |,3


Formatting the dataframe of patterns with up to three items

In [23]:
# Bring the size-3 patterns to the same textual form as the rule table:
# items of the same slice are joined with '/', and all remaining blanks are
# removed so that slices end directly with '|'.
# Assigned back to the column rather than using an inplace method on
# `df10['Pattern_v2']`: inplace updates through a column selection are not
# propagated to the frame when pandas Copy-on-Write is active.
df10['Pattern_v2'] = (
    df10['Pattern_v2']
    .replace({r"(?<=\w)\s+(?=\w)": '/'}, regex=True)
    .str.replace(' ', '', regex=False)
)
df10

Unnamed: 0,Pattern_v2,SUP_v2
0,INITNeo4j|,4
1,INITNeo4j/Neo4j|,4
2,INITNeo4j/Neo4j|OUTNeo4j|,3
3,INITNeo4j/Neo4j|Neo4j|,3
4,INITNeo4j|OUTNeo4j|,3
...,...,...
150576,OUTH2|INMS_SQL_Server|Oracle|,3
150577,OUTH2|INMS_SQL_Server|MySQL|,3
150578,OUTH2|INMS_SQL_Server|MS_SQL_Server|,3
150579,OUTH2|INMS_SQL_Server|PostgreSQL|,3


In [24]:
# Keep patterns that start with an entering item (IN..., but not INIT...)
# or that mention a leaving item (OUT...) anywhere.
starts_with_in = df10['Pattern_v2'].str.startswith('IN')
starts_with_init = df10['Pattern_v2'].str.startswith('INIT')
mentions_out = df10['Pattern_v2'].str.contains('OUT')
df10 = df10[(starts_with_in & ~starts_with_init) | mentions_out]
df10

Unnamed: 0,Pattern_v2,SUP_v2
2,INITNeo4j/Neo4j|OUTNeo4j|,3
4,INITNeo4j|OUTNeo4j|,3
6,OUTSQLite|,14
7,OUTSQLite/PostgreSQL|,5
8,OUTSQLite/PostgreSQL|H2|,4
...,...,...
150576,OUTH2|INMS_SQL_Server|Oracle|,3
150577,OUTH2|INMS_SQL_Server|MySQL|,3
150578,OUTH2|INMS_SQL_Server|MS_SQL_Server|,3
150579,OUTH2|INMS_SQL_Server|PostgreSQL|,3


In [25]:
df13 = df8.copy()

# Placeholder columns for the combined supports, placed right after SID.
for position, support_column in enumerate(('SUP_BC', 'SUP_BCD'), start=3):
    df13.insert(position, support_column, '0', False)

# Remove blank spaces from the pattern text and from item D.
for text_column in ('Pattern', 'D'):
    df13[text_column] = df13[text_column].str.replace(' ','')

# The lookup below writes string supports into these columns, so make sure
# they hold strings.
for support_column in ('SUP_BC', 'SUP_BCD'):
    df13[support_column] = df13[support_column].astype(str)

df13

Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,AB,B,SUP_B,BC,C,SUP_C,CD,D,SUP_D,DE
1693062,PostgreSQL|INOracle/OUTPostgreSQL|Oracle|,3,13 23 76,0,0,PostgreSQL,102,|,INOracle,76,/,OUTPostgreSQL,25,|,Oracle,94,|#
180395,HyperSQL|INPostgreSQL|OUTHyperSQL/PostgreSQL|,5,0 86 100 109 148,0,0,HyperSQL,81,|,INPostgreSQL,84,|,OUTHyperSQL,33,/,PostgreSQL,102,|#
203070,HyperSQL|INRedis|OUTHyperSQL/Redis|,8,3 20 38 44 112 115 210 219,0,0,HyperSQL,81,|,INRedis,83,|,OUTHyperSQL,33,/,Redis,90,|#
231384,HyperSQL|INMySQL|OUTHyperSQL/MySQL|,5,48 86 100 148 188,0,0,HyperSQL,81,|,INMySQL,89,|,OUTHyperSQL,33,/,MySQL,137,|#
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,0,0,HyperSQL,81,|,INMongoDB,43,|,OUTHyperSQL,33,/,MongoDB,51,|#
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,0,0,HyperSQL,81,|,INH2,88,|,OUTHyperSQL,33,/,H2,114,|#
795513,Oracle|INMySQL|OUTOracle/MySQL|,3,23 52 231,0,0,Oracle,94,|,INMySQL,89,|,OUTOracle,23,/,MySQL,137,|#
935925,MySQL|INRedis|OUTMySQL/Redis|,3,57 136 212,0,0,MySQL,137,|,INRedis,83,|,OUTMySQL,41,/,Redis,90,|#
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,0,0,HBase,24,|,INRedis,83,|,OUTHBase,10,/,Redis,90,|#
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,0,0,Hazelcast,34,|,INHyperSQL,52,|,HyperSQL,81,/,OUTHazelcast,17,|#


Look up the combined supports and write them to the columns SUP_BC and SUP_BCD

In [26]:
# Fill SUP_BC and SUP_BCD (needed for the lift measures) from the size-3 patterns.
#
# A dictionary keyed by pattern text replaces the former nested `iterrows`
# loops, which scanned all of `df10` once per rule. The old guards of the form
# `x is not None or x != ' ' or x != ''` were always true and are dropped.
# Each dictionary entry remembers the row position so that, if more than one
# candidate matches, the candidate found LAST in `df10` wins - the same outcome
# as the sequential overwrite of the old loop.

def _latest_support(pattern_text, candidates, seen):
    """Return the support of the matching candidate that occurs last in `df10`.

    pattern_text -- the rule text the candidate has to be consistent with.
    candidates   -- iterable of (key, witnesses): `key` is looked up in `seen`;
                    the candidate only counts if at least one of the `witnesses`
                    strings occurs in `pattern_text`.
    seen         -- dict mapping pattern text -> (row position, support).
    Returns None when no candidate matches.
    """
    hits = [
        seen[key]
        for key, witnesses in candidates
        if key in seen and any(witness in pattern_text for witness in witnesses)
    ]
    if not hits:
        return None
    return max(hits, key=lambda hit: hit[0])[1]


support_by_pattern = {
    pattern: (position, support)
    for position, (pattern, support) in enumerate(zip(df10['Pattern_v2'], df10['SUP_v2']))
}

for index, row in df13.iterrows():
    b, c, d, rule_text = row['B'], row['C'], row['D'], row['Pattern']

    # B,C: the size-3 file always terminates the pair with '|', while inside the
    # rule the pair may be followed by either '|' or '/'.
    sup_bc = _latest_support(
        rule_text,
        [
            (b + '|' + c + '|', (b + '|' + c + '|', b + '|' + c + '/')),
            (b + '/' + c + '|', (b + '/' + c + '|', b + '/' + c + '/')),
        ],
        support_by_pattern,
    )
    if sup_bc is not None:
        df13.loc[index, 'SUP_BC'] = sup_bc

    # B,C,D: all four separator combinations; the exact text must occur in the rule.
    bcd_keys = [b + sep_bc + c + sep_cd + d + '|' for sep_bc in ('|', '/') for sep_cd in ('|', '/')]
    sup_bcd = _latest_support(
        rule_text,
        [(key, (key,)) for key in bcd_keys],
        support_by_pattern,
    )
    if sup_bcd is not None:
        df13.loc[index, 'SUP_BCD'] = sup_bcd

In [27]:
df13

Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,AB,B,SUP_B,BC,C,SUP_C,CD,D,SUP_D,DE
1693062,PostgreSQL|INOracle/OUTPostgreSQL|Oracle|,3,13 23 76,3,3,PostgreSQL,102,|,INOracle,76,/,OUTPostgreSQL,25,|,Oracle,94,|#
180395,HyperSQL|INPostgreSQL|OUTHyperSQL/PostgreSQL|,5,0 86 100 109 148,13,11,HyperSQL,81,|,INPostgreSQL,84,|,OUTHyperSQL,33,/,PostgreSQL,102,|#
203070,HyperSQL|INRedis|OUTHyperSQL/Redis|,8,3 20 38 44 112 115 210 219,12,11,HyperSQL,81,|,INRedis,83,|,OUTHyperSQL,33,/,Redis,90,|#
231384,HyperSQL|INMySQL|OUTHyperSQL/MySQL|,5,48 86 100 148 188,11,8,HyperSQL,81,|,INMySQL,89,|,OUTHyperSQL,33,/,MySQL,137,|#
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,7,6,HyperSQL,81,|,INMongoDB,43,|,OUTHyperSQL,33,/,MongoDB,51,|#
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,10,5,HyperSQL,81,|,INH2,88,|,OUTHyperSQL,33,/,H2,114,|#
795513,Oracle|INMySQL|OUTOracle/MySQL|,3,23 52 231,10,8,Oracle,94,|,INMySQL,89,|,OUTOracle,23,/,MySQL,137,|#
935925,MySQL|INRedis|OUTMySQL/Redis|,3,57 136 212,9,5,MySQL,137,|,INRedis,83,|,OUTMySQL,41,/,Redis,90,|#
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,5,4,HBase,24,|,INRedis,83,|,OUTHBase,10,/,Redis,90,|#
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,51,3,Hazelcast,34,|,INHyperSQL,52,|,HyperSQL,81,/,OUTHazelcast,17,|#


Calculating the confidence and lift of each generated pattern.

In [28]:
# The supports were collected as strings; convert them to integers first.
for support_column in ('SUP_A', 'SUP_B', 'SUP_C', 'SUP_D', 'SUP_BC', 'SUP_BCD', 'SUP_ABCD'):
    df13[support_column] = df13[support_column].astype(int)

# One confidence column per item, inserted right after that item's support
# column: support of the whole rule divided by the support of the item.
# The positions account for the columns inserted before them.
for position, confidence_column, support_column in (
    (7, 'ConfA->_BCD', 'SUP_A'),
    (11, 'ConfB->A_CD', 'SUP_B'),
    (15, 'ConfC->AB_D', 'SUP_C'),
    (19, 'ConfD->ABC_', 'SUP_D'),
):
    df13.insert(position, confidence_column, df13['SUP_ABCD'] / df13[support_column], False)
df13


Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,ConfA->_BCD,AB,B,...,ConfB->A_CD,BC,C,SUP_C,ConfC->AB_D,CD,D,SUP_D,ConfD->ABC_,DE
1693062,PostgreSQL|INOracle/OUTPostgreSQL|Oracle|,3,13 23 76,3,3,PostgreSQL,102,0.029412,|,INOracle,...,0.039474,/,OUTPostgreSQL,25,0.12,|,Oracle,94,0.031915,|#
180395,HyperSQL|INPostgreSQL|OUTHyperSQL/PostgreSQL|,5,0 86 100 109 148,13,11,HyperSQL,81,0.061728,|,INPostgreSQL,...,0.059524,|,OUTHyperSQL,33,0.151515,/,PostgreSQL,102,0.04902,|#
203070,HyperSQL|INRedis|OUTHyperSQL/Redis|,8,3 20 38 44 112 115 210 219,12,11,HyperSQL,81,0.098765,|,INRedis,...,0.096386,|,OUTHyperSQL,33,0.242424,/,Redis,90,0.088889,|#
231384,HyperSQL|INMySQL|OUTHyperSQL/MySQL|,5,48 86 100 148 188,11,8,HyperSQL,81,0.061728,|,INMySQL,...,0.05618,|,OUTHyperSQL,33,0.151515,/,MySQL,137,0.036496,|#
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,7,6,HyperSQL,81,0.049383,|,INMongoDB,...,0.093023,|,OUTHyperSQL,33,0.121212,/,MongoDB,51,0.078431,|#
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,10,5,HyperSQL,81,0.037037,|,INH2,...,0.034091,|,OUTHyperSQL,33,0.090909,/,H2,114,0.026316,|#
795513,Oracle|INMySQL|OUTOracle/MySQL|,3,23 52 231,10,8,Oracle,94,0.031915,|,INMySQL,...,0.033708,|,OUTOracle,23,0.130435,/,MySQL,137,0.021898,|#
935925,MySQL|INRedis|OUTMySQL/Redis|,3,57 136 212,9,5,MySQL,137,0.021898,|,INRedis,...,0.036145,|,OUTMySQL,41,0.073171,/,Redis,90,0.033333,|#
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,5,4,HBase,24,0.125,|,INRedis,...,0.036145,|,OUTHBase,10,0.3,/,Redis,90,0.033333,|#
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,51,3,Hazelcast,34,0.088235,|,INHyperSQL,...,0.057692,|,HyperSQL,81,0.037037,/,OUTHazelcast,17,0.176471,|#


In [29]:
# Lift of A against B, BC and BCD, using supports relative to the corpus size.
# NOTE(review): all three measures use SUP_ABCD as the joint support, including
# LiftA->B and LiftA->BC - confirm this is intended.
for lift_column, consequent_column in (
    ('LiftA->B', 'SUP_B'),
    ('LiftA->BC', 'SUP_BC'),
    ('LiftA->BCD', 'SUP_BCD'),
):
    joint_support = df13['SUP_ABCD'] / size_corpus
    antecedent_support = df13['SUP_A'] / size_corpus
    consequent_support = df13[consequent_column] / size_corpus
    df13[lift_column] = joint_support / (antecedent_support * consequent_support)
df13

Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,ConfA->_BCD,AB,B,...,SUP_C,ConfC->AB_D,CD,D,SUP_D,ConfD->ABC_,DE,LiftA->B,LiftA->BC,LiftA->BCD
1693062,PostgreSQL|INOracle/OUTPostgreSQL|Oracle|,3,13 23 76,3,3,PostgreSQL,102,0.029412,|,INOracle,...,25,0.12,|,Oracle,94,0.031915,|#,0.090557,2.294118,2.294118
180395,HyperSQL|INPostgreSQL|OUTHyperSQL/PostgreSQL|,5,0 86 100 109 148,13,11,HyperSQL,81,0.061728,|,INPostgreSQL,...,33,0.151515,/,PostgreSQL,102,0.04902,|#,0.171958,1.111111,1.313131
203070,HyperSQL|INRedis|OUTHyperSQL/Redis|,8,3 20 38 44 112 115 210 219,12,11,HyperSQL,81,0.098765,|,INRedis,...,33,0.242424,/,Redis,90,0.088889,|#,0.278447,1.925926,2.10101
231384,HyperSQL|INMySQL|OUTHyperSQL/MySQL|,5,48 86 100 148 188,11,8,HyperSQL,81,0.061728,|,INMySQL,...,33,0.151515,/,MySQL,137,0.036496,|#,0.162297,1.313131,1.805556
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,7,6,HyperSQL,81,0.049383,|,INMongoDB,...,33,0.121212,/,MongoDB,51,0.078431,|#,0.268734,1.650794,1.925926
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,10,5,HyperSQL,81,0.037037,|,INH2,...,33,0.090909,/,H2,114,0.026316,|#,0.098485,0.866667,1.733333
795513,Oracle|INMySQL|OUTOracle/MySQL|,3,23 52 231,10,8,Oracle,94,0.031915,|,INMySQL,...,23,0.130435,/,MySQL,137,0.021898,|#,0.083911,0.746809,0.933511
935925,MySQL|INRedis|OUTMySQL/Redis|,3,57 136 212,9,5,MySQL,137,0.021898,|,INRedis,...,41,0.073171,/,Redis,90,0.033333,|#,0.061736,0.569343,1.024818
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,5,4,HBase,24,0.125,|,INRedis,...,10,0.3,/,Redis,90,0.033333,|#,0.35241,5.85,7.3125
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,51,3,Hazelcast,34,0.088235,|,INHyperSQL,...,81,0.037037,/,OUTHazelcast,17,0.176471,|#,0.397059,0.404844,6.882353


In [30]:
# The table produced here is Table 9 of the paper.
# Earlier exports, kept for reference (disabled):
#df13.to_excel(SEQ_PATTERNS_DIR + os.sep +"testes/pattern_supABCD_initin_selection_measure10.xlsx",index=True) #"testes/pattern_supABCD_initin_selection_measure2.xlsx
#df13.to_excel(SEQ_PATTERNS_DIR + os.sep +'pattern_selection_measures.xlsx',index=True)

# Order the selected patterns alphabetically by the value in column 'A'.
df13 = df13.sort_values('A')
df13

Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,ConfA->_BCD,AB,B,...,SUP_C,ConfC->AB_D,CD,D,SUP_D,ConfD->ABC_,DE,LiftA->B,LiftA->BC,LiftA->BCD
1811054,Cassandra|INPostgreSQL|PostgreSQL/OUTCassandra|,3,5 47 59,83,5,Cassandra,31,0.096774,|,INPostgreSQL,...,102,0.029412,/,OUTCassandra,9,0.333333,|#,0.269585,0.272833,4.529032
1853119,Cassandra|INMS_SQL_Server|MS_SQL_Server/OUTCas...,3,47 68 69,58,4,Cassandra,31,0.096774,|,INMS_SQL_Server,...,67,0.044776,/,OUTCassandra,9,0.333333,|#,0.377419,0.390434,5.66129
1860042,Couchbase|INPostgreSQL|PostgreSQL/OUTCouchbase|,3,59 76 208,83,3,Couchbase,9,0.333333,|,INPostgreSQL,...,102,0.029412,/,OUTCouchbase,4,0.75,|#,0.928571,0.939759,26.0
1773561,HBase|INMS_SQL_Server|MS_SQL_Server/OUTHBase|,3,44 68 151,58,5,HBase,24,0.125,|,INMS_SQL_Server,...,67,0.044776,/,OUTHBase,10,0.3,|#,0.4875,0.50431,5.85
1770664,HBase|INH2|H2/OUTHBase|,3,39 44 68,86,4,HBase,24,0.125,|,INH2,...,114,0.026316,/,OUTHBase,10,0.3,|#,0.332386,0.340116,7.3125
1767968,HBase|INHyperSQL|HyperSQL/OUTHBase|,4,39 44 48 68,51,4,HBase,24,0.166667,|,INHyperSQL,...,81,0.049383,/,OUTHBase,10,0.4,|#,0.75,0.764706,9.75
1755103,HBase|INRedis|OUTHBase/Redis|,3,44 68 151,5,4,HBase,24,0.125,|,INRedis,...,10,0.3,/,Redis,90,0.033333,|#,0.35241,5.85,7.3125
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,0 109 219,51,3,Hazelcast,34,0.088235,|,INHyperSQL,...,81,0.037037,/,OUTHazelcast,17,0.176471,|#,0.397059,0.404844,6.882353
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,100 188 210,10,5,HyperSQL,81,0.037037,|,INH2,...,33,0.090909,/,H2,114,0.026316,|#,0.098485,0.866667,1.733333
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,44 112 133 210,7,6,HyperSQL,81,0.049383,|,INMongoDB,...,33,0.121212,/,MongoDB,51,0.078431,|#,0.268734,1.650794,1.925926


In [31]:
# Number of selected patterns per distinct value of column 'A'.
counts_by_a = df13['A'].value_counts()
counts_by_a

HyperSQL      5
HBase         4
Cassandra     2
SQLite        2
Couchbase     1
Hazelcast     1
MySQL         1
Oracle        1
PostgreSQL    1
Name: A, dtype: int64

In [32]:
# Number of selected patterns per distinct value of column 'B'.
# Original author's note: filter by "IN" items only.
counts_by_b = df13['B'].value_counts()
counts_by_b

INPostgreSQL       3
INMS_SQL_Server    3
INH2               3
INRedis            3
INHyperSQL         2
INMySQL            2
INMongoDB          1
INOracle           1
Name: B, dtype: int64

In [34]:
import ast  # local import: needed only for the safe parse below

# Work on a copy so df13 keeps the numeric SID strings.
df = df13.copy()

# sequencial_project.txt is expected to hold the text of a Python literal (a
# sequence of project entries) that the SID numbers index into.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary code that might end up in the file.
SEQUENTIAL_PROJECTS = SEQ_PATTERNS_DIR + os.sep + 'sequencial_project.txt'
with open(SEQUENTIAL_PROJECTS, "r") as f:
    nomes_list = ast.literal_eval(f.read())

#print(nomes_list)

# Each SID cell is a whitespace-separated list of indexes into nomes_list.
# Every entry goes through str() before joining, so a row with a single SID
# also yields a string (the previous hand-rolled concatenation stored the raw
# entry in that case). A cell with no indexes yields ''.
new_sid = [
    " , ".join(str(nomes_list[int(project)]) for project in sid.split())
    for sid in df['SID']
]

#print(new_sid)

df['SID'] = new_sid
#df.to_excel(SEQ_PATTERNS_DIR + os.sep +'pattern_selection_measures_names.xlsx',index=False)

# Original author's note: there is a SID equal to 0.
df


Unnamed: 0,Pattern,SUP_ABCD,SID,SUP_BC,SUP_BCD,A,SUP_A,ConfA->_BCD,AB,B,...,SUP_C,ConfC->AB_D,CD,D,SUP_D,ConfD->ABC_,DE,LiftA->B,LiftA->BC,LiftA->BCD
1811054,Cassandra|INPostgreSQL|PostgreSQL/OUTCassandra|,3,"('AxonFramework', 'AxonFramework') , ('apache'...",83,5,Cassandra,31,0.096774,|,INPostgreSQL,...,102,0.029412,/,OUTCassandra,9,0.333333,|#,0.269585,0.272833,4.529032
1853119,Cassandra|INMS_SQL_Server|MS_SQL_Server/OUTCas...,3,"('apache', 'flink') , ('apache', 'skywalking')...",58,4,Cassandra,31,0.096774,|,INMS_SQL_Server,...,67,0.044776,/,OUTCassandra,9,0.333333,|#,0.377419,0.390434,5.66129
1860042,Couchbase|INPostgreSQL|PostgreSQL/OUTCouchbase|,3,"('apache', 'nifi') , ('apereo', 'cas') , ('spr...",83,3,Couchbase,9,0.333333,|,INPostgreSQL,...,102,0.029412,/,OUTCouchbase,4,0.75,|#,0.928571,0.939759,26.0
1773561,HBase|INMS_SQL_Server|MS_SQL_Server/OUTHBase|,3,"('apache', 'camel') , ('apache', 'skywalking')...",58,5,HBase,24,0.125,|,INMS_SQL_Server,...,67,0.044776,/,OUTHBase,10,0.3,|#,0.4875,0.50431,5.85
1770664,HBase|INH2|H2/OUTHBase|,3,"('apache', 'ambari') , ('apache', 'camel') , (...",86,4,HBase,24,0.125,|,INH2,...,114,0.026316,/,OUTHBase,10,0.3,|#,0.332386,0.340116,7.3125
1767968,HBase|INHyperSQL|HyperSQL/OUTHBase|,4,"('apache', 'ambari') , ('apache', 'camel') , (...",51,4,HBase,24,0.166667,|,INHyperSQL,...,81,0.049383,/,OUTHBase,10,0.4,|#,0.75,0.764706,9.75
1755103,HBase|INRedis|OUTHBase/Redis|,3,"('apache', 'camel') , ('apache', 'skywalking')...",5,4,HBase,24,0.125,|,INRedis,...,10,0.3,/,Redis,90,0.033333,|#,0.35241,5.85,7.3125
440952,Hazelcast|INHyperSQL|HyperSQL/OUTHazelcast|,3,"('Activiti', 'Activiti') , ('flowable', 'flowa...",51,3,Hazelcast,34,0.088235,|,INHyperSQL,...,81,0.037037,/,OUTHazelcast,17,0.176471,|#,0.397059,0.404844,6.882353
245867,HyperSQL|INH2|OUTHyperSQL/H2|,3,"('dropwizard', 'dropwizard') , ('quarkusio', '...",10,5,HyperSQL,81,0.037037,|,INH2,...,33,0.090909,/,H2,114,0.026316,|#,0.098485,0.866667,1.733333
239703,HyperSQL|INMongoDB|OUTHyperSQL/MongoDB|,4,"('apache', 'camel') , ('geoserver', 'geoserver...",7,6,HyperSQL,81,0.049383,|,INMongoDB,...,33,0.121212,/,MongoDB,51,0.078431,|#,0.268734,1.650794,1.925926


In [5]:
import ast  # local import: needed only for the safe parse below

# Pattern spreadsheet to annotate. Note that this rebinds `df`.
PATTERN_SELECTION_MEASURES = SEQ_PATTERNS_DIR + os.sep + 'X_InY_YOutX_patternsV5.xlsx'
df = pd.read_excel(PATTERN_SELECTION_MEASURES, engine='openpyxl')

# sequencial_project.txt is expected to hold the text of a Python literal (a
# sequence of project entries). Read it as plain text instead of going through
# pd.read_csv and recovering it from the inferred header name, which breaks
# if the file contains a tab or pandas alters the header.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# arbitrary code that might end up in the file.
SEQUENTIAL_PROJECTS = SEQ_PATTERNS_DIR + os.sep + 'sequencial_project.txt'
with open(SEQUENTIAL_PROJECTS, "r", encoding="utf-8") as f:
    nomes_list = ast.literal_eval(f.read())
#print(nomes_list)

# Accumulator for the project-name strings built below.
new_sid=[]

def convert_to_list_of_numbers(s):
    """Turn one cell value into a list of integers.

    Parameters
    ----------
    s : object
        * ``str``: numbers separated by ``,`` and/or ``;``. Blank tokens
          (e.g. from a trailing separator) are ignored.
        * integer (Python ``int`` or any ``numbers.Integral`` such as a NumPy
          integer): wrapped in a one-element list.
        * ``float`` holding a whole number (what pandas yields for a numeric
          column that also contains blanks): wrapped in a one-element list.
        * anything else (``None``, NaN, ...): treated as "no numbers".

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If a non-blank token of a string is not a valid integer.
    """
    import numbers

    if isinstance(s, str):
        # Treat ';' and ',' as the same separator; int() tolerates surrounding spaces.
        tokens = s.replace(';', ',').split(',')
        return [int(token) for token in tokens if token.strip()]
    if isinstance(s, numbers.Integral):
        return [int(s)]
    # NaN and infinities are not "integer" floats, so they fall through to [].
    if isinstance(s, float) and s.is_integer():
        return [int(s)]
    return []
   
# Apply the conversion to the 'Lines' column
df['Lines'] = df['Lines'].apply(convert_to_list_of_numbers)

# Replace each list of indexes with the matching nomes_list entries, joined by
# " , ". Every entry goes through str() before joining, so a row with a single
# index also yields a string (the previous hand-rolled concatenation stored the
# raw entry in that case, leaving mixed types in the exported column).
# A row with no indexes yields ''.
new_sid = [
    " , ".join(str(nomes_list[int(project)]) for project in list_projects_sid)
    for list_projects_sid in df['Lines']
]

#print(new_sid)

df['Lines'] = new_sid
df.to_excel(SEQ_PATTERNS_DIR + os.sep +'X_InY_YOutX_patterns_namesV5.xlsx',index=False)