In [3]:
import pandas as pd
import Orange
from orangecontrib.associate.fpgrowth import *  

# Get the Data

In [2]:
# Load the label-creation records.
# Plain string literal: the path contains no placeholders, so no f-string is needed.
label_creations_df = pd.read_csv('data/all_label_creations.csv')

In [4]:
# Sort revisions by property, then by timestamp within each property.
# sort_values returns a new sorted frame, so the name is rebound rather than
# mutating the existing frame with inplace=True.
label_creations_df = label_creations_df.sort_values(["property", "timestamp"])

# Build the dataframe

In [5]:
# Give a position to each label within its property
#
# e.g. 1 means first label created for the property
#      n means n-th label created for the property
#
# cumcount() numbers the rows of every property group 0, 1, 2, ... following
# the current row order, so the frame must already be ordered by timestamp.
# Unlike a row-by-row scan, this does not need equal properties to be adjacent.
position = (label_creations_df.groupby("property", sort=False).cumcount() + 1).tolist()

label_creations_df['position'] = position

In [6]:
# Pivot the label creations into one row per property and one column per
# position. Languages sharing a (property, position) pair are joined with a
# space; pairs with no label become the empty string.
revisions_history_df = label_creations_df.pivot_table(
    index="property",
    columns="position",
    values='language',
    aggfunc=lambda languages: ' '.join(languages),
).fillna('')

In [7]:
# Two-way lookup tables between a language and its integer code,
# used to encode and decode the languages
languages = label_creations_df["language"].unique()
code_to_language = dict(enumerate(languages))
language_to_code = {language: code for code, language in code_to_language.items()}

In [8]:
def decode_code_list(l):
    """Return, as a list, the language of every code in ``l`` (in iteration order).

    Looks each code up in the module-level ``code_to_language`` mapping;
    an unknown code raises KeyError.
    """
    decoded = []
    for code in l:
        decoded.append(code_to_language[code])
    return decoded

In [141]:
def code_language_list(l):
    """Return, as a list, the integer code of every language in ``l`` (in iteration order).

    Looks each language up in the module-level ``language_to_code`` mapping;
    an unknown language raises KeyError.
    """
    lookup = language_to_code
    return [lookup[language] for language in l]

In [9]:
# Convert revisions_history_df to the format mined for frequent patterns:
# one list of language codes per property, with empty cells skipped
history = [
    [language_to_code[language] for language in row.values if language]
    for _, row in revisions_history_df.iterrows()
]
    

# Frequent items analysis

In [32]:
# Minimum support threshold handed to frequent_itemsets() when counting itemsets.
# NOTE(review): the recorded output of the counting cell reports 0.6, and that
# cell has a lower execution count than this one, so its output predates this
# value -- re-run the cells in order to refresh it.
min_support = 0.7

In [27]:
# Number of properties (history holds one list of codes per property)
n_properties = len(history)
print("Number of properties: {0}".format(n_properties))

Number of properties: 190


In [26]:
# Count the itemsets yielded for the current min_support, one at a time
sum_items = sum(1 for _itemset in frequent_itemsets(history, min_support))
print("{0} items have a support greater than {1}".format(sum_items, min_support))

167 items have a support greater than 0.6


In [11]:
#iterable = [(support,decode_code_list(itemset)) for itemset, support in frequent_itemsets(history,min_support)]

In [33]:
def get_most_frequent(history, item_len, min_support):
    """
    List the frequent itemsets of a given size, most frequent first.

    Input: - history, transactions handed to frequent_itemsets (the notebook
             passes one list of language codes per property)
           - item_len, length of the itemsets to keep
           - min_support, forwarded unchanged to frequent_itemsets; if this
             value is too low you might have performance issues

    Output: list of (support, decoded itemset) tuples in descending order
            (by support, ties broken by the decoded itemset)

    """
    decoded_itemsets = [
        (support, decode_code_list(itemset))
        for itemset, support in frequent_itemsets(history, min_support)
        if len(itemset) == item_len
    ]
    # One descending sort instead of sorting ascending and reversing afterwards.
    return sorted(decoded_itemsets, reverse=True)

In [157]:
# Most frequent itemsets of three languages, mined with min_support=0.25
get_most_frequent(history, item_len=3, min_support=0.25)

[(185, ['en', 'ar', 'fr']),
 (183, ['en', 'ar', 'uk']),
 (182, ['en', 'uk', 'fr']),
 (180, ['ar', 'uk', 'fr']),
 (147, ['en', 'uk', 'nl']),
 (146, ['uk', 'nl', 'fr']),
 (146, ['en', 'nl', 'fr']),
 (146, ['en', 'ar', 'nl']),
 (146, ['ar', 'uk', 'nl']),
 (145, ['ar', 'nl', 'fr']),
 (133, ['es', 'uk', 'fr']),
 (133, ['en', 'mk', 'fr']),
 (133, ['en', 'es', 'uk']),
 (133, ['en', 'es', 'fr']),
 (132, ['es', 'uk', 'ar']),
 (132, ['es', 'fr', 'ar']),
 (132, ['en', 'uk', 'mk']),
 (132, ['en', 'es', 'ar']),
 (132, ['en', 'ar', 'mk']),
 (131, ['uk', 'mk', 'fr']),
 (131, ['ar', 'mk', 'fr']),
 (130, ['ar', 'uk', 'mk']),
 (124, ['es', 'uk', 'nl']),
 (124, ['es', 'nl', 'fr']),
 (124, ['en', 'es', 'nl']),
 (123, ['es', 'nl', 'ar']),
 (122, ['en', 'uk', 'de']),
 (122, ['en', 'de', 'fr']),
 (122, ['en', 'ar', 'de']),
 (121, ['uk', 'nl', 'mk']),
 (121, ['uk', 'de', 'fr']),
 (121, ['en', 'nl', 'mk']),
 (121, ['ar', 'uk', 'de']),
 (121, ['ar', 'de', 'fr']),
 (120, ['ru', 'uk', 'fr']),
 (120, ['nl', 'mk', 

# Association rules analysis

In [119]:
# Minimum support threshold used by the frequent_itemsets call in the next cell
# (same value as in the frequent-itemsets section).
min_support = 0.7

In [121]:
# Calling frequent_itemsets only creates a generator (see the recorded output);
# the itemsets are produced when it is iterated, e.g. by wrapping it in list() or dict().
frequent_itemsets(history,min_support)

<generator object frequent_itemsets at 0x1394daba0>

In [128]:
# Scratch cell: displays a frozenset of two integer codes.
# NOTE(review): presumably a probe of the itemset representation used for the
# association rules -- confirm, or delete the cell.
frozenset({75, 98})

frozenset({75, 98})

In [164]:
# Choose the languages whose association we want to inspect (the list currently
# holds two languages) and encode them as a frozenset of integer codes.
# NOTE: code_language_list raises KeyError for a language that is missing from
# language_to_code; the recorded output shows this happening for 'bxr'.
cl = ['ar', 'bxr']
itemset = frozenset(code_language_list(cl))

KeyError: 'bxr'

In [161]:
# Frequent itemsets mined with the library's default min_support (none is passed)
itemsets_dict = dict(frequent_itemsets(history))
# Rules restricted to `itemset`; 0.003 is the second positional argument
# (min_confidence in orange3-associate's documented signature)
rules = association_rules(itemsets_dict, 0.003, itemset)
decoded_itemsets = [
    (confidence, decode_code_list(antecedent), decode_code_list(consequent), support)
    for antecedent, consequent, support, confidence in rules
]
# Largest confidence first
sorted(decoded_itemsets, reverse=True)

[(1.0, ['en', 'ar', 'de', 'nl', 'uk'], ['fr'], 114),
 (1.0, ['en', 'ar', 'de', 'nl'], ['uk', 'fr'], 114),
 (1.0, ['en', 'ar', 'de', 'fr', 'nl'], ['uk'], 114),
 (1.0, ['ar', 'uk', 'de', 'nl'], ['en', 'fr'], 114),
 (1.0, ['ar', 'de', 'nl', 'fr'], ['en', 'uk'], 114),
 (1.0, ['ar', 'de', 'nl'], ['en', 'uk', 'fr'], 114),
 (1.0, ['ar', 'de', 'fr', 'nl', 'uk'], ['en'], 114),
 (0.991304347826087, ['uk', 'de', 'nl', 'fr'], ['en', 'ar'], 114),
 (0.991304347826087, ['uk', 'de', 'nl'], ['en', 'ar', 'fr'], 114),
 (0.991304347826087, ['en', 'uk', 'de', 'nl'], ['ar', 'fr'], 114),
 (0.991304347826087, ['en', 'de', 'nl', 'fr'], ['ar', 'uk'], 114),
 (0.991304347826087, ['en', 'de', 'nl'], ['ar', 'uk', 'fr'], 114),
 (0.991304347826087, ['en', 'de', 'fr', 'nl', 'uk'], ['ar'], 114),
 (0.991304347826087, ['de', 'nl', 'fr'], ['en', 'ar', 'uk'], 114),
 (0.991304347826087, ['de', 'nl'], ['en', 'ar', 'uk', 'fr'], 114),
 (0.95, ['en', 'ar', 'de', 'fr', 'uk'], ['nl'], 114),
 (0.95, ['ar', 'uk', 'de', 'fr'], ['en'

In [163]:
# Frequent itemsets mined with min_support=0.6
itemsets_dict = dict(frequent_itemsets(history, 0.6))
# All rules from those itemsets; 0.9 is the second positional argument
# (min_confidence in orange3-associate's documented signature)
rules = association_rules(itemsets_dict, 0.9)
decoded_itemsets = [
    (confidence, decode_code_list(antecedent), decode_code_list(consequent), support)
    for antecedent, consequent, support, confidence in rules
]
# Largest confidence first
sorted(decoded_itemsets, reverse=True)

[(1.0, ['uk', 'nl', 'mk', 'fr'], ['en'], 120),
 (1.0, ['uk', 'nl', 'mk'], ['en'], 121),
 (1.0, ['uk', 'nl', 'fr'], ['en'], 146),
 (1.0, ['uk', 'nl'], ['en'], 147),
 (1.0, ['uk', 'mk', 'fr'], ['en'], 131),
 (1.0, ['uk', 'mk'], ['en'], 132),
 (1.0, ['uk', 'fr'], ['en'], 182),
 (1.0, ['uk', 'de', 'nl', 'fr'], ['en'], 115),
 (1.0, ['uk', 'de', 'nl'], ['fr'], 115),
 (1.0, ['uk', 'de', 'nl'], ['en', 'fr'], 115),
 (1.0, ['uk', 'de', 'nl'], ['en'], 115),
 (1.0, ['uk', 'de', 'fr'], ['en'], 121),
 (1.0, ['uk', 'de'], ['en'], 122),
 (1.0, ['uk'], ['en'], 185),
 (1.0, ['ru', 'uk', 'nl', 'fr'], ['en'], 117),
 (1.0, ['ru', 'uk', 'nl'], ['fr'], 117),
 (1.0, ['ru', 'uk', 'nl'], ['en', 'fr'], 117),
 (1.0, ['ru', 'uk', 'nl'], ['en'], 117),
 (1.0, ['ru', 'uk', 'fr'], ['en'], 120),
 (1.0, ['ru', 'uk'], ['fr'], 120),
 (1.0, ['ru', 'uk'], ['en', 'fr'], 120),
 (1.0, ['ru', 'uk'], ['en'], 120),
 (1.0, ['ru', 'nl', 'fr'], ['uk'], 117),
 (1.0, ['ru', 'nl', 'fr'], ['en', 'uk'], 117),
 (1.0, ['ru', 'nl', 'fr'], [