In [3]:
import pandas as pd
import Orange
from orangecontrib.associate.fpgrowth import *  

# Get the Data

In [2]:
# Load the label-creation records; later cells read its "property",
# "timestamp" and "language" columns.
label_creations_df = pd.read_csv('data/all_label_creations.csv')

In [4]:
# Order the revisions by property first, then by timestamp within each property
label_creations_df = label_creations_df.sort_values(by=["property", "timestamp"])

# Build the dataframe

In [5]:
# Give a position for each label inside its property
#
# e.g. 1 means first label created
#      n means the n-th label created
#
# Rows are numbered in their current order within each property, so the frame
# has to be sorted by property and timestamp beforehand for the positions to
# follow the creation order. cumcount() is zero-based, hence the +1.
# NOTE(review): assumes "property" has no missing values - confirm against the data.
label_creations_df['position'] = (
    label_creations_df.groupby('property', sort=False).cumcount() + 1
)

In [6]:
# Pivot the label creations into one row per property: the column labelled k
# holds the language found at position k for that property. Cells with no
# label at that position are filled with an empty string.
revisions_history_df = (
    label_creations_df
    .pivot_table(index="property",
                 columns="position",
                 values="language",
                 aggfunc=lambda langs: ' '.join(langs))
    .fillna('')
)

In [7]:
# Lookup tables to translate between language names and integer codes
languages = label_creations_df["language"].unique()
code_to_language = dict(enumerate(languages))
language_to_code = {language: code for code, language in code_to_language.items()}

In [8]:
def decode_code_list(l):
    """Translate an iterable of integer codes into the matching language names.

    Each code is looked up in the notebook-level ``code_to_language`` dict.
    Returns a list of names in iteration order; an unknown code raises KeyError.
    """
    decoded = []
    for code in l:
        decoded.append(code_to_language[code])
    return decoded

In [141]:
def code_language_list(l):
    """Translate an iterable of language names into their integer codes.

    Each name is looked up in the notebook-level ``language_to_code`` dict.
    Returns a list of codes in iteration order; an unknown name raises KeyError.
    """
    encoded = []
    for language in l:
        encoded.append(language_to_code[language])
    return encoded

In [9]:
# Turn revisions_history_df into the input handed to Orange's frequent-pattern
# miner: one list of language codes per property, skipping the empty-string
# padding cells.
history = [
    [language_to_code[language] for language in labels if language]
    for labels in revisions_history_df.values
]
    

# Frequent items analysis

In [32]:
# Support threshold passed to frequent_itemsets in the counting cell of this section.
# NOTE(review): presumably a fraction of the transactions (0-1) rather than an
# absolute count - confirm against the Orange fpgrowth documentation.
min_support = 0.7

In [27]:
# Number of properties (one transaction per property in `history`)
print(f"Number of properties: {len(history)}")

Number of properties: 190


In [26]:
# Count the itemsets yielded for the current support threshold without
# materialising them in a list
sum_items = sum(1 for _ in frequent_itemsets(history, min_support))
print(f"{sum_items} items have a support greater than {min_support}")

167 items have a support greater than 0.6


In [11]:
#iterable = [(support,decode_code_list(itemset)) for itemset, support in frequent_itemsets(history,min_support)]

In [33]:
def get_most_frequent(history, item_len, min_support):
    """
    Return the frequent itemsets of one given size, highest support first.

    Input: - history, the transactions forwarded to frequent_itemsets
             (in this notebook: one list of language codes per property)
           - item_len, only itemsets containing exactly this many items are kept
           - min_support, forwarded unchanged to frequent_itemsets; if this value
             is too low you might have performance issues

    Output: list of (support, decoded itemset) tuples sorted in descending order

    """
    matching = []
    for itemset, support in frequent_itemsets(history, min_support):
        if len(itemset) != item_len:
            continue
        # Support goes first so that the tuple sort ranks by support.
        matching.append((support, decode_code_list(itemset)))
    matching.sort(reverse=True)
    return matching

In [152]:
# Single-language itemsets (item_len=1) at a 0.25 support threshold
get_most_frequent(history, item_len=1, min_support=0.25)

[(190, ['en']),
 (188, ['ar']),
 (187, ['fr']),
 (185, ['uk']),
 (147, ['nl']),
 (134, ['mk']),
 (133, ['es']),
 (123, ['de']),
 (120, ['ru']),
 (110, ['pl']),
 (110, ['ca']),
 (101, ['it']),
 (99, ['sr']),
 (82, ['zh-hans']),
 (82, ['ko']),
 (79, ['nb']),
 (77, ['hu']),
 (77, ['da']),
 (74, ['pt']),
 (66, ['ja']),
 (59, ['cs']),
 (57, ['fa']),
 (52, ['be-tarask']),
 (49, ['zh-hant']),
 (48, ['sv']),
 (48, ['he']),
 (48, ['fi'])]

# Association rules analysis

In [119]:
# Support threshold for this section; it is read by the frequent_itemsets call
# in the next cell.
min_support = 0.7

In [121]:
# Calling frequent_itemsets only builds a generator (the cell output is its repr);
# wrap it in list(...) or dict(...) to actually obtain the itemsets.
frequent_itemsets(history,min_support)

<generator object frequent_itemsets at 0x1394daba0>

In [128]:
# Scratch cell: evaluates a frozenset literal of two integers and echoes it.
# NOTE(review): looks like leftover exploration - presumably 75 and 98 are
# language codes; confirm or remove.
frozenset({75, 98})

frozenset({75, 98})

In [153]:
# Itemset made of the codes of the 4 most common languages
cl = ['en','ar','fr','uk']
itemset = frozenset(language_to_code[language] for language in cl)

In [155]:
# Collect the frequent itemsets (library-default support threshold) into a
# dict, then ask for association rules, passing 0.003 as the second argument
# and `itemset` as the third.
itemsets_dict = dict(frequent_itemsets(history))
rules = association_rules(itemsets_dict, 0.003, itemset)

# Decode each rule and put its confidence first so the descending sort ranks
# the rules by confidence.
decoded_itemsets = [
    (confidence, decode_code_list(antecedent), decode_code_list(consequent), support)
    for antecedent, consequent, support, confidence in rules
]
sorted(decoded_itemsets, reverse=True)

[(1.0, ['en', 'ar', 'uk', 'ko'], ['fr'], 81),
 (1.0, ['en', 'ar', 'ko', 'fr'], ['uk'], 81),
 (1.0, ['en', 'ar', 'ko'], ['uk', 'fr'], 81),
 (1.0, ['ar', 'uk', 'ko', 'fr'], ['en'], 81),
 (1.0, ['ar', 'uk', 'ko'], ['en', 'fr'], 81),
 (1.0, ['ar', 'ko', 'fr'], ['en', 'uk'], 81),
 (1.0, ['ar', 'ko'], ['en', 'uk', 'fr'], 81),
 (0.9878048780487805, ['uk', 'ko', 'fr'], ['en', 'ar'], 81),
 (0.9878048780487805, ['uk', 'ko'], ['en', 'ar', 'fr'], 81),
 (0.9878048780487805, ['ko', 'fr'], ['en', 'ar', 'uk'], 81),
 (0.9878048780487805, ['ko'], ['en', 'ar', 'uk', 'fr'], 81),
 (0.9878048780487805, ['en', 'uk', 'ko', 'fr'], ['ar'], 81),
 (0.9878048780487805, ['en', 'uk', 'ko'], ['ar', 'fr'], 81),
 (0.9878048780487805, ['en', 'ko', 'fr'], ['ar', 'uk'], 81),
 (0.9878048780487805, ['en', 'ko'], ['ar', 'uk', 'fr'], 81),
 (0.45, ['en', 'ar', 'uk', 'fr'], ['ko'], 81),
 (0.45, ['ar', 'uk', 'fr'], ['en', 'ko'], 81),
 (0.44505494505494503, ['uk', 'fr'], ['en', 'ar', 'ko'], 81),
 (0.44505494505494503, ['en', 'uk'