In [3]:

# Imports
from itertools import combinations, chain
from collections import Counter
from tqdm import tqdm


def count_combined_list(file_path, N=3):
    """
    Count how often each N-item combination occurs across the rows of a file.

    Every line of the file is one row of whitespace-separated integers. For
    each row, all combinations of N items are generated (in the order the
    items appear in the row) and tallied over the whole file.

    WARNING.: The items in each row should be sorted!
              E.G. 3 < 5 < 15 < 55
              Combinations keep the row order, so the same group of items
              listed in a different order ends up under a different key.

    Args:
        file_path: string with the file path.
        N: length of combinations.

    Return:
        comb_counter: Counter mapping each N-tuple of ints to the number of
                      times it was generated.

    Raises:
        ValueError: if a token in the file cannot be parsed as an integer.
    """
    # Load data (a list, so the progress bar below has a known length)
    with open(file_path, 'r') as f:
        data = f.readlines()

    # Build one combinations iterator per row
    print("Creating Combinations...")
    combined_list = []
    for row in tqdm(data):
        # split() without an argument splits on any run of whitespace and
        # drops empty tokens, so blank lines and repeated spaces do not
        # produce '' (which int() would reject).
        int_items = [int(token) for token in row.split()]
        # Rows with fewer than N items simply contribute no combinations.
        combined_list.append(combinations(int_items, N))

    # Counter
    print("Counting Combinations...")
    # from_iterable walks the list lazily instead of unpacking every
    # iterator as a separate positional argument.
    comb_counter = Counter(tqdm(chain.from_iterable(combined_list)))

    return comb_counter


def filter_results(C, SIGMA=4):
    """
    Keep only the entries of a counter whose count is at least `SIGMA`.

    Args:
        C: Counter class to be filtered.
        SIGMA: minimum count (inclusive) an entry needs to be kept.

    Return:
        kept: new Counter holding the surviving entries and their counts.
    """
    print("Filtering Results...")
    kept = Counter()
    for combo, freq in tqdm(C.items()):
        # Entries below the threshold are dropped
        if freq >= SIGMA:
            kept[combo] = freq
    return kept


def create_output(result, OUTPUT_FILE='output.txt', ordered=True):
    """
    Creates a file of the result in the desired output format.

    One line is written per entry, as comma-separated fields:
        <number of items>, <count>, <item 1>, <item 2>, ...

    Args:
        result: Counter object in the format
                Counter({(30, 31, 32): 5,
                         (36, 37, 38): 17,
                         (36, 37, 39): 10, ...}
        OUTPUT_FILE: Name of the output file.
        ordered: If True, entries are written in `result.most_common()`
                 order (most frequent first); otherwise in the iteration
                 order of `result.items()`.
    """
    # Create output file
    with open(OUTPUT_FILE, 'w+') as f:
        print("Saving File...")
        # Both sources yield (items, count) pairs, so a single loop
        # serves the ordered and the unordered case.
        entries = result.most_common() if ordered else result.items()
        for items, count in tqdm(entries):
            fields = [str(len(items)), str(count)]
            fields.extend(str(it) for it in items)
            # One write per line instead of several small ones
            f.write(', '.join(fields) + '\n')

if __name__ == "__main__":
    # SETTINGS
    # Input file local name or path (hard-coded absolute path; adjust per machine)
    INPUT_FILE = '/Users/chintan.desai/Documents/Chintan/DataScience/My_Mini_Projects/TransactionParser/retail_25k.dat'  
    N = 3                          # The length of combinations performed (careful above 3: the number of combinations per row grows quickly with N)
    SIGMA = 4                      # Minimum count a combination needs to be kept by filter_results
    # Name of the desired output file (hard-coded absolute path; adjust per machine)
    OUTPUT_FILE = '/Users/chintan.desai/Documents/Chintan/DataScience/My_Mini_Projects/TransactionParser/retail_25koutput.dat'     
    # Count items: tally every N-item combination found in the input rows
    counter = count_combined_list(INPUT_FILE, N)

    # Filter results: keep only combinations counted at least SIGMA times
    filtered = filter_results(counter, SIGMA)

    # Write files: save the filtered combinations, most frequent first (default)
    create_output(filtered, OUTPUT_FILE)


 17%|█▋        | 4309/25000 [00:05<00:26, 777.06it/s]

Creating Combinations...


100%|██████████| 25000/25000 [00:05<00:00, 4328.81it/s]
0it [00:00, ?it/s]

Counting Combinations...


15082387it [00:16, 906286.32it/s] 
  0%|          | 40428/13992332 [00:00<01:12, 191235.35it/s]

Filtering Results...


100%|██████████| 13992332/13992332 [00:13<00:00, 1073216.70it/s]
 31%|███▏      | 23822/76151 [00:00<00:00, 120779.57it/s]

Saving File...


100%|██████████| 76151/76151 [00:00<00:00, 113408.39it/s]
