In [69]:
# SectionA
# Notation used in the running-time analysis:
#     m - number of rows in the Excel file
#     N - number of most common error codes to return
#     k - number of small files the big file is split into
#     t - number of unique error codes - best case O(1) / worst case O(m)



In [None]:
import pandas as pd
def from_excel_to_csv(filePath):
    """Convert ``<filePath>.xlsx`` to ``<filePath>.csv`` and load that CSV.

    The workbook is read without a header row, written out as a CSV file
    (no index column) under the same path stem, and the CSV is then parsed
    again with pandas' default ``read_csv`` settings.

    Parameters
    ----------
    filePath : str
        Path of the workbook *without* the ``.xlsx`` extension. The same
        stem, with ``.csv`` appended, is used for the file that is written.

    Returns
    -------
    pandas.DataFrame
        The frame parsed from the freshly written CSV file.
    """
    csv_target = f"{filePath}.csv"
    workbook_rows = pd.read_excel(f"{filePath}.xlsx", header=None)
    workbook_rows.to_csv(csv_target, index=False)
    # Hand back what the CSV reader sees, not the frame parsed from Excel.
    return pd.read_csv(csv_target)

In [65]:
# This function reads the base Excel file, converts it to CSV format, and reads it back.
# Running time: O(m)

In [40]:
def split_df_to_csv_files_byRange(df, range_value, path):
    """Write ``df`` to consecutive CSV files of at most ``range_value`` rows.

    Chunk ``i`` holds the rows at positions ``i * range_value`` up to (but
    excluding) ``(i + 1) * range_value`` and is written to
    ``f"{path}{i}.csv"`` without index and without header. Rows left over
    after the last full chunk are written to one final, shorter file rather
    than being dropped. An empty frame produces no files.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows to split.
    range_value : int
        Maximum number of rows per output file; must be at least 1.
    path : str
        Prefix of the output files; the chunk number and ``.csv`` are
        appended to it.

    Raises
    ------
    ValueError
        If ``range_value`` is smaller than 1.
    """
    if range_value < 1:
        raise ValueError(f"range_value must be at least 1, got {range_value}")

    total_rows = df.shape[0]
    # Ceiling division, so a trailing partial chunk still gets its own file.
    file_count = -(-total_rows // range_value)
    for i in range(file_count):
        # iloc makes the positional row slicing explicit.
        df_to_csv = df.iloc[i * range_value:(i + 1) * range_value]
        df_to_csv.to_csv(f"{path}{i}.csv", index=False, header=False)


In [67]:
# This function splits the big file into several small CSV files.
# Running time: O(m)

In [41]:
from concurrent.futures import ThreadPoolExecutor
from collections import Counter

def count_errorFrequencies_from_files(readingPath, index):
    """Count how often each error string occurs in one chunk file.

    Reads ``f"{readingPath}{index}.csv"`` (no header row). Every value in the
    first column is split at its first ``", "``; the part after the separator
    is treated as the error and counted.

    Parameters
    ----------
    readingPath : str
        Prefix of the chunk files.
    index : int
        Number of the chunk file to read.

    Returns
    -------
    dict
        Maps each error string to its number of occurrences in this file.
        Lines without a ``", "`` separator are not counted; if no line has
        one, the result is an empty dict.
    """
    log_lines = pd.read_csv(f"{readingPath}{index}.csv", header=None)
    # n=1: split only at the first ", " so an error text that itself contains
    # ", " stays in one piece and the result never has more than two columns.
    parts = log_lines[0].str.split(', ', n=1, expand=True)
    if parts.shape[1] < 2:
        # No line contained the separator, so there is no error part to count.
        return {}
    return parts[1].value_counts().to_dict()


In [70]:
# This function reads one of the small files and returns its error-code counts as a dictionary (error code -> count).
# Running time: O(m/k) per file; over all k files: k * O(m/k) = O(m)

In [42]:
def reading_from_files_in_parallel(filePath, num_files=None):
    """Count the errors of all chunk files in a thread pool and rank them.

    Each chunk file ``f"{filePath}{index}.csv"`` is handed to
    ``count_errorFrequencies_from_files`` on a ``ThreadPoolExecutor``; the
    per-file counts are summed into one ``Counter``.

    Parameters
    ----------
    filePath : str
        Prefix of the chunk files.
    num_files : int, optional
        Number of chunk files to read (indices ``0 .. num_files - 1``).
        When omitted, the consecutive files starting at index 0 that exist
        on disk are used, so the function is not tied to a fixed number of
        chunks. Note that leftover files from an earlier run with the same
        prefix are picked up as well.

    Returns
    -------
    list of tuple
        ``(error, count)`` pairs sorted by count, highest first; entries with
        equal counts keep the order in which they were first encountered.

    Raises
    ------
    FileNotFoundError
        If ``num_files`` is omitted and ``f"{filePath}0.csv"`` does not exist.
    """
    import os  # local import: only needed for chunk-file discovery

    if num_files is None:
        num_files = 0
        while os.path.exists(f"{filePath}{num_files}.csv"):
            num_files += 1
        if num_files == 0:
            raise FileNotFoundError(f"no chunk files found for prefix {filePath!r}")

    countErrorsFrequencies = Counter()
    with ThreadPoolExecutor() as executor:
        per_file_counts = executor.map(
            lambda index: count_errorFrequencies_from_files(filePath, index),
            range(num_files),
        )
        # Results arrive in submission order; Counter.update adds the counts.
        for result in per_file_counts:
            countErrorsFrequencies.update(result)
    # most_common() with no argument sorts all items by count, descending.
    return countErrorsFrequencies.most_common()

In [71]:
# This function calls the previous function in parallel, sums the per-file results,
# and sorts them by frequency - O(t log t).
# Running time: O(m) + O(t log t) = O(m + t log t)

In [51]:
def the_N_Common_error_codes(N, dict):
    """Return the first ``N`` entries of a ranked ``(error, count)`` list as a dict.

    Parameters
    ----------
    N : int
        Number of leading entries to keep. If ``N`` is larger than the number
        of entries, all entries are returned; ``N <= 0`` gives an empty dict.
    dict : list of tuple
        Sequence of ``(key, value)`` pairs, already in the desired order.
        (Despite its name this is not a mapping; the parameter name is kept
        so existing callers keep working.)

    Returns
    -------
    dict
        The first ``N`` pairs, in their original order.
    """
    # The parameter shadows the built-in ``dict``, hence the comprehension.
    # max(N, 0) keeps a negative N from turning into an "all but the last
    # |N|" slice; the slice itself bounds N by the number of entries.
    return {key: value for key, value in dict[:max(N, 0)]}


In [72]:
# This function returns the first N (error code, count) pairs of the sorted list as a dictionary.
# Running time: O(N)

In [52]:
def common_N_error_codes_from_file(readingPath, writingPath, N, rows_per_file=100000):
    """Run the whole pipeline and return the ``N`` most frequent error codes.

    Steps, in order: convert the workbook to CSV and load it
    (``from_excel_to_csv``), split the rows into chunk files
    (``split_df_to_csv_files_byRange``), count the errors of the chunk files
    in parallel (``reading_from_files_in_parallel``) and keep the first ``N``
    entries of the ranked result (``the_N_Common_error_codes``).

    Parameters
    ----------
    readingPath : str
        Path passed to ``from_excel_to_csv``.
    writingPath : str
        Prefix under which the chunk files are written and read back.
    N : int
        Number of error codes to return.
    rows_per_file : int, optional
        Number of rows per chunk file. Defaults to 100000, the value that
        used to be hard-coded here.
        NOTE(review): the reading step decides on its own how many chunk
        files it loads; confirm it matches the number of files this chunk
        size produces before using a non-default value.

    Returns
    -------
    dict
        The result of ``the_N_Common_error_codes`` for the ranked counts.
    """
    df = from_excel_to_csv(readingPath)
    split_df_to_csv_files_byRange(df, rows_per_file, writingPath)
    sorted_countErrorsFrequencies = reading_from_files_in_parallel(writingPath)
    return the_N_Common_error_codes(N, sorted_countErrorsFrequencies)
    
    

In [1]:
# Calls the functions above in order.
# Final running time:
#   best case  - O(m + N)
#   worst case - O(m + N + m log m)
#   (depends on the number of unique errors)
# Space complexity - O(m) - the rows are stored twice

In [53]:
# Run the whole pipeline and print the 3 most frequent error codes.
# from_excel_to_csv appends ".xlsx" / ".csv" to the first argument, so the
# workbook read here is "./files/logs.txt.xlsx"; the chunk files are written
# as "./files/logs/log<i>.csv".
res = common_N_error_codes_from_file("./files/logs.txt", "./files/logs/log" ,3 )
print(res)

{'Error: WARN_101': 200098, 'Error: ERR_404': 200094, 'Error: ERR_400': 200069}
