In [33]:
import timeit
import numpy as np
import pandas as pd

In [34]:
class Summary():
    '''
        Class Summary aims to calculate the summary values of lists, in an exploratory data analysis.
        
        ## Parameter: list.
        
        The calculated values are:
         
            - Sorted
            - # values
            - Sum
            - Mean
            - Median
            - Variance
            - Standard deviation
            - Normal Distribution
            - Minimum
            - Minor Outlier
            - Minor
            - Quartile 1
            - Quartile 2
            - Quartile 3
            - Major
            - Major Outlier
            - Maximum
        
    '''
    def __init__(self, list_to_summary):
        try:
            self.list_to_summary = list_to_summary
        except Exception:
            return 'Error: Only lists can be used as a parameter for Summary class.'
    
    
    def sort_list(self):
        '''
            Method for organizing the list, from the lowest value to the highest value.
            
        '''
        list_org_asc = self.list_to_summary[:]
        count = 1
        for k1 in range(len(list_org_asc) - 1):
            count += 1
            for k2 in range(len(list_org_asc) - 1):
                if list_org_asc[k2] > list_org_asc[k2 + 1]:
                    list_org_asc[k2], list_org_asc[k2 + 1] = list_org_asc[k2 + 1], list_org_asc[k2]
                    
                    
        list_org_desc = self.list_to_summary[:]
#         count = 1
        for k1 in range(len(list_org_desc) - 1):
#             count += 1
            for k2 in range(len(list_org_desc) - 1):
                if list_org_desc[k2] < list_org_desc[k2 + 1]:
                    list_org_desc[k2 + 1], list_org_desc[k2] = list_org_desc[k2], list_org_desc[k2 + 1]
                    
                    
        dict_result = {
            '# values': count,
            'Sorted asc': list_org_asc,
            'Sorted desc': list_org_desc
        }
        return dict_result
    
    
    def median_list(self):
        '''
            Calculation of median and the number of values of list_to_summary.
            
            Return a dictonary with:
                # values
                Median
            
        '''
        list_org = Summary(self.list_to_summary).sort_list()['Sorted asc']
        count = Summary(self.list_to_summary).sort_list()['# values']
        if count%2 == 1:
            list_mediana = list_org[int(count/2)]
        else:
            list_mediana = (list_org[int(count/2)] - list_org[int(count/2) - 1])/2 + list_org[int(count/2) - 1]
        dict_result = {
            '# values': count,
            'Median': round(list_mediana, 2)
        }
        return dict_result
    
    
    def mean_sum_list(self):
        '''
            Calculation of mean and sum of values of list_to_summary.
            
            Return a dictonary with:
                Mean
                Sum
            
        '''
        list_org = self.list_to_summary
        count = 0
        sum_all = 0
        for k, v in enumerate(list_org):
            sum_all += v
            count += 1
        dict_result = {
            'Mean': round(sum_all/count, 2),
            'Sum': sum_all
        }
        return dict_result
    
    
    def max_min_list(self):
        '''
            Calculation of maximum and minimum values of list_to_summary.
            
            Return a dictonary with:
                Maximum
                Minimum
            
        '''
        list_org = Summary(self.list_to_summary).sort_list()['Sorted asc']
        dict_result = {
            'Maximum': list_org[-1],
            'Minimum': list_org[0],
        }
        return dict_result
    
    
    def quartile_list(self):
        '''
            Calculation of quartile of values of list_to_summary.
            
            Return a dictonary with:
                Minor Outlier
                Minor
                Quartile 1
                Quartile 2
                Quartile 3
                Major
                Major Outlier
            
        '''
        list_org = Summary(self.list_to_summary).sort_list()['Sorted asc']
        count = Summary(self.list_to_summary).sort_list()['# values']
        
        q2 = Summary(self.list_to_summary).median_list()['Median']
        
        if count%2 == 1:
            q1 = Summary(list_org[:int(count/2)+1]).median_list()['Median']
            q3 = Summary(list_org[int(count/2):]).median_list()['Median']
        else:
            q1 = Summary(list_org[:int(count/2)]).median_list()['Median']
            q3 = Summary(list_org[int(count/2):]).median_list()['Median']
        
        menor = ((q2 - q1) * 1.5) - q1
        maior = ((q3 - q2) * 1.5) + q3
        
        if menor <= list_org[0]:
            menor = list_org[0]        
        min_out = list()
        count_out = 0
        for c in list_org:
            if c < menor:
                min_out.append(c)
                count_out += 1
        
        if maior >= list_org[-1]:
            maior = list_org[-1]
        max_out = list()
        count_out = 0
        for c in list_org:
            if c > maior:
                max_out.append(c)
                count_out += 1
        
        dict_result = {
            'Minor Outlier': min_out,
            'Minor': menor,
            'Q1': q1,
            'Q2': q2,
            'Q3': q3,
            'Major': maior,
            'Major Outlier': max_out
        }
        return dict_result
    
    
    def variance_list(self):
        '''
            Calculation of variance and standard deviation of values of list_to_summary.
            
            Return a dictonary with:
                Variance
                Standard deviation
            
        '''
        list_org = Summary(self.list_to_summary).sort_list()['Sorted asc']
        count = Summary(self.list_to_summary).sort_list()['# values']
        mean = Summary(self.list_to_summary).mean_sum_list()['Mean']
        sum_variance = 0
        for c in list_org:
            sum_variance += (c - mean)**2
        variance = round(sum_variance/count)
        deviation = round(variance ** 0.5)
        dict_result = {
            'Variance': variance,
            'Standard deviation': deviation,
        }
        return dict_result

    
    def description_list(self):
        '''
            Calculation all values in the class of Summary from the list_to_summary.
            
            Return a dictonary with:
                Sorted
                # values
                Sum
                Mean
                Median
                Variance
                Standard deviation
                Normal Distribution
                Minimum
                Minor Outlier
                Minor
                Q1
                Q2
                Q3
                Major
                Major Outlier
                Maximum
            
        '''
        list_org_asc = Summary(self.list_to_summary).sort_list()['Sorted asc']
        list_org_desc = Summary(self.list_to_summary).sort_list()['Sorted desc']
        count = Summary(self.list_to_summary).sort_list()['# values']
        list_mediana = Summary(self.list_to_summary).median_list()['Median']
        sum_all = Summary(self.list_to_summary).mean_sum_list()['Sum']
        mean = Summary(self.list_to_summary).mean_sum_list()['Mean']
        min_out = Summary(self.list_to_summary).quartile_list()['Minor Outlier']
        minor = Summary(self.list_to_summary).quartile_list()['Minor']
        q1 = Summary(self.list_to_summary).quartile_list()['Q1']
        q2 = Summary(self.list_to_summary).quartile_list()['Q2']
        q3 = Summary(self.list_to_summary).quartile_list()['Q3']
        major = Summary(self.list_to_summary).quartile_list()['Major']
        max_out = Summary(self.list_to_summary).quartile_list()['Major Outlier']
        variance = Summary(self.list_to_summary).variance_list()['Variance']
        stdeviation = Summary(self.list_to_summary).variance_list()['Standard deviation']
        
        if len(max_out) == 0 & len(min_out) == 0:
            normal = 'Yes'
        else:
            normal = 'No'
        
        dict_result = {
            'Sorted asc': list_org_asc,
            'Sorted desc': list_org_desc,
            '# values': count,
            'Sum': sum_all,
            'Mean': mean,
            'Median': list_mediana,
            'Variance': variance,
            'Standard deviation': stdeviation,
            'Normal Distribution': normal,
            'Minimum': list_org_asc[0],
            'Minor Outlier': min_out,
            'Minor': minor,
            'Q1': q1,
            'Q2': q2,
            'Q3': q3,
            'Major': major,
            'Major Outlier': max_out,
            'Maximum': list_org_asc[-1],
        }
        return dict_result
        

In [35]:
class NumpySummary():
    '''
        Class NumpySummary aims to calculate the summary values of lists, in an exploratory data analysis, with numpy library.
        
        ## Parameter: list.
        
        The calculated values are:
         
            - Sorted
            - # values
            - Sum
            - Mean
            - Median
            - Variance
            - Standard deviation
            - Normal Distribution
            - Minimum
            - Minor Outlier
            - Minor
            - Quartile 1
            - Quartile 2
            - Quartile 3
            - Major
            - Major Outlier
            - Maximum
        
    '''
    def __init__(self, list_to_summary):
        if type(list_to_summary) != list:
            return TypeError, 'Error: Only lists can be used as a parameter for Summary class.'
        else:
            self.list_to_summary = list_to_summary
            
    
    
    def np_resumos(self):
        list_to_summary = self.list_to_summary[:]
        q1 = round(np.quantile(list_to_summary, .25), 1)
        q2 = round(np.quantile(list_to_summary, .5), 1)
        q3 = round(np.quantile(list_to_summary, .75), 1)
        minor = ((q2 - q1) * 1.5) - q1
        major = ((q3 - q2) * 1.5) + q3

        if minor <= np.min(list_to_summary):
            minor = np.min(list_to_summary)
        min_out = list()
        count_out = 0
        for c in np.sort(list_to_summary):
            if c < minor:
                min_out.append(c)
                count_out += 1


        if major >= np.max(list_to_summary):
            major = np.max(list_to_summary)
        max_out = list()
        count_out = 0
        for c in np.sort(list_to_summary):
            if c > major:
                max_out.append(c)
                count_out += 1

        if len(max_out) == 0 & len(min_out):
            normal = 'Yes'
        else:
            normal = 'No'

        data = [
            np.sort(list_to_summary),
            np.sort(list_to_summary)[::-1],
            len(list_to_summary),
            np.sum(list_to_summary),
            round(np.mean(list_to_summary), 2),
            round(np.median(list_to_summary), 2),
            round(np.var(list_to_summary), 2),
            round(np.std(list_to_summary), 2),
            normal,
            np.min(list_to_summary),
            min_out,
            minor,
            q1,
            np.median(list_to_summary),
            q3,
            major,
            max_out,
            np.max(list_to_summary),
        ]
        return data

In [36]:
# The list to be analyzed. Just an example, the methods can receive any list of integers.
l = [-20,1,3,-90,2,4,6,5,8,4,9,200,190,456]
l

[-20, 1, 3, -90, 2, 4, 6, 5, 8, 4, 9, 200, 190, 456]

In [38]:
# Inserting the list in the Summary class
resumo_l = Summary(l)
resumo_l

<__main__.Summary at 0x74a7730>

In [39]:
print(resumo_l.__doc__)


        Class Summary aims to calculate the summary values of lists, in an exploratory data analysis.
        
        ## Parameter: list.
        
        The calculated values are:
         
            - Sorted
            - # values
            - Sum
            - Mean
            - Median
            - Variance
            - Standard deviation
            - Normal Distribution
            - Minimum
            - Minor Outlier
            - Minor
            - Quartile 1
            - Quartile 2
            - Quartile 3
            - Major
            - Major Outlier
            - Maximum
        
    


In [40]:
# Since all the returns of the methods of the class are dictionaries, we will use loop for to observe the results.
# Total values in the list, ascending and descending organization
d = resumo_l.sort_list().items()
for k, v in d:
    print(f'{k} _ {v}')

# values _ 14
Sorted asc _ [-90, -20, 1, 2, 3, 4, 4, 5, 6, 8, 9, 190, 200, 456]
Sorted desc _ [456, 200, 190, 9, 8, 6, 5, 4, 4, 3, 2, 1, -20, -90]


In [41]:
# Total value and median
d = resumo_l.median_list().items()
for k, v in d:
    print(f'{k} _ {v}')

# values _ 14
Median _ 4.5


In [42]:
# Average and sum of values
d = resumo_l.mean_sum_list().items()
for k, v in d:
    print(f'{k} _ {v}')

Mean _ 55.57
Sum _ 778


In [43]:
# Maximum and minimum list value
d = resumo_l.max_min_list().items()
for k, v in d:
    print(f'{k} _ {v}')

Maximum _ 456
Minimum _ -90


In [44]:
# Distribution values
d = resumo_l.quartile_list().items()
for k, v in d:
    print(f'{k} _ {v}')

Minor Outlier _ [-90, -20, 1]
Minor _ 1.75
Q1 _ 2
Q2 _ 4.5
Q3 _ 9
Major _ 15.75
Major Outlier _ [190, 200, 456]


In [45]:
# Variance and standard deviation of values
d = resumo_l.variance_list().items()
for k, v in d:
    print(f'{k} _ {v}')

Variance _ 17825
Standard deviation _ 134


In [46]:
# All class values which describe the exploratory analysis of data in the list
d = resumo_l.description_list().items()
for k, v in d:
    print(f'{k} _ {v}')

Sorted asc _ [-90, -20, 1, 2, 3, 4, 4, 5, 6, 8, 9, 190, 200, 456]
Sorted desc _ [456, 200, 190, 9, 8, 6, 5, 4, 4, 3, 2, 1, -20, -90]
# values _ 14
Sum _ 778
Mean _ 55.57
Median _ 4.5
Variance _ 17825
Standard deviation _ 134
Normal Distribution _ No
Minimum _ -90
Minor Outlier _ [-90, -20, 1]
Minor _ 1.75
Q1 _ 2
Q2 _ 4.5
Q3 _ 9
Major _ 15.75
Major Outlier _ [190, 200, 456]
Maximum _ 456


In [47]:
# Obtaining a dictionary for structuring data in Panda.DataFrame, in order to compare values.
columns = list(resumo_l.description_list().keys())
data1 = list(resumo_l.description_list().values())

# Obtaining the descriptive data in numpy, for comparative analysis with the developed class.
data2 = NumpySummary(l).np_resumos()

# Construction of the DataFrame to compare the data between the constituted class and numpy.
df = pd.DataFrame(data=[data1, data2], columns=columns, index=['Summary Class','Numpy Class'])
df

Unnamed: 0,Sorted asc,Sorted desc,# values,Sum,Mean,Median,Variance,Standard deviation,Normal Distribution,Minimum,Minor Outlier,Minor,Q1,Q2,Q3,Major,Major Outlier,Maximum
Summary Class,"[-90, -20, 1, 2, 3, 4, 4, 5, 6, 8, 9, 190, 200...","[456, 200, 190, 9, 8, 6, 5, 4, 4, 3, 2, 1, -20...",14,778,55.57,4.5,17825.0,134.0,No,-90,"[-90, -20, 1]",1.75,2.0,4.5,9.0,15.75,"[190, 200, 456]",456
Numpy Class,"[-90, -20, 1, 2, 3, 4, 4, 5, 6, 8, 9, 190, 200...","[456, 200, 190, 9, 8, 6, 5, 4, 4, 3, 2, 1, -20...",14,778,55.57,4.5,17825.24,133.51,No,-90,"[-90, -20, 1]",1.25,2.2,4.5,8.8,15.25,"[190, 200, 456]",456


In [48]:
# Summary Class runtime analysis
%timeit Summary(l).description_list()

2.31 ms ± 25.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [49]:
# Numpy runtime analyses
%timeit NumpySummary(l).np_resumos()

555 µs ± 41.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
