In [5]:
import pandas as pd
import numpy as np

In [13]:
def load_and_compare_csv(file1, file2):
    """Print side-by-side summary statistics for the columns two CSV files share.

    Both files are read with ``pd.read_csv``. Columns that are numerical in
    both files get a Min / Max / Median table; columns that are boolean in
    both files get a True count / False count / True % table. Afterwards the
    numerical and boolean columns that could not be compared are listed:
    those absent from the other file, and those present in both files but
    with a different kind of dtype (e.g. numerical in one, text in the other).

    Columns are reported in file order (file 1's order for shared columns),
    so repeated runs print the same report.

    Args:
        file1: Path (or anything ``pd.read_csv`` accepts) of the first CSV.
        file2: Same, for the second CSV.

    Returns:
        None. The report is written to standard output.
    """
    # Label-column widths. The numerical table keeps its original width of 10;
    # the boolean table needs more because 'False count:' is 12 characters and
    # would otherwise push the values out of line with the header.
    num_label_width = 10
    bool_label_width = 13

    # Load CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Identify numerical and boolean columns (lists keep the file's column order)
    num_cols1 = list(df1.select_dtypes(include=[np.number]).columns)
    num_cols2 = list(df2.select_dtypes(include=[np.number]).columns)
    bool_cols1 = list(df1.select_dtypes(include=[bool]).columns)
    bool_cols2 = list(df2.select_dtypes(include=[bool]).columns)

    # Find common columns. Sets are used for membership tests only: iterating
    # a set of strings would make the report order depend on hash randomisation.
    num_set2 = set(num_cols2)
    bool_set2 = set(bool_cols2)
    common_num_cols = [col for col in num_cols1 if col in num_set2]
    common_bool_cols = [col for col in bool_cols1 if col in bool_set2]

    def print_table_header(col, label_width):
        # Shared heading for both table kinds; the rule scales with the label width
        # (40 dashes at width 10, matching the original numerical layout).
        print(f"\nComparing column: {col}")
        print(f"{'Statistic':<{label_width}} {'File 1':<15} {'File 2':<15}")
        print("-" * (label_width + 30))

    # Compare statistics for common numerical columns
    print("Numerical Columns Comparison:")
    for col in common_num_cols:
        s1, s2 = df1[col], df2[col]
        print_table_header(col, num_label_width)
        print(f"{'Min:':<{num_label_width}} {s1.min():<15.2f} {s2.min():<15.2f}")
        print(f"{'Max:':<{num_label_width}} {s1.max():<15.2f} {s2.max():<15.2f}")
        print(f"{'Median:':<{num_label_width}} {s1.median():<15.2f} {s2.median():<15.2f}")

    # Compare statistics for common boolean columns
    print("\nBoolean Columns Comparison:")
    for col in common_bool_cols:
        s1, s2 = df1[col], df2[col]
        print_table_header(col, bool_label_width)
        # int(...) turns the NumPy integer into a plain int for the 'd' format.
        print(f"{'True count:':<{bool_label_width}} {int(s1.sum()):<15d} {int(s2.sum()):<15d}")
        print(f"{'False count:':<{bool_label_width}} {int((~s1).sum()):<15d} {int((~s2).sum()):<15d}")
        print(f"{'True %:':<{bool_label_width}} {s1.mean() * 100:<15.2f} {s2.mean() * 100:<15.2f}")

    # Report numerical/boolean columns that could not be compared.
    all_cols1 = set(df1.columns)
    all_cols2 = set(df2.columns)
    typed_cols1 = num_cols1 + bool_cols1
    typed_cols2 = num_cols2 + bool_cols2
    compared = set(common_num_cols) | set(common_bool_cols)

    # "Only in" means the column name is genuinely absent from the other file,
    # not merely stored there under a different dtype.
    only_in_file1 = [col for col in typed_cols1 if col not in all_cols2]
    only_in_file2 = [col for col in typed_cols2 if col not in all_cols1]

    # Present in both files but not numerical-in-both or boolean-in-both.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    dtype_mismatch = list(dict.fromkeys(
        col for col in typed_cols1 + typed_cols2
        if col in all_cols1 and col in all_cols2 and col not in compared
    ))

    if only_in_file1:
        print(f"\nColumns only in {file1}: {', '.join(map(str, only_in_file1))}")
    if only_in_file2:
        print(f"\nColumns only in {file2}: {', '.join(map(str, only_in_file2))}")
    if dtype_mismatch:
        print(f"\nColumns skipped (dtype differs between files): {', '.join(map(str, dtype_mismatch))}")

# Example run on the two scored CSV files; file1/file2 stay at notebook scope
# because later cells read them again.
file1, file2 = 'scored_test_dataset.csv', 'scored_fake_dataset.csv'
load_and_compare_csv(file1, file2)

Numerical Columns Comparison:

Comparing column: jaccard_similarity
Statistic  File 1          File 2         
----------------------------------------
Min:       0.02            0.17           
Max:       0.25            0.87           
Median:    0.11            0.36           

Comparing column: recall
Statistic  File 1          File 2         
----------------------------------------
Min:       0.00            0.00           
Max:       1.00            1.00           
Median:    1.00            1.00           

Comparing column: cosine_similarity
Statistic  File 1          File 2         
----------------------------------------
Min:       0.75            0.95           
Max:       0.96            1.00           
Median:    0.91            0.98           

Boolean Columns Comparison:

Comparing column: unit_test
Statistic  File 1          File 2         
----------------------------------------
True count: 16              45             
False count: 18              0              

In [14]:
# Load both CSVs at notebook scope so individual columns can be inspected directly.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Summary statistics of file 1's 'recall' column.
print(df1['recall'].describe())

count    34.000000
mean      0.823529
std       0.386953
min       0.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       1.000000
Name: recall, dtype: float64


In [15]:
# Summary statistics of the 'recall' column in df2 (the frame read from file2).
print(df2['recall'].describe() )

count    45.000000
mean      0.688889
std       0.468179
min       0.000000
25%       0.000000
50%       1.000000
75%       1.000000
max       1.000000
Name: recall, dtype: float64


In [16]:
# Built-in sum over df1's 'recall' column; as the cell's last expression its value is displayed.
sum(df1['recall'])

28.0