In [83]:
import pandas as pd
from pathlib import Path

def read_and_merge_data(csv_path, json_path):
    """Load the CSV and JSON-lines player files and merge them into one frame.

    Args:
        csv_path: Path (or str) of the semicolon-delimited CSV file.
        json_path: Path (or str) of the JSON-lines file (one object per line).

    Returns:
        DataFrame produced by an outer merge on ``playername`` and
        ``eventtype``. Column names are stripped and lower-cased, and the
        overlapping ``age``, ``runs`` and ``wickets`` columns are collapsed
        into a single column each.

    Raises:
        KeyError: if either input lacks the merge keys, or if ``age``,
            ``runs`` or ``wickets`` is not present in both inputs (the
            suffixed ``_x``/``_y`` columns are read unconditionally).
    """
    # Read from the paths supplied by the caller rather than fixed file names.
    df_csv = pd.read_csv(csv_path, delimiter=';')
    df_json = pd.read_json(json_path, lines=True)

    # Standardize column names so both sources expose the same merge keys.
    for df in (df_csv, df_json):
        df.columns = df.columns.str.strip().str.lower()

    merged = pd.merge(df_csv, df_json, on=['playername', 'eventtype'], how='outer')

    # Coalesce the duplicated columns: prefer the CSV value (_x) and fall back
    # to the JSON value (_y) where the CSV value is missing.
    for col in ('age', 'runs', 'wickets'):
        merged[col] = merged[f'{col}_x'].combine_first(merged[f'{col}_y'])

    # Drop every leftover suffixed column; returns a new frame (no inplace).
    suffixed = [col for col in merged.columns if col.endswith(('_x', '_y'))]
    return merged.drop(columns=suffixed)

def clean_data(df):
    """Keep only rows with both stats present and an age between 15 and 50.

    Args:
        df: DataFrame with ``runs``, ``wickets`` and ``age`` columns.

    Returns:
        A new DataFrame holding the rows where ``runs`` and ``wickets`` are
        both non-null and ``age`` lies in the inclusive range 15..50. The
        original index labels are preserved.
    """
    has_runs = df['runs'].notna()
    has_wickets = df['wickets'].notna()
    # between() is inclusive on both ends and is False for missing ages.
    age_in_range = df['age'].between(15, 50)
    return df[has_runs & has_wickets & age_in_range]

def classify_player(row):
    """Classify one player row by its ``runs`` and ``wickets`` values.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``runs`` and
            ``wickets`` entries.

    Returns:
        ``None`` when either stat is missing; otherwise ``'All-Rounder'``
        (runs > 500 and wickets > 50), ``'Batsman'`` (runs > 500 only) or
        ``'Bowler'`` (runs <= 500).
    """
    # A missing stat cannot be classified; without this guard NaN runs would
    # fall through the ``> 500`` comparison and be labelled 'Bowler'.
    if pd.isna(row['runs']) or pd.isna(row['wickets']):
        return None
    if row['runs'] > 500:
        return 'All-Rounder' if row['wickets'] > 50 else 'Batsman'
    return 'Bowler'

def process_data(df):
    """Return a copy of ``df`` with a ``playerType`` column added.

    Args:
        df: DataFrame with the ``runs`` and ``wickets`` columns that
            ``classify_player`` reads.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    # Work on a copy: the input may be a filtered slice of another frame, and
    # adding a column to such a slice triggers SettingWithCopyWarning.
    result = df.copy()
    if result.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column, so add an empty column explicitly.
        result['playerType'] = pd.Series(index=result.index, dtype=object)
    else:
        result['playerType'] = result.apply(classify_player, axis=1)
    return result

def save_outputs(df, output_dir):
    """Write the ODI and TEST rows of ``df`` to separate CSV files.

    Args:
        df: DataFrame with a string ``eventtype`` column.
        output_dir: Directory (``Path`` or ``str``) to write into. It is
            created, including missing parent directories, if needed.

    Writes ``odi_results.csv`` and ``test_results.csv`` (without the index).
    Event types are matched case-insensitively.
    """
    # Accept plain strings as well as Path objects.
    output_dir = Path(output_dir)
    # parents=True so a nested, not-yet-existing directory does not fail.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Upper-case once and reuse it for both filters.
    event_type = df['eventtype'].str.upper()
    df[event_type == 'ODI'].to_csv(output_dir / 'odi_results.csv', index=False)
    df[event_type == 'TEST'].to_csv(output_dir / 'test_results.csv', index=False)

def main():
    """Run the pipeline: merge inputs, snapshot the merge, clean, classify, save.

    Passes ``inputDataSet/old_players.csv`` and
    ``inputDataSet/new_players.json`` to ``read_and_merge_data``, writes the
    merged frame to ``temp/merged_players.csv`` and hands the cleaned,
    classified frame to ``save_outputs`` with ``outputDataSet`` as target.
    """
    source_dir = Path('inputDataSet')
    results_dir = Path('outputDataSet')
    scratch_dir = Path('temp')
    scratch_dir.mkdir(exist_ok=True)

    combined = read_and_merge_data(
        source_dir / 'old_players.csv',
        source_dir / 'new_players.json',
    )
    # Keep the unfiltered merge on disk as an intermediate artifact.
    combined.to_csv(scratch_dir / 'merged_players.csv', index=False)

    save_outputs(process_data(clean_data(combined)), results_dir)

# Entry point: run the pipeline when this code is executed as the top-level
# module (not when it is imported).
if __name__ == '__main__':
    main()


In [85]:
import pandas as pd

def classify_player(row):
    """Map a player row to a player-type label.

    Args:
        row: Mapping-like object with ``runs`` and ``wickets`` entries.

    Returns:
        ``None`` if either stat is missing, ``'Bowler'`` when runs are not
        above 500, otherwise ``'All-Rounder'`` when wickets exceed 50 and
        ``'Batsman'`` when they do not.
    """
    missing_stat = pd.isna(row['runs']) or pd.isna(row['wickets'])
    if missing_stat:
        return None
    if not row['runs'] > 500:
        return 'Bowler'
    if row['wickets'] > 50:
        return 'All-Rounder'
    return 'Batsman'

def validate_results(expected_df, actual_df):
    """Compare expected and actual player types row by row.

    Args:
        expected_df: DataFrame with ``playername``, ``eventtype`` and
            ``playertype`` columns (names are matched after stripping and
            lower-casing).
        actual_df: DataFrame with the same key columns and a ``playertype``
            column holding the recomputed value.

    Returns:
        DataFrame with columns ``playername``, ``eventtype``,
        ``playertype_expected``, ``playertype_actual`` and ``Result``
        (``'PASS'`` or ``'FAIL'``), one row per expected row. The comparison
        is case-insensitive and treats a missing type as an empty string.
    """
    # Normalize column names on copies so the caller's frames are not mutated.
    expected_df = expected_df.copy()
    actual_df = actual_df.copy()
    expected_df.columns = expected_df.columns.str.strip().str.lower()
    actual_df.columns = actual_df.columns.str.strip().str.lower()

    # Rename playerType columns so both survive the merge side by side.
    expected_df = expected_df.rename(columns={'playertype': 'playertype_expected'})
    actual_df = actual_df.rename(columns={'playertype': 'playertype_actual'})

    # Left merge: an expected row without an actual counterpart must show up
    # as a FAIL instead of silently disappearing from the report.
    merged = pd.merge(expected_df, actual_df, on=['playername', 'eventtype'], how='left')

    # astype(str) keeps the .str accessor usable when a column has no string
    # values at all (for example, when the merge result is empty).
    expected_type = merged['playertype_expected'].fillna('').astype(str).str.lower()
    actual_type = merged['playertype_actual'].fillna('').astype(str).str.lower()
    merged['Result'] = (expected_type == actual_type).map({True: 'PASS', False: 'FAIL'})

    return merged[['playername', 'eventtype', 'playertype_expected', 'playertype_actual', 'Result']]

def main():
    """Recompute player types from the merged snapshot and validate the ODI output.

    Reads ``outputDataSet/odi_results.csv`` as the expected result and
    ``temp/merged_players.csv`` as the raw input, applies the age and
    missing-stat filters, classifies each remaining row, and writes the
    comparison to ``test_result.csv``.
    """
    expected = pd.read_csv('outputDataSet/odi_results.csv')
    actual = pd.read_csv('temp/merged_players.csv')

    # Clean and compute playerType
    actual.columns = actual.columns.str.strip().str.lower()
    # .copy() detaches the filtered rows from the original frame so the new
    # column below is not written onto a slice (SettingWithCopyWarning).
    actual = (
        actual[actual['age'].between(15, 50)]
        .dropna(subset=['runs', 'wickets'])
        .copy()
    )
    if actual.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column.
        actual['playertype'] = pd.Series(index=actual.index, dtype=object)
    else:
        actual['playertype'] = actual.apply(classify_player, axis=1)

    result_df = validate_results(expected, actual)
    result_df.to_csv('test_result.csv', index=False)
    print("Validation complete. Results saved to test_result.csv.")

# Entry point: run the validation when this code is executed as the top-level
# module (not when it is imported).
if __name__ == '__main__':
    main()


Validation complete. Results saved to test_result.csv.
