In [6]:
# Install Apache Beam
!pip install apache-beam

# Import libraries
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import pandas as pd

# Round-trip the cleaned dataset through pandas: read it, then write it back
# to the same path without the DataFrame index so the Beam pipeline reads a
# plain CSV with a single header row.
DATASET_PATH = 'player_statistics_cleaned_final.csv'
df = pd.read_csv(DATASET_PATH)
df.to_csv(DATASET_PATH, index=False)

# Pipeline options, constructed with no explicit arguments
pipeline_options = PipelineOptions()

# Define the Apache Beam pipeline
with beam.Pipeline(options=pipeline_options) as pipeline:

    # Pipeline I/O - read the dataset CSV as lines of text (skipping the
    # header row) and split every line into a list of column strings.
    # The path must carry the '.csv' extension: ReadFromText treats its
    # argument as a file pattern, and the bare stem does not match the file
    # that was written to disk.
    # NOTE(review): a plain split(',') does not honour quoted fields that
    # contain commas - confirm the dataset has none, or switch to the csv module.
    data = (
        pipeline
        | 'Read CSV' >> beam.io.ReadFromText(
            'player_statistics_cleaned_final.csv', skip_header_lines=1)
        | 'Split Columns' >> beam.Map(lambda line: line.split(','))
    )

    # Element-wise transform - extract the player, stat and numeric value fields
    class ExtractFields(beam.DoFn):
        """Convert a split CSV row into a dict with player, stat and value.

        Rows whose third column is missing or cannot be parsed as a float are
        dropped instead of failing the whole pipeline.
        """

        def process(self, record):
            """Yield one dict for a well-formed row, nothing otherwise.

            Args:
                record: Sequence of column strings for a single CSV row.

            Yields:
                dict with keys 'player' (column 0), 'stat' (column 1) and
                'value' (column 2 converted to float).
            """
            try:
                value = float(record[2])
            except (ValueError, IndexError):
                # ValueError: the third column is not a number.
                # IndexError: the row has fewer than three columns
                # (for example a blank line in the input).
                return
            yield {'player': record[0], 'stat': record[1], 'value': value}

    transformed_data = data | 'Extract Fields' >> beam.ParDo(ExtractFields())

    # Applying ParDo to keep only the records whose value exceeds a threshold
    class ProcessPlayerData(beam.DoFn):
        """Pass through records whose 'value' is strictly above a threshold.

        Args:
            min_value: Exclusive lower bound for record['value']. Defaults to
                10, the threshold this step used when it was hard-coded.
        """

        def __init__(self, min_value=10):
            super().__init__()
            self.min_value = min_value

        def process(self, record):
            """Yield ``record`` unchanged when its 'value' exceeds the bound."""
            if record['value'] > self.min_value:
                yield record

    processed_data = transformed_data | 'Process Data' >> beam.ParDo(ProcessPlayerData())

    # Persist the filtered records as text under the 'output/player_stats'
    # file-name prefix.
    text_sink = beam.io.WriteToText('output/player_stats')
    processed_data | 'Write to Text' >> text_sink

# After running the pipeline, display the output file content:
# read and print the results from every output text shard.
import glob

# glob returns matches in arbitrary order; sort so the shards are printed in
# a stable, reproducible sequence.
output_files = sorted(glob.glob('output/player_stats-*-of-*'))
for file_name in output_files:
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which can mis-decode non-ASCII text on some systems.
    with open(file_name, 'r', encoding='utf-8') as shard_file:
        print(shard_file.read())






