# Install Apache Beam using Pip

In [None]:
# Run and print a shell command.
# The command string is echoed first (prefixed with '>> '), then executed
# through IPython's `!` shell escape, and finally a blank line is printed
# to separate the output of consecutive commands.
# NOTE: `!{cmd}` is IPython syntax, so this only works inside a notebook/IPython.
def run(cmd):
  print('>> {}'.format(cmd))
  !{cmd}
  print('')

# Install apache-beam: first the core package, then the variant with the
# `gcp` extras. Each command is echoed and executed by `run`.
for pip_command in (
    'pip install --quiet apache-beam',
    'pip install --quiet apache-beam[gcp]',
):
    run(pip_command)

print("Finished installing prerequisites")

# Create a simple Pipeline just to make sure everything is working

### Note: 
If you get an error when running the following cell, then restart the runtime using the menu __Runtime | Restart runtime__. 

In [None]:
import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText

# Smoke test: create a small in-memory collection of names, title-case each
# one and print it. Each step is bound to a name so the stages read top-down.
with beam.Pipeline() as pipeline:
    names = pipeline | 'Create' >> beam.Create(['noir', 'bree', 'gigi', 'gretyl'])
    titled = names | 'Transform' >> beam.Map(str.title)
    titled | 'Print' >> beam.Map(print)

# Get the Data

Below we simply download the dataset from UCI. The data is saved to a CSV file named mpg.csv. 

After running the cell, click on the folder view to see the file. 

In [None]:
import pandas as pd

# Source of the raw Auto MPG data.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column labels applied to the downloaded data.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# Fields are space separated (runs of spaces after a delimiter are skipped),
# '?' marks a missing value, and anything after a tab is treated as a comment.
raw_dataset = pd.read_csv(
    url,
    sep=' ',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
    names=column_names,
)

# Written with pandas' default options.
raw_dataset.to_csv('mpg.csv')

### Just display the first few records. 

In [None]:
# IPython shell escape: show the first lines of mpg.csv with `head`.
! head mpg.csv

# Below is the Pipeline. The steps are: 

1. Read the file
2. Parse the CSV rows and return Tuples
3. One-hot encode the Origin Field
4. Convert the Tuples back into CSV rows
5. Save the results to a new file (mpg_tranformed-00000-of-00001.csv)

In [None]:
import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText


class Parse(beam.DoFn):
    """Parse one CSV line into a tuple of numeric values.

    Each input element is expected to be a comma-separated string with nine
    fields: a leading row label followed by MPG, Cylinders, Displacement,
    Horsepower, Weight, Acceleration, Model Year and Origin.
    """

    def process(self, element):
        """Yield the parsed record, or nothing if the line is invalid.

        Args:
            element: A single line of text from the CSV file.

        Yields:
            A tuple ``(MPG, Cylinders, Displacement, Horsepower, Weight,
            Acceleration, Model_Year, Origin)`` where MPG, Displacement,
            Horsepower, Weight and Acceleration are floats and the remaining
            values are ints. The first column of the line is dropped.
        """
        try:
            # The split/unpack lives inside the try block so that a line with
            # the wrong number of fields is discarded like any other invalid
            # record instead of aborting the whole pipeline.
            (_row, mpg, cylinders, displacement, horsepower, weight,
             acceleration, model_year, origin) = element.split(',')

            # Cast the strings to numeric values. A field that is not a valid
            # number (for example a header label or an empty value) raises
            # ValueError, which discards the record.
            record = (
                float(mpg),
                int(cylinders),
                float(displacement),
                float(horsepower),
                float(weight),
                float(acceleration),
                int(model_year),
                int(origin),
            )
        except ValueError:
            # Only malformed data is swallowed; any other error propagates.
            return

        yield record

class EncodeOrigin(beam.DoFn):
    """One-hot encode the Origin field (index 7) of a parsed record."""

    def process(self, element):
        """Yield the first seven values followed by USA, Europe, Japan flags.

        The values of Origin are 1, 2, or 3 (USA, Europe, or Japan); the flag
        whose code matches is 1 and the others are 0. Any other Origin value
        produces three zeros.
        """
        origin = element[7]
        one_hot = tuple(1 if origin == code else 0 for code in (1, 2, 3))
        yield element[:7] + one_hot

class Format(beam.DoFn):
    """Render a ten-value record as one comma-separated line."""

    def process(self, element):
        """Yield the first ten values of ``element`` joined by commas."""
        # Format the tuple as CSV for output to a file. The template names
        # positions 0-9 explicitly, so any extra values are ignored.
        template = "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}"
        yield template.format(*element)

class Format_as_csv(beam.DoFn):
    """Render a record of any length as one comma-separated line."""

    def process(self, element):
        """Yield every value of ``element`` converted to text and comma-joined."""
        # Format the tuple as CSV for output to a file.
        yield ','.join(str(value) for value in element)
      
       
filename = 'mpg.csv'

# Read the CSV lines, parse them into numeric tuples, one-hot encode Origin,
# turn the tuples back into CSV text and write the result to a new file.
with beam.Pipeline() as pipeline:
    lines = pipeline | 'Read' >> ReadFromText(filename)
    records = lines | 'Parse' >> beam.ParDo(Parse())
    encoded = records | '1-Hot Encode Origin' >> beam.ParDo(EncodeOrigin())
    csv_rows = encoded | 'Format' >> beam.ParDo(Format())
    csv_rows | 'Write' >> WriteToText('mpg_tranformed', file_name_suffix=".csv")


### See the results in the output file

In [None]:
# IPython shell escape: show the first lines of the pipeline's output file.
# The name is the 'mpg_tranformed' prefix plus a shard suffix and '.csv'.
! head mpg_tranformed-00000-of-00001.csv

# Read the Output file into a Pandas DataFrame

In [None]:
# Column labels for the transformed file: Origin has been replaced by the
# three one-hot columns USA, Europe and Japan.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
    'Acceleration', 'Model Year', 'USA', 'Europe', 'Japan',
]

dataset = pd.read_csv(
    'mpg_tranformed-00000-of-00001.csv',
    names=column_names,
    sep=',',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
)

# Display the first ten rows.
dataset.head(10)

### Use Pandas to Describe the File

In [None]:
# Summary statistics, transposed so that each column of the data is a row.
dataset.describe().T

### There shouldn't be any Null values, see if that is true

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

### Use Seaborn to plot the relationship between fields

In [None]:
import seaborn as sns
# Pairwise plots for a subset of the columns.
plot_columns = ['MPG', 'Cylinders', 'Horsepower']
sns.pairplot(dataset[plot_columns])