# MPG Cars

### Introduction:

The following exercise uses the Auto MPG dataset from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Auto+MPG).

### Step 1. Import the necessary libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Build the notebook's Spark session under the application name "cars".
spark = (
    SparkSession.builder
    .appName("cars")
    .getOrCreate()
)
spark

### Step 2. Import the two datasets, [cars1](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv) and [cars2](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv).

### Step 3. Assign each to a variable called cars1 and cars2

In [6]:
# Read the first CSV; the first line is the header and column types are inferred.
cars1 = spark.read.csv(
    path="cars1.csv",
    sep=",",
    header=True,
    inferSchema=True,
)
cars1.show(n=2)

+----+---------+------------+----------+------+------------+-----+------+--------------------+----+----+----+----+----+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car| _c9|_c10|_c11|_c12|_c13|
+----+---------+------------+----------+------+------------+-----+------+--------------------+----+----+----+----+----+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|null|null|null|null|null|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|null|null|null|null|null|
+----+---------+------------+----------+------+------------+-----+------+--------------------+----+----+----+----+----+
only showing top 2 rows



In [7]:
# Read the second CSV with the same settings: header row, inferred column types.
cars2 = spark.read.csv(
    path="cars2.csv",
    sep=",",
    header=True,
    inferSchema=True,
)
cars2.show(n=2)

+----+---------+------------+----------+------+------------+-----+------+--------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|           car|
+----+---------+------------+----------+------+------------+-----+------+--------------+
|33.0|        4|          91|        53|  1795|        17.4|   76|     3|   honda civic|
|20.0|        6|         225|       100|  3651|        17.7|   76|     1|dodge aspen se|
+----+---------+------------+----------+------+------------+-----+------+--------------+
only showing top 2 rows



### Step 4. Oops, it seems our first dataset has some unnamed blank columns, fix cars1

In [9]:
print(cars1.columns)

# Collect every column whose name begins with an underscore; the header-less
# trailing columns of cars1 came through as _c9 ... _c13.
cols_to_delete = [name for name in cars1.columns if name.startswith('_')]

print(cols_to_delete)

['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'origin', 'car', '_c9', '_c10', '_c11', '_c12', '_c13']
['_c9', '_c10', '_c11', '_c12', '_c13']


In [13]:
# Remove the blank columns found above; the list is unpacked because
# drop is called with one column name per argument.
cars1 = cars1.drop(*cols_to_delete)
cars1.show(n=2)

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
only showing top 2 rows



### Step 5. What is the number of observations in each dataset?

In [17]:
# Report "<rows> , <columns>" for each dataset.
for label, frame in (("cars1", cars1), ("cars2", cars2)):
    print(f"{label}:", frame.count(), ",", len(frame.columns))

cars1: 198 , 9
cars2: 200 , 9


### Step 6. Join cars1 and cars2 into a single DataFrame called cars

In [20]:
# List both headers so they can be compared before combining the frames.
for frame in (cars1, cars2):
    print(frame.columns)

['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'origin', 'car']
['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'origin', 'car']


In [22]:
# Stack cars2 underneath cars1. unionByName pairs columns by their names,
# whereas union pairs them by position.
cars = cars1.unionByName(cars2)
cars.show(n=2)

+----+---------+------------+----------+------+------------+-----+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model|origin|                 car|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
|18.0|        8|         307|       130|  3504|        12.0|   70|     1|chevrolet chevell...|
|15.0|        8|         350|       165|  3693|        11.5|   70|     1|   buick skylark 320|
+----+---------+------------+----------+------+------------+-----+------+--------------------+
only showing top 2 rows



In [23]:
# Sanity check: the combined frame should hold 198 + 200 = 398 rows.
cars.count()

398

### Step 7. Oops, there is a column missing, called owners. Create a Series of random integers from 15,000 to 73,000 (inclusive).

In [38]:
from pyspark.sql.functions import *
import numpy as np
from pyspark.sql.types import *

In [55]:
# First attempt: unpack the sampled array into a list and inspect the types.
# `high` is exclusive, so 73001 allows 73,000 itself to be drawn.
nr_owners = [
    *np.random.randint(low=15000, high=73001, size=398, dtype='int')
]
print(type(nr_owners), type(nr_owners[0]), sep="\n")

<class 'list'>
<class 'numpy.int64'>


In [61]:
# Draw one owner count per car: 398 values, matching cars.count() above.
# `high` is exclusive in randint, so 73001 keeps 73,000 attainable.
# A dedicated, seeded generator makes the column reproducible on a fresh
# Restart & Run All without touching NumPy's global random state.
owners_rng = np.random.RandomState(42)
owners_array = owners_rng.randint(15000, high=73001, size=398, dtype='int')

# list() over an ndarray keeps NumPy scalar elements ...
nr_owners = list(owners_array)
print(type(nr_owners))
print(type(nr_owners[0]))

# ... whereas ndarray.tolist() converts every element to a built-in int.
# NOTE(review): presumably the conversion is needed because the IntegerType
# DataFrame built from this list expects plain Python ints; confirm against
# the PySpark version in use.
nr_owners = owners_array.tolist()
print(type(nr_owners))
print(type(nr_owners[0]))

<class 'list'>
<class 'numpy.int64'>
<class 'list'>
<class 'int'>


In [62]:
# One-step variant: sample and convert to built-in ints in a single expression.
nr_owners1 = np.random.randint(low=15000, high=73001, size=398, dtype='int').tolist()
print(type(nr_owners1), type(nr_owners1[0]), sep="\n")

<class 'list'>
<class 'int'>


In [70]:
# Build a one-column DataFrame from the list of ints. The column arrives
# named "value", so it is renamed to "owners" straight away.
owners_df = (
    spark.createDataFrame(nr_owners, IntegerType())
    .withColumnRenamed("value", "owners")
)
owners_df.show(n=5)

+------+
|owners|
+------+
| 35821|
| 67889|
| 46770|
| 68824|
| 52944|
+------+
only showing top 5 rows



### Step 8. Add the column owners to cars

In [None]:
# Attach `owners` to `cars`.
# Spark DataFrames carry no positional index, so each frame first gets an
# explicit 1-based row number and the two frames are then joined on it.
from pyspark.sql import Window
from pyspark.sql import functions as F

# Both frames must pair up one-to-one; otherwise the inner join below would
# silently drop rows.
assert cars.count() == owners_df.count(), "owners must hold exactly one value per car"

# monotonically_increasing_id() yields increasing but non-consecutive ids;
# row_number() over that ordering turns them into 1, 2, 3, ...
# A window without partitionBy moves all rows into one partition, which is
# acceptable for a few hundred rows but not for large data.
# NOTE(review): the pairing follows each frame's current row order; since the
# owner values are random, the exact pairing is immaterial here.
row_order = Window.orderBy(F.monotonically_increasing_id())
cars_indexed = cars.withColumn("row_idx", F.row_number().over(row_order))
owners_indexed = owners_df.withColumn("row_idx", F.row_number().over(row_order))

# Stored under a new name so the cell can be re-run without stacking a second
# `owners` column onto `cars`.
cars_with_owners = (
    cars_indexed
    .join(owners_indexed, on="row_idx", how="inner")
    .orderBy("row_idx")
    .drop("row_idx")
)
cars_with_owners.show(5)