# Using PySpark to manipulate data about the Olympic Games

In [1]:
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
    .master('local') \
    .appName('myFirstSession') \
    .getOrCreate()

24/09/13 19:18:06 WARN Utils: Your hostname, Daniels-MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 192.168.0.6 instead (on interface en0)
24/09/13 19:18:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/13 19:18:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark.stop()

### Transformations and actions

In [11]:
from pyspark import SparkContext

sc = SparkContext(master='local', appName='TransAndActions')

rdd1 = sc.parallelize([1,2,3])
type(rdd1)

pyspark.rdd.RDD

In [12]:
rdd1.collect()

[1, 2, 3]

In [13]:
path = './files'

teamsRDD = sc.textFile(path+'/paises.csv') \
    .map(lambda line : line.split(','))

In [14]:
teamsRDD.take(5)

[['id', 'equipo', 'sigla'],
 ['1', '30. Februar', 'AUT'],
 ['2', 'A North American Team', 'MEX'],
 ['3', 'Acipactli', 'MEX'],
 ['4', 'Acturus', 'ARG']]

In [20]:
# It's better to do operations with tuples.
teamsRDD.map(lambda x: (x[2])).distinct().count() # We have 231 countries competing

231

In [23]:
# In this case since it's a groupBy, we put first the col, we want to group by,
# Here we take first the third col since we want to group by 'sigla'
teamsRDD.map(lambda x: (x[2], x[1])).groupByKey().mapValues(list).take(5)

[('sigla', ['equipo']),
 ('AUT',
  ['30. Februar',
   'Austria',
   'Austria-1',
   'Austria-2',
   'Breslau',
   'Brigantia',
   'Donar III',
   'Evita VI',
   'May-Be 1960',
   '"R.-V. Germania; Leitmeritz"',
   'Surprise']),
 ('MEX',
  ['A North American Team',
   'Acipactli',
   'Chamukina',
   'Mexico',
   'Mexico-1',
   'Mexico-2',
   'Nausikaa 4',
   'Tlaloc',
   'Xolotl']),
 ('ARG',
  ['Acturus',
   'Antares',
   'Arcturus',
   'Ardilla',
   'Argentina',
   'Argentina-1',
   'Argentina-2',
   'Blue Red',
   'Covunco III',
   'Cupidon III',
   'Djinn',
   'Gullvinge',
   'Matrero II',
   'Mizar',
   'Pampero',
   'Rampage',
   'Tango',
   'Wiking']),
 ('AFG', ['Afghanistan'])]

In [26]:
argentinaTeams = teamsRDD.filter(lambda l: 'ARG' in l)
argentinaTeams.collect()

[['4', 'Acturus', 'ARG'],
 ['37', 'Antares', 'ARG'],
 ['42', 'Arcturus', 'ARG'],
 ['43', 'Ardilla', 'ARG'],
 ['45', 'Argentina', 'ARG'],
 ['46', 'Argentina-1', 'ARG'],
 ['47', 'Argentina-2', 'ARG'],
 ['119', 'Blue Red', 'ARG'],
 ['238', 'Covunco III', 'ARG'],
 ['252', 'Cupidon III', 'ARG'],
 ['288', 'Djinn', 'ARG'],
 ['436', 'Gullvinge', 'ARG'],
 ['644', 'Matrero II', 'ARG'],
 ['672', 'Mizar', 'ARG'],
 ['774', 'Pampero', 'ARG'],
 ['843', 'Rampage', 'ARG'],
 ['1031', 'Tango', 'ARG'],
 ['1162', 'Wiking', 'ARG']]

### Checking participants in Olympic Games

In [28]:
sportmenRDD = sc.textFile(path+'/deportista.csv') \
    .map(lambda line : line.split(','))

sportmenRDD2 = sc.textFile(path+'/deportista2.csv') \
    .map(lambda line : line.split(','))

In [31]:
sportmenRDD = sportmenRDD.union(sportmenRDD2)
sportmenRDD.count()

203357

In [36]:
teamsRDD.top(1)

[['id', 'equipo', 'sigla']]

In [37]:
sportmenRDD.top(1)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id']]

In [39]:
# In order to do the join, I have to select the position of the col where 'equipo_id' exists.
sportmenRDD.map(lambda l: [l[-1], l[:-1]]) \
    .join(teamsRDD.map(lambda x : [x[0], x[2]] )) \
    .takeSample(False, 6, 25)

                                                                                

[('497', (['82495', 'Alaa Motar Hussein', '1', '23', '183', '75'], 'IRQ')),
 ('433',
  (['78422', 'Armando Melgar Retolaza', '1', '23', '172', '65'], 'GUA')),
 ('1096',
  (['128361', 'Mary Abigail Abby Wambach', '2', '24', '180', '81'], 'USA')),
 ('1019', (['3091', 'Alfred Altenburger', '1', '24', '0', '0'], 'SUI')),
 ('944', (['15020', 'Ale Brezavek', '1', '25', '184', '87'], 'SLO')),
 ('249',
  (['92707', 'Carlos Alberto Pedroso Curiel', '1', '33', '190', '88'], 'CUB'))]

In [42]:
sportmenRDD.filter(lambda l: '131505' in l).collect()

[['131505', 'Steven Woodburn', '1', '24', '185', '90', '362'],
 ['131505', 'Steven Woodburn', '1', '24', '185', '90', '362']]

### Let's see the number of medals

In [48]:
results = sc.textFile(path+'/resultados.csv') \
    .map(lambda line : line.split(','))

# We only want results that we're certain of    
winnerResults = results.filter(lambda l : 'NA' not in l[1])
winnerResults.take(5)

[['resultado_id', 'medalla', 'deportista_id', 'juego_id', 'evento_id'],
 ['4', 'Gold', '4', '2', '4'],
 ['38', 'Bronze', '15', '7', '19'],
 ['39', 'Bronze', '15', '7', '20'],
 ['41', 'Bronze', '16', '50', '14']]