
## Instalando o PySpark no Google Colab



In [1]:
!apt-get update

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/graphics

In [2]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

A próxima etapa é configurar as variáveis de ambiente, pois isso habilita o ambiente do Colab a identificar corretamente onde as dependências estão rodando.

Para conseguir “manipular” o terminal e interagir como ele, você pode usar a biblioteca os.

In [3]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# tornar o pyspark "importável"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

import pyspark

In [4]:
!git clone https://github.com/RafaelDaddio/BigDataAulasPUC.git

fatal: destination path 'BigDataAulasPUC' already exists and is not an empty directory.


In [5]:
#sc.stop()

In [6]:
#sc = pyspark.SparkContext.getOrCreate()
sc = pyspark.SparkContext(appName='BigDataTarefa2')

In [7]:
flights_file = '/content/flights.csv'

flights_RDD = sc.textFile(flights_file)

In [8]:
# a. Tempo total de voo de cada companhia
def TempoTotalVooCia(line):
    record = line.split(',')
    return (record[1],float(record[9]))

total_airports_visited = flights_RDD.map(TempoTotalVooCia).reduceByKey(lambda a,b: a+b).sortByKey()

for key, values in total_airports_visited.collect():
    print(f'{key}      {values}')

19393      9959718.0
19690      449242.0
19790      7747256.0
19805      6273011.0
19930      2001295.0
19977      6757883.0
20304      3625141.0
20355      4194811.0
20366      4142864.0
20398      2391168.0
20409      3009096.0
20436      689589.0
20437      799733.0
21171      916098.0
99999      1000.0


In [9]:
# b. Destino mais visado
airports_visited = {}

def most_visited(line):
    record = line.split(',')
    return (record[4], int(1))

airports_visited_RDD = flights_RDD.map(most_visited)
total_airports_visited = airports_visited_RDD.reduceByKey(lambda a,b: a+b).collect()

for airports in total_airports_visited:
    airports_visited[airports[0]] = airports[1]

max_count = int(max(airports_visited.values()))
airport_most_visited = list(airports_visited.keys())[list(airports_visited.values()).index(max_count)]

print(f'Destino mais visado: {airport_most_visited}, com {max_count} visitas.')

Destino mais visado: ATL, com 30953 visitas.


In [10]:
# c. Quais aeroportos cada companhia passou
def aeroportosOrigem(line):
    record = line.split(',')
    return (record[1],record[3])

def aeroportosDestino(line):
    record = line.split(',')
    return (record[1],record[4])

listaOrigem = flights_RDD.map(aeroportosOrigem).take(90000)
listaDestino = flights_RDD.map(aeroportosDestino).take(90000)

listaOrigem_RDD = sc.parallelize(listaOrigem)
listaDestino_RDD = sc.parallelize(listaDestino)

grouped_elementsDestino = listaOrigem_RDD.union(listaDestino_RDD).distinct().groupByKey().take(90000)
for key, values in grouped_elementsDestino:    
    print(key, list(values))

19930 ['DCA', 'EWR', 'BOS', 'PDX', 'FLL', 'ADQ', 'FAI', 'PSP', 'SNA', 'ONT', 'PHX', 'AUS', 'ATL', 'PHL', 'ANC', 'SCC', 'BRW', 'KTN', 'CDV', 'SIT', 'WRG', 'PSG', 'SAN', 'OAK', 'SMF', 'BUR', 'BLI', 'DFW', 'SLC', 'STL', 'SEA', 'LAX', 'ORD', 'MSP', 'JNU', 'YAK', 'OME', 'OTZ', 'SFO', 'LAS', 'DEN', 'IAH', 'KOA', 'ADK', 'MCO', 'BET', 'SJC', 'TUS', 'SAT', 'GEG', 'MCI', 'OGG', 'LIH', 'HNL']
20409 ['DCA', 'FLL', 'BOS', 'SJU', 'SWF', 'EWR', 'JAX', 'CLT', 'TPA', 'HOU', 'BTV', 'CHS', 'DTW', 'RIC', 'SAV', 'AUS', 'BQN', 'PVD', 'PHX', 'PDX', 'SRQ', 'PHL', 'BUF', 'STX', 'HPN', 'DFW', 'BWI', 'SAN', 'PWM', 'LGA', 'OAK', 'SMF', 'SLC', 'ABQ', 'BUR', 'STT', 'RSW', 'PBI', 'IAD', 'ORH', 'PIT', 'ORD', 'ROC', 'LAS', 'LGB', 'DEN', 'SEA', 'SFO', 'LAX', 'JFK', 'MCO', 'MSY', 'RDU', 'SYR', 'BDL', 'PSE', 'SJC']
21171 ['EWR', 'AUS', 'DCA', 'PSP', 'FLL', 'BOS', 'PDX', 'PHL', 'SAN', 'DFW', 'LAX', 'SFO', 'SEA', 'ORD', 'LAS', 'IAD', 'JFK', 'SJC', 'MCO']
19805 ['DFW', 'MIA', 'LGA', 'SLC', 'STL', 'BWI', 'SAN', 'ABQ', 'RNO',

In [11]:
# d. Vôo de maior distância de cada companhia
def DistanciasPorCia(line):
    record = line.split(',')
    return (record[1],float(record[10]))

distancias = flights_RDD.map(DistanciasPorCia).collect()
maxDist_RDD = sc.parallelize(distancias).reduceByKey(max).sortByKey()

print('Companhia  Maior distância')
for key, values in maxDist_RDD.collect():
    print(f'{key}      {values}')

Companhia  Maior distância
19393      2335.0
19690      4983.0
19790      4502.0
19805      3784.0
19930      2874.0
19977      4962.0
20304      1535.0
20355      2979.0
20366      1389.0
20398      1379.0
20409      2704.0
20436      1703.0
20437      2139.0
21171      2704.0
99999      11.0


In [12]:
# e. Qual é o vôo mais frequente de cada companhia
def voos(line):
    record = line.split(',')
    return (record[1], record[3]+record[4])

lista = flights_RDD.map(voos).collect()
lista_RDD = sc.parallelize(lista)

grouped_elements = lista_RDD.groupByKey().sortByKey()

dic_voos = {}
dic_count = {}
for i in grouped_elements.collect():
    dic_voos[i[0]] = list(i[1])
    
for cia in dic_voos.keys():
    for id_flight in dic_voos[cia]:
        if id_flight in dic_count:
            dic_count[id_flight] += 1
        else:
            dic_count[id_flight] = 1
            
    max_count = int(max(dic_count.values()))
    id_flight_max = list(dic_count.keys())[list(dic_count.values()).index(max_count)]
    dic_count.clear()
    
    print(f'Companhia: {cia} Origem: {id_flight_max[0:3]} Destino: {id_flight_max[3:6]} Qtd: {max_count}.')

Companhia: 19393 Origem: DAL Destino: HOU Qtd: 664.
Companhia: 19690 Origem: OGG Destino: HNL Qtd: 786.
Companhia: 19790 Origem: LGA Destino: ATL Qtd: 492.
Companhia: 19805 Origem: DFW Destino: LAX Qtd: 496.
Companhia: 19930 Origem: LAX Destino: SEA Qtd: 363.
Companhia: 19977 Origem: SFO Destino: ORD Qtd: 403.
Companhia: 20304 Origem: SAN Destino: LAX Qtd: 652.
Companhia: 20355 Origem: BOS Destino: DCA Qtd: 432.
Companhia: 20366 Origem: ORD Destino: CLE Qtd: 257.
Companhia: 20398 Origem: ORD Destino: CMH Qtd: 253.
Companhia: 20409 Origem: MCO Destino: JFK Qtd: 294.
Companhia: 20436 Origem: DEN Destino: LAS Qtd: 175.
Companhia: 20437 Origem: DCA Destino: ATL Qtd: 149.
Companhia: 21171 Origem: SFO Destino: LAX Qtd: 246.
Companhia: 99999 Origem: ABC Destino: CSL Qtd: 2.


In [37]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

shakespeare_file = '/content/shakespeare.txt'

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
text_file = sc.textFile(shakespeare_file)
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
print(counts)
for t in counts.take(10):
    print(f'{t}')

PythonRDD[146] at RDD at PythonRDD.scala:53
('hamlet@9\t', 1)
('hamlet@10\t\tDRAMATIS', 1)
('PERSONAE', 21)
('hamlet@30\t', 1)
('of', 10001)
('(KING', 8)
('CLAUDIUS:)', 1)
('late,', 20)
('nephew', 9)
('hamlet@132\tPOLONIUS\tlord', 1)


In [40]:
def tokenizer(line):
    for token in nltk.word_tokenize(line):
        if token not in stopwords and token not in punctuation:
            yield token.lower()

shakespeare_RDD = sc.textFile(shakespeare_file)
tokens_RDD = shakespeare_RDD.flatMap(tokenizer)
key_tokens_RDD = tokens_RDD.map(lambda x: (x,1))
token_counts = key_tokens_RDD.reduceByKey(lambda a, b: a+b).sortByKey()

print(token_counts.collect())

