
## Instalando o PySpark no Google Colab



In [1]:
!apt-get update

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://security.ubuntu.com/ubuntu bionic-securi

In [2]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

A próxima etapa é configurar as variáveis de ambiente, pois isso habilita o ambiente do Colab a identificar corretamente onde as dependências estão rodando.

Para conseguir “manipular” o terminal e interagir como ele, você pode usar a biblioteca os.

In [3]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# tornar o pyspark "importável"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

import pyspark

In [4]:
!git clone https://github.com/RafaelDaddio/BigDataAulasPUC.git

Cloning into 'BigDataAulasPUC'...
remote: Enumerating objects: 124, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 124 (delta 12), reused 21 (delta 8), pack-reused 94[K
Receiving objects: 100% (124/124), 18.85 MiB | 7.71 MiB/s, done.
Resolving deltas: 100% (33/33), done.


In [5]:
#sc.stop()

In [6]:
#sc = pyspark.SparkContext.getOrCreate()
sc = pyspark.SparkContext(appName='BigDataTarefa2')

In [7]:
flights_file = '/content/flights.csv'

flights_RDD = sc.textFile(flights_file)

In [8]:
# a. Tempo total de voo de cada companhia
def TempoTotalVooCia(line):
    record = line.split(',')
    return (record[1],float(record[9]))

total_airports_visited = flights_RDD.map(TempoTotalVooCia).reduceByKey(lambda a,b: a+b).sortByKey()

for key, values in total_airports_visited.collect():
    print(f'{key}      {values}')

19393      9959718.0
19690      449242.0
19790      7747256.0
19805      6273011.0
19930      2001295.0
19977      6757883.0
20304      3625141.0
20355      4194811.0
20366      4142864.0
20398      2391168.0
20409      3009096.0
20436      689589.0
20437      799733.0
21171      916098.0
99999      1000.0


In [9]:
# b. Destino mais visado
airports_visited = {}

def most_visited(line):
    record = line.split(',')
    return (record[4], int(1))

airports_visited_RDD = flights_RDD.map(most_visited)
total_airports_visited = airports_visited_RDD.reduceByKey(lambda a,b: a+b).collect()

for airports in total_airports_visited:
    airports_visited[airports[0]] = airports[1]

max_count = int(max(airports_visited.values()))
airport_most_visited = list(airports_visited.keys())[list(airports_visited.values()).index(max_count)]

print(f'Destino mais visado: {airport_most_visited}, com {max_count} visitas.')

Destino mais visado: ATL, com 30953 visitas.


In [10]:
# c. Quais aeroportos cada companhia passou
def aeroportosOrigem(line):
    record = line.split(',')
    return (record[1],record[3])

def aeroportosDestino(line):
    record = line.split(',')
    return (record[1],record[4])

listaOrigem = flights_RDD.map(aeroportosOrigem).take(90000)
listaDestino = flights_RDD.map(aeroportosDestino).take(90000)

listaOrigem_RDD = sc.parallelize(listaOrigem)
listaDestino_RDD = sc.parallelize(listaDestino)

grouped_elementsDestino = listaOrigem_RDD.union(listaDestino_RDD).distinct().groupByKey().take(90000)
for key, values in grouped_elementsDestino:    
    print(key, list(values))

19930 ['DCA', 'EWR', 'BOS', 'PDX', 'FLL', 'ADQ', 'FAI', 'PSP', 'SNA', 'ONT', 'PHX', 'AUS', 'ATL', 'PHL', 'ANC', 'SCC', 'BRW', 'KTN', 'CDV', 'SIT', 'WRG', 'PSG', 'SAN', 'OAK', 'SMF', 'BUR', 'BLI', 'DFW', 'SLC', 'STL', 'SEA', 'LAX', 'ORD', 'MSP', 'JNU', 'YAK', 'OME', 'OTZ', 'SFO', 'LAS', 'DEN', 'IAH', 'KOA', 'ADK', 'MCO', 'BET', 'SJC', 'TUS', 'SAT', 'GEG', 'MCI', 'OGG', 'LIH', 'HNL']
20409 ['DCA', 'FLL', 'BOS', 'SJU', 'SWF', 'EWR', 'JAX', 'CLT', 'TPA', 'HOU', 'BTV', 'CHS', 'DTW', 'RIC', 'SAV', 'AUS', 'BQN', 'PVD', 'PHX', 'PDX', 'SRQ', 'PHL', 'BUF', 'STX', 'HPN', 'DFW', 'BWI', 'SAN', 'PWM', 'LGA', 'OAK', 'SMF', 'SLC', 'ABQ', 'BUR', 'STT', 'RSW', 'PBI', 'IAD', 'ORH', 'PIT', 'ORD', 'ROC', 'LAS', 'LGB', 'DEN', 'SEA', 'SFO', 'LAX', 'JFK', 'MCO', 'MSY', 'RDU', 'SYR', 'BDL', 'PSE', 'SJC']
21171 ['EWR', 'AUS', 'DCA', 'PSP', 'FLL', 'BOS', 'PDX', 'PHL', 'SAN', 'DFW', 'LAX', 'SFO', 'SEA', 'ORD', 'LAS', 'IAD', 'JFK', 'SJC', 'MCO']
19805 ['DFW', 'MIA', 'LGA', 'SLC', 'STL', 'BWI', 'SAN', 'ABQ', 'RNO',

In [11]:
# d. Vôo de maior distância de cada companhia
def DistanciasPorCia(line):
    record = line.split(',')
    return (record[1],float(record[10]))

distancias = flights_RDD.map(DistanciasPorCia).collect()
maxDist_RDD = sc.parallelize(distancias).reduceByKey(max).sortByKey()

print('Companhia  Maior distância')
for key, values in maxDist_RDD.collect():
    print(f'{key}      {values}')

Companhia  Maior distância
19393      2335.0
19690      4983.0
19790      4502.0
19805      3784.0
19930      2874.0
19977      4962.0
20304      1535.0
20355      2979.0
20366      1389.0
20398      1379.0
20409      2704.0
20436      1703.0
20437      2139.0
21171      2704.0
99999      11.0


In [12]:
# e. Qual é o vôo mais frequente de cada companhia
def voos(line):
    record = line.split(',')
    return (record[1], record[3]+record[4])

lista = flights_RDD.map(voos).collect()
lista_RDD = sc.parallelize(lista)

grouped_elements = lista_RDD.groupByKey().sortByKey()

dic_voos = {}
dic_count = {}
for i in grouped_elements.collect():
    dic_voos[i[0]] = list(i[1])
    
for cia in dic_voos.keys():
    for id_flight in dic_voos[cia]:
        if id_flight in dic_count:
            dic_count[id_flight] += 1
        else:
            dic_count[id_flight] = 1
            
    max_count = int(max(dic_count.values()))
    id_flight_max = list(dic_count.keys())[list(dic_count.values()).index(max_count)]
    dic_count.clear()
    
    print(f'Companhia: {cia} Origem: {id_flight_max[0:3]} Destino: {id_flight_max[3:6]} Qtd: {max_count}.')

Companhia: 19393 Origem: DAL Destino: HOU Qtd: 664.
Companhia: 19690 Origem: OGG Destino: HNL Qtd: 786.
Companhia: 19790 Origem: LGA Destino: ATL Qtd: 492.
Companhia: 19805 Origem: DFW Destino: LAX Qtd: 496.
Companhia: 19930 Origem: LAX Destino: SEA Qtd: 363.
Companhia: 19977 Origem: SFO Destino: ORD Qtd: 403.
Companhia: 20304 Origem: SAN Destino: LAX Qtd: 652.
Companhia: 20355 Origem: BOS Destino: DCA Qtd: 432.
Companhia: 20366 Origem: ORD Destino: CLE Qtd: 257.
Companhia: 20398 Origem: ORD Destino: CMH Qtd: 253.
Companhia: 20409 Origem: MCO Destino: JFK Qtd: 294.
Companhia: 20436 Origem: DEN Destino: LAS Qtd: 175.
Companhia: 20437 Origem: DCA Destino: ATL Qtd: 149.
Companhia: 21171 Origem: SFO Destino: LAX Qtd: 246.
Companhia: 99999 Origem: ABC Destino: CSL Qtd: 2.


In [21]:
# 2 - construir indice invertindo contendo como chave uma palavra e como valor uma lista de linhas onde ela foi menscionada

import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

shakespeare_file = '/content/shakespeare.txt'
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

def tokenizer(line):
    lines = line.split('\t',1)
    for token in lines[1].strip().replace('\t', '').split():
        if token not in stopwords and token not in punctuation:
            yield token, lines[0]

shake_RDD = sc.textFile(shakespeare_file)
word_RDD = shake_RDD.flatMap(tokenizer).groupByKey().sortByKey()

for word, lines in word_RDD.collect():
    print(word, list(lines))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
eyes:--his ['coriolanus@129931']
eyes; ['hamlet@45604', 'hamlet@122057', 'kingrichardiii@140023', 'cymbeline@89352', 'troilusandcressida@86432', 'antonyandcleopatra@100271', 'kinglear@100167', 'kinglear@103722', 'kinglear@112592', 'kinglear@125740', 'kinglear@141891', 'othello@116346', 'kinghenryv@137126', 'kinghenryv@151462', '2kinghenryvi@117405', 'winterstale@130172', '1kinghenryiv@87413', '1kinghenryiv@126611', 'romeoandjuliet@10648', 'romeoandjuliet@12337', 'kingrichardii@85669', 'measureforemeasure@98795', 'loveslabourslost@35385', 'loveslabourslost@43571', 'loveslabourslost@106727', 'titusandronicus@63601', 'muchadoaboutnothing@115412', 'merchantofvenice@24663', 'kingjohn@76715', 'juliuscaesar@115983', 'timonofathens@76755', 'periclesprinceoftyre@24321', 'twogentlemenofverona@21251', 'twogentlemenofverona@91313', 'tempest@5370', 'midsummersnightsdream@13280', 'midsummersnightsdream@28355', 'midsummersnightsdream@43

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



increasing, ['tempest@74959']
increasing: ['antonyandcleopatra@37977']
increasing; ['venusandadonis@12258']
incredible ['tamingoftheshrew@53546']
incredulous ['2kinghenryiv@127572', 'twelfthnight@72959']
incumbent ['glossary@48606']
incur ['kingrichardiii@109721', 'cymbeline@6492', 'othello@13041', 'othello@73203', 'winterstale@38173', 'tamingoftheshrew@13158', 'rapeoflucrece@67720']
incurable ['troilusandcressida@127764', 'kingjohn@98608']
incurable,-- ['allswellthatendswell@40645']
incurable. ['2kinghenryiv@25220']
incurr'd ['kinglear@140953', 'merchantofvenice@102762']
incurred ['allswellthatendswell@92529']
incursions ['1kinghenryiv@88663']
incursions, ['troilusandcressida@39423']
indebted ['2kinghenryvi@32242']
indebted, ['merchantofvenice@105192']
indeed ['hamlet@13164', 'hamlet@28804', 'hamlet@54311', 'hamlet@58967', 'hamlet@60956', 'hamlet@73173', 'hamlet@128528', 'hamlet@144488', 'kingrichardiii@87741', 'kingrichardiii@100346', 'kingrichardiii@112315', 'kingrichardiii@120013',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
shrewishly; ['twelfthnight@20867']
shrewishness; ['midsummersnightsdream@58405']
shrews, ['2kinghenryiv@143999']
shriek ['troilusandcressida@48245', 'romeoandjuliet@138840', 'kingrichardii@84661', 'juliuscaesar@43578', 'rapeoflucrece@16359']
shriek'd ['3kinghenryvi@143467']
shriek'd, ['macbeth@27799']
shriek, ['winterstale@121320']
shriek; ['midsummersnightsdream@15085']
shrieked ['merrywivesofwindsor@13572']
shrieking ['various@16975', 'loverscomplaint@857']
shrieking, ['tempest@94363']
shrieking. ['troilusandcressida@88024', 'juliuscaesar@20780']
shrieks ['winterstale@63103', 'romeoandjuliet@114773', 'macbeth@83361']
shrieks, ['periclesprinceoftyre@51682', 'venusandadonis@24650']
shrieve's ['allswellthatendswell@100872']
shrift ['romeoandjuliet@59340', 'romeoandjuliet@64198', 'romeoandjuliet@110975', 'measureforemeasure@92593']
shrift. ['3kinghenryvi@75868', 'romeoandjuliet@9135', 'romeoandjuliet@49751']
shrift; ['kingr

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 ['kinghenryv@11195', '2kinghenryvi@116657', 'kinghenryviii@24391', 'asyoulikeit@74']
usurper's ['3kinghenryvi@7595', 'macbeth@104408']
usurper, ['kingjohn@19707']
usurper. ['2kinghenryvi@20662', '2kinghenryvi@28003']
usurpers ['3kinghenryvi@83947']
usurpers, ['asyoulikeit@31152']
usurping ['kingrichardiii@158595', 'kingrichardiii@165231', 'kinghenryviii@41061', '3kinghenryvi@5936', '3kinghenryvi@10254', '3kinghenryvi@48567', 'allswellthatendswell@97002', 'kingrichardii@66314', '1kinghenryvi@19867', 'loveslabourslost@72470', 'asyoulikeit@21241', 'kingjohn@19657', 'kingjohn@19763', 'kingjohn@44847', 'tempest@119']
usurpingly ['kingjohn@1767']
usurps ['kinglear@108181', '3kinghenryvi@117737']
usurps, ['3kinghenryvi@2995']
usury, ['coriolanus@5122', 'timonofathens@59645', 'sonnets@3484', 'loverscomplaint@1725']
ut,' ['tamingoftheshrew@61896']
utensil ['twelfthnight@24647']
utensils,--for ['tempest@61331']
utility. ['kinghenryv@138867']
utmost ['kingrichardiii@160125', 'coriolanus@14487', 