### Integrantes do grupo:
- Carlos Zerwes Amado Sette
- Igor Alejandro Sousa Santos
- João Pedro Brandimarte Viccari

**Instruções iniciais**

*   Abra o link dos dados:
    * https://tinyurl.com/bigdata-mcu
*   Clique em "Adicionar atalho ao Drive"


# Solução

In [None]:
# Mount Google Drive in the Colab runtime so files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Configuração do ambiente

In [None]:
# !pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

from datetime import datetime

appName = 'Big Data'
master = 'local[*]'

# Create the session, or reuse the one that already exists in this kernel.
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

# Only log messages at WARN level or above.
spark.sparkContext.setLogLevel("WARN")

## Leitura de dados

In [None]:
# Usar esta entrada para testes
# input_data = spark.sparkContext.textFile('file:///content/drive/My Drive/mcu/mcu_subset.csv')

In [None]:
# Use this input for the final submission.
# Each RDD element is one raw text line of the CSV; the header row is included.
input_data = spark.sparkContext.textFile('file:///content/drive/My Drive/mcu/mcu.csv')

In [None]:
input_data.take(10)

[';character;line;movie;year;words;Adam McKay;Anna Boden;Art Marcum;Ashley Edward Miller;Chris McKenna;Christopher Ford;Christopher Markus;Christopher Yost;Craig Kyle;Don Payne;Drew Pearce;Edgar Wright;Eric Pearson;Erik Sommers;Geneva Robertson-Dworet;Hawk Ostby;James Gunn;Joe Cornish;Joe Robert Cole;John Francis Daley;Jon Watts;Jonathan Goldstein;Joss Whedon;Justin Theroux;Mark Fergus;Matt Holloway;Paul Rudd;Ryan Coogler;Ryan Fleck;Shane Black;Stephen McFeely;Zack Stentz',
 '0;TONY STARK;Oh, I get it.  You guys aren’t allowed to talk.  Is that it?  Are you not allowed to talk?;Iron Man;2008;22;False;False;True;False;False;False;False;False;False;False;False;False;False;False;False;True;False;False;False;False;False;False;False;False;True;True;False;False;False;False;False;False',
 '1;IRON MAN JIMMY;No.  We’re allowed to talk.;Iron Man;2008;6;False;False;True;False;False;False;False;False;False;False;False;False;False;False;False;True;False;False;False;False;False;False;False;False;Tru

In [None]:
# Count every raw line of the input RDD (this is an action, so it triggers the read).
cnt = input_data.count()

print("Total de linhas:",cnt)

Total de linhas: 15737


In [None]:
input_data.getNumPartitions()

2

In [None]:
# Criar função para separar linhas do arquivo lido em chaves {'character':'line'}
def char_line(linha):
  """Extract the (character, line) pair from one raw ';'-separated record.

  The second field (character name) is lower-cased; the third field (the
  spoken line) is returned unchanged.
  """
  partes = linha.split(';')
  return (partes[1].lower(), partes[2])


In [None]:
char_line("98;JAMES RHODES;Something’s...seriously wrong with you, man. ;Iron Man;2008;6;False;False;True;False;False;False;False;False;False;False;False;False;False;False;False;True;False;False;False;False;False;False;False;False;True;True;False;False;False;False;False;False")

('james rhodes', 'Something’s...seriously wrong with you, man. ')

## Exemplo de uso do pipeline

In [None]:
from transformers import pipeline

# Download and configure the model pipeline.
# The checkpoint and revision are pinned explicitly: they are the ones the
# library reported selecting by default for this task, and pinning them keeps
# the results reproducible if that default ever changes.
sentiment = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='714eb0f',
    device=0,
)



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [None]:
result = sentiment("I am Iron Man")


In [None]:
result

[{'label': 'POSITIVE', 'score': 0.999076247215271}]

In [None]:
result[0]['label']

'POSITIVE'

In [None]:
# Criar função para converter POSITIVE em 1 e NEGATIVE em -1
def convert_label(dict_label):
  """Convert the label of a sentiment result to a number, in place.

  Args:
    dict_label: list whose first element is a dict with a 'label' key, e.g.
      [{'label': 'POSITIVE', 'score': 0.99}].

  The first entry's 'label' becomes 1 for 'POSITIVE' and -1 for any other
  textual label. A label that is already 1 or -1 is left untouched, so calling
  this twice on the same result (for example by re-running a cell) no longer
  flips a positive label to -1.

  Returns:
    None; the input list is modified in place.
  """
  entry = dict_label[0]
  label = entry['label']

  # Already converted: keep the stored value instead of re-mapping it.
  if label in (1, -1):
    return

  entry['label'] = 1 if label == 'POSITIVE' else -1


In [None]:
convert_label(result)

In [None]:
result

[{'label': 1, 'score': 0.999076247215271}]

In [None]:
result_2 = sentiment("Guess what, Cap, we lost, and you weren’t there.")
result_2

[{'label': 'NEGATIVE', 'score': 0.999036431312561}]

In [None]:
convert_label(result_2)
print(result_2[0]['label'])
print(result_2[0]['score'])

-1
0.999036431312561


## Solução

In [None]:
# Inclua outros personagens de sua escolha
# characters = {'tony stark', 'steve rogers', 'thanos', 'bruce banner','hulk', 'jarvis', 'vision', 'peter parker', 'loki', 'pepper potts'}

In [None]:
import re

# Modifique a solução para implementar a função Map
def line_sentiment(line, characters=frozenset({
    'tony stark', 'steve rogers', 'thanos', 'bruce banner', 'hulk',
    'jarvis', 'vision', 'peter parker', 'loki', 'pepper potts'})):
  """Map step: score the sentiment of one raw ';'-separated record.

  Args:
    line: one raw text line of the input file.
    characters: lower-cased character names to analyse. Defaults to the ten
      characters chosen for this analysis.

  Yields:
    At most one pair (character, (result, 1)), where `result` is the list
    returned by the notebook-level `sentiment` pipeline after `convert_label`
    has been applied to it. Nothing is yielded for lines with fewer than three
    fields or for characters outside `characters`.
  """
  campos = line.split(";")

  # Lines with fewer than three fields have no character/line columns to read;
  # skip them instead of raising IndexError.
  if len(campos) < 3:
    return

  personagem = campos[1].lower()
  if personagem not in characters:
    return

  polaridade = sentiment(campos[2])
  convert_label(polaridade)  # mutates `polaridade` in place
  yield (personagem, (polaridade, 1))

In [None]:
result = line_sentiment("98;TONY STARK;Something’s...seriously wrong with you, man. ;Iron Man;2008;6;False;False;True;False;False;False;False;False;False;False;False;False;False;False;False;True;False;False;False;False;False;False;False;False;True;True;False;False;False;False;False;False")
print(list(result))

[('tony stark', ([{'label': -1, 'score': 0.9991362690925598}], 1))]


In [None]:
# Map step: apply line_sentiment to every input line. flatMap flattens the
# generator output, so lines that yield nothing simply disappear from `s`.
s = input_data.flatMap(line_sentiment)

In [None]:
s.take(10)

[('tony stark', ([{'label': -1, 'score': 0.9937212467193604}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.999362051486969}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.9592078924179077}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.9994257688522339}], 1)),
 ('tony stark', ([{'label': -1, 'score': 0.914114236831665}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.9987884163856506}], 1)),
 ('tony stark', ([{'label': -1, 'score': 0.9971954822540283}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.9401814341545105}], 1)),
 ('tony stark', ([{'label': 1, 'score': 0.9965348243713379}], 1)),
 ('tony stark', ([{'label': -1, 'score': 0.9705791473388672}], 1))]

In [None]:
# testando como acessar os indices
data = [('tony stark', ([{'label': -1, 'score': 0.9881800413131714}], 1))]
# print(data[0])
# print(data[0][1])
# print(data[0][1][0])
# print(data[0][1][1])
# print(data[0][1][0][0])
# print(data[0][1][0][0]['label'])


In [None]:
# Implemente e aplique um método reduce para acumulação dos sentimentos dos personagens
def accumulate_sentiment(acc,v):
  """Reduce step: add two (score, count) pairs element-wise.

  Both arguments are indexable pairs; the result is a new tuple holding the
  summed scores and the summed counts.
  """
  soma_nota = acc[0] + v[0]
  soma_contagem = acc[1] + v[1]
  return (soma_nota, soma_contagem)

In [None]:
# Keep only the numeric label from each pipeline result: (result, count) -> (label, count).
rdd_simplificado = s.mapValues(lambda x: (x[0][0]['label'], x[1]))
# Sum labels and counts per character.
m = rdd_simplificado.reduceByKey(accumulate_sentiment)

In [None]:
m.take(10)

[('jarvis', (-29, 135)),
 ('tony stark', (-156, 1788)),
 ('bruce banner', (-113, 379)),
 ('thanos', (-11, 109)),
 ('vision', (-2, 76)),
 ('loki', (-11, 327)),
 ('pepper potts', (-58, 442)),
 ('hulk', (0, 44)),
 ('peter parker', (-9, 551)),
 ('steve rogers', (-147, 977))]

In [None]:
# Implemente e aplique um método para calculo do sentimento médio
def mean_sentiment(v, ndigits=1):
  """Compute the rounded mean sentiment from an accumulated (score, count) pair.

  Args:
    v: pair whose first element is the summed sentiment and whose second
      element is the number of lines that were summed.
    ndigits: number of decimal places kept by the rounding (default 1).

  Returns:
    The mean rounded to `ndigits` places. A result that rounds to zero is
    returned as 0.0, never as the negative zero -0.0.

  Raises:
    ZeroDivisionError: if the count is 0.
  """
  acc_sentiment = v[0]
  acc_count = v[1]
  # Adding 0.0 turns a negative zero produced by rounding a small negative
  # mean into a plain 0.0; every other value is unchanged.
  return round(acc_sentiment / acc_count, ndigits) + 0.0


In [None]:
# Turn each character's accumulated (score, count) pair into its mean sentiment.
medias = m.mapValues(mean_sentiment)

In [None]:
medias.take(10)

[('jarvis', -0.2),
 ('tony stark', -0.1),
 ('hulk', 0.0),
 ('peter parker', -0.0),
 ('steve rogers', -0.2),
 ('thanos', -0.1),
 ('bruce banner', -0.3),
 ('vision', -0.0),
 ('loki', -0.0),
 ('pepper potts', -0.1)]

In [None]:
# Verificando se temos 10 entradas (10 personagens)
medias.count()

10

# Resultado Final


Apresente o resultado final da sua análise completa.

In [None]:
# Print the final result: collect() returns every (character, mean sentiment) pair to the driver.
medias.collect()

[('jarvis', -0.2),
 ('tony stark', -0.1),
 ('thanos', -0.1),
 ('bruce banner', -0.3),
 ('vision', -0.0),
 ('loki', -0.0),
 ('pepper potts', -0.1),
 ('hulk', 0.0),
 ('peter parker', -0.0),
 ('steve rogers', -0.2)]