**Programa para o Twitter**

In [31]:
#cria a sessão a ser utilizada para estabelecer a conexão
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# Build the Spark session (or reuse one that already exists) under the
# application name "StructuredNetworkTwitterV02".
spark = (
    SparkSession.builder
    .appName("StructuredNetworkTwitterV02")
    .getOrCreate()
)

In [32]:
#bibliotecas utilizadas para realizar a análise dos textos
from textblob import TextBlob  #utilizada para realizar o processamento do texto e análise de sentimento
from googletrans import Translator #utilizado para traduzir textos 
from unidecode import unidecode  #utilizada para "decodificar caracteres" não textuais

In [33]:
# Streaming source: a DataFrame that reads every text line received on the
# socket at localhost, port 9995.
twitters = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9995)
    .load()
)

23/01/27 12:48:54 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [34]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, FloatType

In [35]:
# Translate a text (intended target language: English).
def translate_udf(col):
    """Translate ``col`` with googletrans and return the translated text.

    The call passes no explicit destination language, so the library default
    applies (presumably English, as the original comment states -- confirm
    against the installed googletrans version).

    Args:
        col: Text to translate, or None. The name shadows pyspark's ``col``
            helper inside this function only; it is kept unchanged so the
            signature stays compatible.

    Returns:
        The translated text. None and blank strings are returned unchanged,
        without calling the translation service.
    """
    # Nothing to translate: avoid handing None/blank text to the translator.
    if col is None or not col.strip():
        return col
    trans_obj = Translator().translate(col)
    return trans_obj.text

# Sentiment analysis of a text.
def sentiment_udf(col):
    """Return the TextBlob polarity score of ``col``.

    Args:
        col: Text to analyse, or None.

    Returns:
        The ``polarity`` attribute of ``TextBlob(col)``, or None when ``col``
        is None, so a missing text yields a null score instead of an error
        inside the Spark worker.
    """
    # A missing text (e.g. a null translation) has no polarity to compute.
    if col is None:
        return None
    sentiment_text = TextBlob(col)
    return sentiment_text.polarity


In [36]:
# Register the helpers as Spark User-Defined Functions (UDFs).
# Every UDF passes a null input through as null instead of raising.

# Applies unidecode to the text.
unicode_udf_string = udf(
    lambda z: unidecode(z) if z is not None else None,
    StringType(),
)

# Maps a polarity score to a label: below -0.1 -> 'negativo',
# above 0.1 -> 'positivo', anything in between -> 'neutro'.
group_by_sentiment = udf(
    lambda x: None if x is None
    else 'negativo' if x < -0.1
    else 'positivo' if x > 0.1
    else 'neutro',
    StringType(),
)

# Translation UDF (wraps translate_udf).
translate_udf_string = udf(translate_udf, StringType())

# Sentiment-analysis UDF (wraps sentiment_udf).
sentiment_udf_float = udf(sentiment_udf, FloatType())

In [37]:
# Smoke test of the three text-processing steps on one sample sentence.
teste="Eu ♥ o meu cachorro, ele é o meu melhor amigo"
decode=unidecode(teste)
print(decode)
decodeEN=Translator().translate(decode)
print(decodeEN.text)
# Score only the translated text, exactly as translate_udf/sentiment_udf do.
# str(decodeEN) would be the string form of the whole result object, not just
# the translation.
a=decodeEN.text
sentiment = TextBlob(a)
print(sentiment.polarity)

Eu  o meu cachorro, ele e o meu melhor amigo
Me my dog, he is my best friend
1.0


In [38]:
# Apply the UDFs one step at a time; each step keeps the previous columns and
# appends one new column.

# Step 1: unidecode the raw text.
twitters_unicode = twitters.select(
    "value",
    unicode_udf_string(twitters["value"]).alias("unicoded"),
)

# Step 2: translate the decoded text.
twitters_uni_trans = twitters_unicode.select(
    "value",
    "unicoded",
    translate_udf_string(col("unicoded")).alias("twitter_EN"),
)

# Step 3: sentiment score of the translated text.
twitters_uni_trans_sent = twitters_uni_trans.select(
    "value",
    "unicoded",
    "twitter_EN",
    sentiment_udf_float(col("twitter_EN")).alias("analise"),
)

# Step 4: turn the score into a class label.
t_sent_label = twitters_uni_trans_sent.select(
    "value",
    "unicoded",
    "twitter_EN",
    "analise",
    group_by_sentiment(col("analise")).alias("classificacao"),
)

In [39]:
# Count the rows in each sentiment class.
t_sent_count = (
    t_sent_label
    .groupBy("classificacao")
    .count()
)

In [40]:
# Define the streaming query and its sink: the per-class counts are written
# to the console using the "update" output mode.
# NOTE(review): in the recorded run this cell ended with a
# StreamingQueryException after "Python worker exited unexpectedly", alongside
# macOS objc fork() messages -- presumably an environment issue rather than a
# problem in this cell; confirm before relying on the output.
query = (
    t_sent_count.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the streaming query terminates.
query.awaitTermination()

23/01/27 12:48:55 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/hn/zqmlf1cd4s3_llkx270x0kw80000gn/T/temporary-e8777d58-9ec0-4984-b9db-73f71fbff30b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/27 12:48:55 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/27 12:48:55 WARN TextSocketMicroBatchStream: Stream closed by localhost:9995
23/01/27 12:48:55 ERROR Executor: Exception in task 5.0 in stage 6.0 (TID 29)
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:599)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonf

objc[7048]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[7048]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
objc[7047]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[7047]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
objc[7049]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[7049]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely cal

StreamingQueryException: Query [id = f48eba82-acf4-4235-9eda-c96701907a44, runId = 8a74b939-66b2-40cd-bdf1-76a51689ecd0] terminated with exception: Writing job aborted

23/01/27 12:48:55 WARN TaskSetManager: Lost task 1.0 in stage 6.0 (TID 25) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)
23/01/27 12:48:55 WARN TaskSetManager: Lost task 7.0 in stage 6.0 (TID 31) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)
23/01/27 12:48:56 WARN TaskSetManager: Lost task 4.0 in stage 6.0 (TID 28) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)


[Stage 6:>                                                          (0 + 6) / 8]

23/01/27 12:48:56 WARN TaskSetManager: Lost task 3.0 in stage 6.0 (TID 27) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)
23/01/27 12:48:56 WARN TaskSetManager: Lost task 6.0 in stage 6.0 (TID 30) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)
23/01/27 12:48:56 WARN TaskSetManager: Lost task 2.0 in stage 6.0 (TID 26) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)


[Stage 6:>                                                          (0 + 3) / 8]

23/01/27 12:48:56 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 24) (192.168.68.104 executor driver): TaskKilled (Stage cancelled)


[Stage 6:>                                                          (0 + 1) / 8]

23/01/27 18:16:05 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1049801 ms exceeds timeout 120000 ms
23/01/27 18:16:05 WARN SparkContext: Killing executors is not supported by current scheduler.
