In [1]:
from pyarrow import fs
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, trim, lower, regexp_replace

In [2]:
spark = SparkSession.builder.appName("Twitter Analysis").getOrCreate()

In [3]:
arquivo_csv_local = '/Users/ERICA/Desktop/jupyter/sentiment140.csv'
df_spark = spark.read.csv(arquivo_csv_local, header=True, inferSchema=True, sep=",")
df_spark.show()

+-----------------+--------------------+------------+-------------------+--------------------+--------------------+
|           target|                 ids|        date|               flag|                user|      text;;;;;;;;;;|
+-----------------+--------------------+------------+-------------------+--------------------+--------------------+
|"0,""1467810369""|""Mon Apr 06 22:1...|""NO_QUERY""|""_TheSpecialOne_""|""@switchfoot htt...| that's a bummer....|
|"0,""1467810672""|""Mon Apr 06 22:1...|""NO_QUERY""|  ""scotthamilton""|""is upset that h...|                NULL|
|"0,""1467810917""|""Mon Apr 06 22:1...|""NO_QUERY""|       ""mattycus""|""@Kenichan I div...|                NULL|
|"0,""1467811184""|""Mon Apr 06 22:1...|""NO_QUERY""|        ""ElleCTF""|""my whole body f...|                NULL|
|"0,""1467811193""|""Mon Apr 06 22:1...|""NO_QUERY""|         ""Karoli""|""@nationwideclas...| it's not behavin...|
|"0,""1467811372""|""Mon Apr 06 22:2...|""NO_QUERY""|       ""joy_wolf""

In [4]:
df_spark = df_spark.withColumnRenamed('text;;;;;;;;;;', 'text')
df_spark.show()

+-----------------+--------------------+------------+-------------------+--------------------+--------------------+
|           target|                 ids|        date|               flag|                user|                text|
+-----------------+--------------------+------------+-------------------+--------------------+--------------------+
|"0,""1467810369""|""Mon Apr 06 22:1...|""NO_QUERY""|""_TheSpecialOne_""|""@switchfoot htt...| that's a bummer....|
|"0,""1467810672""|""Mon Apr 06 22:1...|""NO_QUERY""|  ""scotthamilton""|""is upset that h...|                NULL|
|"0,""1467810917""|""Mon Apr 06 22:1...|""NO_QUERY""|       ""mattycus""|""@Kenichan I div...|                NULL|
|"0,""1467811184""|""Mon Apr 06 22:1...|""NO_QUERY""|        ""ElleCTF""|""my whole body f...|                NULL|
|"0,""1467811193""|""Mon Apr 06 22:1...|""NO_QUERY""|         ""Karoli""|""@nationwideclas...| it's not behavin...|
|"0,""1467811372""|""Mon Apr 06 22:2...|""NO_QUERY""|       ""joy_wolf""

In [5]:
df_consulta = df_spark.withColumn('token', split(lower(trim(col('text'))), r"[^@#a-zA-Z0-9'-]+"))
df_consulta = df_consulta.withColumn('token', explode(col('token')))
df_consulta = df_consulta.filter(col('token') != '')

In [6]:
df_palavras = df_consulta.groupBy('token').count().orderBy('count', ascending=False)
print('Palavras mais usadas:')
df_palavras.show(20)

Palavras mais usadas:
+-----+-----+
|token|count|
+-----+-----+
|    i|19827|
|   to|13271|
|  the|11153|
|  and| 8065|
|    a| 7895|
|  but| 6898|
|   my| 6838|
|   it| 6736|
|   in| 4799|
|   is| 4421|
|  you| 4343|
|  for| 4309|
|   of| 4032|
| have| 4000|
|   me| 3889|
|   so| 3714|
|  not| 3607|
|   on| 3401|
|  i'm| 3298|
| that| 3256|
+-----+-----+
only showing top 20 rows



In [7]:
df_usuarios_mencionados = df_palavras.filter(col('token').startswith('@')).filter(col('token') != '@').orderBy('count', ascending=False)
print('Usuários mais mencionados:')
df_usuarios_mencionados.show(20)

Usuários mais mencionados:
+----------------+-----+
|           token|count|
+----------------+-----+
|        @twitter|    9|
|     @dannymcfly|    4|
|    @dontflashme|    4|
|          @reply|    3|
|@jonathanrknight|    3|
|              @-|    3|
|        @replies|    3|
|    @davidarchie|    3|
|       @ddlovato|    3|
|     @nicohilton|    3|
|     @markhoppus|    2|
| @evaangelinaxxx|    2|
|            @the|    2|
|      @starbucks|    2|
|      @katyperry|    2|
|       @msteagan|    2|
|        @anneeee|    2|
|     @chloemcfly|    2|
|  @androidtomato|    2|
|     @shankargan|    2|
+----------------+-----+
only showing top 20 rows



In [8]:
df_hashtags = df_palavras.filter(col('token').startswith('#')).filter(col('token') != '#').orderBy('count', ascending=False)
print('Hashtags mais frequentes:')
df_hashtags.show(20)

Hashtags mais frequentes:
+---------------+-----+
|          token|count|
+---------------+-----+
|            #fb|   42|
|          #fail|   13|
|             #1|   10|
|       #asot400|   10|
|           #bgt|    9|
|      #mmwanted|    8|
|          #ontd|    7|
|             #2|    6|
|          #bck5|    6|
|    #fixreplies|    6|
|      #saveearl|    6|
|  #followfriday|    5|
|         #asylm|    5|
|            #f1|    4|
|          #lost|    4|
|    #delongeday|    4|
|#beatwittyparty|    4|
|        #iphone|    3|
|#pussycatisland|    3|
|      #pawpawty|    3|
+---------------+-----+
only showing top 20 rows

