## Dependências

In [1]:
!pip install pyspark
# !pip install nltk
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Inicialização

In [2]:
import warnings
warnings.filterwarnings('ignore')

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Desconecta o SparkContext, caso esteja conectado
#sc.stop()

# Opção 1
conf = SparkConf().setMaster("local").setAppName("tp3")

# Opção 2 
#conf = SparkConf().setMaster("spark://10.208.205.1:7077").setAppName("UFAM-Lab1")


# Conecta ao Cluster Spark
sc = SparkContext.getOrCreate(conf = conf) 

# Define um "entry point" para toda as operação SPARK SQL
sqlc = SQLContext(sc)

In [3]:
# import pandas as pd
# from pyspark import *
# from pyspark.sql import *

# spark = SparkSession.builder.appName('tweet-SA').getOrCreate()
# pd.set_option('display.max_columns', None)
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('max_colwidth', None)

## Leitura dos dados

In [4]:
!wget https://github.com/erlonbie/twitter-sentiment-analysis-kmeans-pyspark/raw/main/train.csv

--2023-02-01 18:05:23--  https://github.com/erlonbie/twitter-sentiment-analysis-kmeans-pyspark/raw/main/train.csv
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erlonbie/twitter-sentiment-analysis-kmeans-pyspark/main/train.csv [following]
--2023-02-01 18:05:24--  https://raw.githubusercontent.com/erlonbie/twitter-sentiment-analysis-kmeans-pyspark/main/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3103165 (3.0M) [text/plain]
Saving to: ‘train.csv.1’


2023-02-01 18:05:24 (104 MB/s) - ‘train.csv.1’ saved [3103165/3103165]



In [5]:
cases = sqlc.read.load("train.csv",
                       format="csv", 
                       sep=",", 
                       inferSchema="true", 
                       header="true")
cases.show()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
|  6|    0|[2/2] huge fan fa...|
|  7|    0| @user camping to...|
|  8|    0|the next school y...|
|  9|    0|we won!!! love th...|
| 10|    0| @user @user welc...|
| 11|    0| â #ireland con...|
| 12|    0|we are so selfish...|
| 13|    0|i get to see my d...|
| 14|    1|@user #cnn calls ...|
| 15|    1|no comment!  in #...|
| 16|    0|ouch...junior is ...|
| 17|    0|i am thankful for...|
| 18|    1|retweet if you ag...|
| 19|    0|its #friday! ð...|
| 20|    0|as we all know, e...|
+---+-----+--------------------+
only showing top 20 rows



## Tratamento dos dados

In [6]:
cases = cases.fillna({'tweet':''})

In [7]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol='tweet', outputCol='tokens')
cases = tokenizer.transform(cases)

In [8]:
cases.show()

+---+-----+--------------------+--------------------+
| id|label|               tweet|              tokens|
+---+-----+--------------------+--------------------+
|  1|    0| @user when a fat...|[, @user, when, a...|
|  2|    0|@user @user thank...|[@user, @user, th...|
|  3|    0|  bihday your maj...|[, , bihday, your...|
|  4|    0|#model   i love u...|[#model, , , i, l...|
|  5|    0| factsguide: soci...|[, factsguide:, s...|
|  6|    0|[2/2] huge fan fa...|[[2/2], huge, fan...|
|  7|    0| @user camping to...|[, @user, camping...|
|  8|    0|the next school y...|[the, next, schoo...|
|  9|    0|we won!!! love th...|[we, won!!!, love...|
| 10|    0| @user @user welc...|[, @user, @user, ...|
| 11|    0| â #ireland con...|[, â, #ireland,...|
| 12|    0|we are so selfish...|[we, are, so, sel...|
| 13|    0|i get to see my d...|[i, get, to, see,...|
| 14|    1|@user #cnn calls ...|[@user, #cnn, cal...|
| 15|    1|no comment!  in #...|[no, comment!, , ...|
| 16|    0|ouch...junior is 

In [9]:
# rdd = cases.rdd.map(lambda x: (x['id'], x['label'], x['tweet'], list(filter(lambda x: x != '', x['tokens']))))
# cases = rdd.toDF(cases.columns)
# cases.show()

## TF-IDF 

In [10]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

## K-means 

In [11]:
# KMeans Clustering 

from pyspark.ml.clustering import KMeans

numIterations = 200 
numberClusters = 2
kmeans = KMeans().setMaxIter(numIterations).setK(numberClusters).setSeed(1).setDistanceMeasure('cosine') 

## Versão 1

In [12]:
from pyspark.sql.functions import monotonically_increasing_id,filter as filter_spark, size, transform

cases_v1 = cases
cases_v1 = cases_v1.withColumn('tokens', filter_spark('tokens', lambda x: ~x.contains('#')))
cases_v1 = cases_v1.where(size(cases_v1.tokens) != 0)
cases_v1.show()

+---+-----+--------------------+--------------------+
| id|label|               tweet|              tokens|
+---+-----+--------------------+--------------------+
|  1|    0| @user when a fat...|[, @user, when, a...|
|  2|    0|@user @user thank...|[@user, @user, th...|
|  3|    0|  bihday your maj...|[, , bihday, your...|
|  4|    0|#model   i love u...|[, , i, love, u, ...|
|  5|    0| factsguide: soci...|[, factsguide:, s...|
|  6|    0|[2/2] huge fan fa...|[[2/2], huge, fan...|
|  7|    0| @user camping to...|[, @user, camping...|
|  8|    0|the next school y...|[the, next, schoo...|
|  9|    0|we won!!! love th...|[we, won!!!, love...|
| 10|    0| @user @user welc...|[, @user, @user, ...|
| 11|    0| â #ireland con...|[, â, consumer,...|
| 12|    0|we are so selfish...|[we, are, so, sel...|
| 13|    0|i get to see my d...|[i, get, to, see,...|
| 14|    1|@user #cnn calls ...|[@user, calls, mi...|
| 15|    1|no comment!  in #...|[no, comment!, , ...|
| 16|    0|ouch...junior is 

In [13]:
# rdd = cases.rdd.map(lambda x:(x['id'], x['label'], x['tweet'], list(filter(lambda x: '#' not in x, x['tokens']))))
# cases_v1 = rdd.toDF(cases.columns)
# cases_v1.show()

In [14]:
cases_v1_model = hashingTF.transform(cases_v1)
idfModel = idf.fit(cases_v1_model)
cases_v1_model = idfModel.transform(cases_v1_model)
cases_v1_model.limit(5).toPandas()

Unnamed: 0,id,label,tweet,tokens,rawFeatures,features
0,1,0,@user when a father is dysfunctional and is s...,"[, @user, when, a, father, is, dysfunctional, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 5.229950865266344, 0.0, 0..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[@user, @user, thanks, for, credit, i, can't, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,0,bihday your majesty,"[, , bihday, your, majesty]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,0,#model i love u take with u all the time in ...,"[, , i, love, u, take, with, u, all, the, time...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,5,0,factsguide: society now #motivation,"[, factsguide:, society, now, , , ]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
kmeans_model = kmeans.fit(cases_v1_model)
predictions_v1 = kmeans_model.transform(cases_v1_model)
predictions_v1.select('id','tweet','label','prediction').limit(5).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,3,bihday your majesty,0,0
3,4,#model i love u take with u all the time in ...,0,0
4,5,factsguide: society now #motivation,0,0


In [16]:
predictions_v1.select('id','tweet','label','prediction').limit(100).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,3,bihday your majesty,0,0
3,4,#model i love u take with u all the time in ...,0,0
4,5,factsguide: society now #motivation,0,0
...,...,...,...,...
95,96,@user i'll always hope that one day i'll get t...,0,1
96,97,#model i love u take with u all the time in ...,0,0
97,98,couple having sex fat naked japanese girls,0,0
98,99,#hump on that #hump day #humpersð© @ edwa...,0,0


In [17]:
# ## Converter para dataframe Pandas
# import pandas as pd

# cases.limit(20).toPandas()

In [18]:
# df_teste = cases.limit(20).toPandas()

In [19]:
# df_teste

In [20]:
# import re
# a1 = "we won!!! love the land!!! #allin #cavs #champ"
# a2 =" ".join(filter(lambda x:x[0]!='#', a1.split()))
# print(a2)

In [21]:
# def filtra(x):
#     return " ".join(filter(lambda y:y[0]!='#', x.split()))


In [22]:
# df_teste['tweet'] = df_teste['tweet'].apply(filtra)

In [23]:
# df_teste

In [24]:
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# set(stopwords.words('english'))

In [25]:
# stop = stopwords.words('english')

In [26]:
# def stop_words(df):
#     df['stopwords'] = df['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
#     print(df[['tweet','stopwords']].head())

In [27]:
# stop_words(cases.toPandas())

## Versão 2

In [28]:
cases_v2 = cases
cases_v2 = cases_v2.withColumn('tokens', filter_spark('tokens', lambda x: x.contains('#')))
cases_v2 = cases_v2.where(size(cases_v2.tokens) != 0)
cases_v2.show()

+---+-----+--------------------+--------------------+
| id|label|               tweet|              tokens|
+---+-----+--------------------+--------------------+
|  1|    0| @user when a fat...|              [#run]|
|  2|    0|@user @user thank...|[#lyft, #disapoin...|
|  4|    0|#model   i love u...|            [#model]|
|  5|    0| factsguide: soci...|       [#motivation]|
|  6|    0|[2/2] huge fan fa...|   [#allshowandnogo]|
|  8|    0|the next school y...|[#school, #exams,...|
|  9|    0|we won!!! love th...|[#allin, #cavs, #...|
| 10|    0| @user @user welc...|              [#gr8]|
| 11|    0| â #ireland con...|[#ireland, #blog,...|
| 12|    0|we are so selfish...|[#orlando, #stand...|
| 13|    0|i get to see my d...|[#80days, #gettin...|
| 14|    1|@user #cnn calls ...|[#cnn, #michigan,...|
| 15|    1|no comment!  in #...|[#australia, #opk...|
| 16|    0|ouch...junior is ...|[angryð#got7, ...|
| 17|    0|i am thankful for...|[#thankful, #posi...|
| 19|    0|its #friday! ð

In [29]:
cases_v2_model = hashingTF.transform(cases_v2)
idfModel = idf.fit(cases_v2_model)
cases_v2_model = idfModel.transform(cases_v2_model)
cases_v2_model.limit(5).toPandas()

Unnamed: 0,id,label,tweet,tokens,rawFeatures,features
0,1,0,@user when a father is dysfunctional and is s...,[#run],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[#lyft, #disapointed, #getthanked]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,0,#model i love u take with u all the time in ...,[#model],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5,0,factsguide: society now #motivation,[#motivation],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,6,0,[2/2] huge fan fare and big talking before the...,[#allshowandnogo],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [30]:
kmeans_model_v2 = kmeans.fit(cases_v2_model)
predictions_v2 = kmeans_model_v2.transform(cases_v2_model)
predictions_v2.select('id','tweet','label','prediction').limit(5).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,4,#model i love u take with u all the time in ...,0,0
3,5,factsguide: society now #motivation,0,0
4,6,[2/2] huge fan fare and big talking before the...,0,0


In [31]:
predictions_v2.select('id','tweet','label','prediction').limit(100).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,4,#model i love u take with u all the time in ...,0,0
3,5,factsguide: society now #motivation,0,0
4,6,[2/2] huge fan fare and big talking before the...,0,0
...,...,...,...,...
95,128,sad in the branches itâs just rainy day writ...,0,0
96,129,yeah! new buttons in the mail for me ð the...,0,0
97,130,@user driver hit female moose on river rd #we...,0,0
98,131,afterpas. made in japan. ï¼madeinjapan #eo...,0,0


## Versão 3

In [43]:
cases_v3 = cases
cases_v3.show()

+---+-----+--------------------+--------------------+
| id|label|               tweet|              tokens|
+---+-----+--------------------+--------------------+
|  1|    0| @user when a fat...|[, @user, when, a...|
|  2|    0|@user @user thank...|[@user, @user, th...|
|  3|    0|  bihday your maj...|[, , bihday, your...|
|  4|    0|#model   i love u...|[#model, , , i, l...|
|  5|    0| factsguide: soci...|[, factsguide:, s...|
|  6|    0|[2/2] huge fan fa...|[[2/2], huge, fan...|
|  7|    0| @user camping to...|[, @user, camping...|
|  8|    0|the next school y...|[the, next, schoo...|
|  9|    0|we won!!! love th...|[we, won!!!, love...|
| 10|    0| @user @user welc...|[, @user, @user, ...|
| 11|    0| â #ireland con...|[, â, #ireland,...|
| 12|    0|we are so selfish...|[we, are, so, sel...|
| 13|    0|i get to see my d...|[i, get, to, see,...|
| 14|    1|@user #cnn calls ...|[@user, #cnn, cal...|
| 15|    1|no comment!  in #...|[no, comment!, , ...|
| 16|    0|ouch...junior is 

In [44]:
cases_v3_model = hashingTF.transform(cases_v3)
idfModel = idf.fit(cases_v3_model)
cases_v3_model = idfModel.transform(cases_v3_model)
cases_v3_model.limit(5).toPandas()

Unnamed: 0,id,label,tweet,tokens,rawFeatures,features
0,1,0,@user when a father is dysfunctional and is s...,"[, @user, when, a, father, is, dysfunctional, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 5.044458094019535, 0.0, 0..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[@user, @user, thanks, for, #lyft, credit, i, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,0,bihday your majesty,"[, , bihday, your, majesty]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,0,#model i love u take with u all the time in ...,"[#model, , , i, love, u, take, with, u, all, t...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,5,0,factsguide: society now #motivation,"[, factsguide:, society, now, , , , #motivation]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [45]:
kmeans_model = kmeans.fit(cases_v3_model)
predictions_v3 = kmeans_model.transform(cases_v3_model)
predictions_v3.select('id','tweet','label','prediction').limit(5).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,3,bihday your majesty,0,1
3,4,#model i love u take with u all the time in ...,0,0
4,5,factsguide: society now #motivation,0,0


In [46]:
predictions_v1.select('id','tweet','label','prediction').limit(100).toPandas()

Unnamed: 0,id,tweet,label,prediction
0,1,@user when a father is dysfunctional and is s...,0,0
1,2,@user @user thanks for #lyft credit i can't us...,0,0
2,3,bihday your majesty,0,0
3,4,#model i love u take with u all the time in ...,0,0
4,5,factsguide: society now #motivation,0,0
...,...,...,...,...
95,96,@user i'll always hope that one day i'll get t...,0,1
96,97,#model i love u take with u all the time in ...,0,0
97,98,couple having sex fat naked japanese girls,0,0
98,99,#hump on that #hump day #humpersð© @ edwa...,0,0
