# Massive Data Processing

## Spark practice

<b>Student: David Sánchez</b>


In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

### 1.-Practice: Trending Topics & analysis sentiment (5.0%)


#### Trending Topics

In [2]:
import os, shutil
import json
import unicodedata
from operator import add

numPartitions = 6
input = sc.textFile("../Datasets/Tweets/tweets_es.json", numPartitions)
tweets = input.map(lambda x: json.loads(x))
print "Num. partitions: ", tweets.getNumPartitions()
print "Total tweets: ", tweets.count()

tweets_es = tweets.filter(lambda t: "es" in t["lang"])
print "Spanish tweets: ", tweets_es.count()

tweets_es_hashtags = tweets_es.filter(lambda t: t["entities"]["hashtags"] != [] )
        
print  "Tweets with hashtags: ", tweets_es_hashtags.count()

hashtags = tweets_es_hashtags.flatMap(lambda t: map(lambda h: (unicodedata.normalize('NFKD', h["text"]).encode('ascii','ignore'),1), t["entities"]["hashtags"]))
print "Hashtags: ", hashtags.count()

trending_hashtags = hashtags.reduceByKey(lambda a, b: a + b)
print "Hashtags reduced: ", trending_hashtags.count()

if os.path.exists("../Results/Trending"): 
    shutil.rmtree("../Results/Trending")
trending_hashtags.saveAsTextFile('../Results/Trending')
print "Files saved: '../Results/Trending'"
print "Some examples: ", trending_hashtags.take(10)


Num. partitions:  6
Total tweets:  19166
Spanish tweets:  15028
Tweets with hashtags:  3495
Hashtags:  5286
Hashtags reduced:  2947
Files saved: '../Results/Trending'
Some examples:  [('VineDeLaAbuela', 1), ('', 1), ('RMUCL', 11), ('PorSiNoLoViste', 2), ('industria40', 1), ('wallapop', 1), ('ConMaduroAvanzamos', 3), ('DisenoGrafico', 1), ('candycrush', 1), ('ElClasico', 1)]


#### Top N

In [3]:
trending_sorted = trending_hashtags.takeOrdered(10, key=lambda t: -t[1])
print trending_sorted

if os.path.exists("../Results/TopN"): 
    shutil.rmtree("../Results/TopN")
sc.parallelize(trending_sorted).saveAsTextFile('../Results/TopN')
print "Files saved: '../Results/TopN'"

[('MTVMiaw', 196), ('Vota2ParaQueSigaPresidente', 117), ('FelizMartes', 84), ('TuitUtil', 65), ('LibertadDePrensa', 64), ('TeCaesTeLevantasFelices27Mica', 51), ('MTVSnapMexDanna', 39), ('MTVPopMBautista', 37), ('MtvIconoMBautista', 34), ('DebateReal', 34)]
Files saved: '../Results/TopN'


### Sentiment

In [4]:
# Positive and negative words

In [5]:
file_positive = sc.textFile("../Dictionary/positive_words_es.txt")
file_negative = sc.textFile("../Dictionary/negative_words_es.txt")
positive_words = file_positive.map(lambda w: w.encode('ascii', 'ignore'))
negative_words = file_negative.map(lambda w: w.encode('ascii', 'ignore'))
positive_words_list = positive_words.collect()
negative_words_list = negative_words.collect()

print "Positive words: ", positive_words.take(10), "..."
print "Negative words: ", negative_words.take(10), "..."

Positive words:  ['libera', 'como', 'gran', 'mayor', 'nuevo', 'general', 'obra', 'principal', 'bien', 'poco'] ...
Negative words:  ['divisas', 'en', 'para', 'sin', 'tiempo', 'bajo', 'varios', 'tipo', 'largo', 'solo'] ...


In [6]:
tweets_hashtags = tweets_es_hashtags.map(lambda t: (unicodedata.normalize('NFKD',t["text"]).encode('ascii','ignore').lower(),\
                                                (map(lambda h: unicodedata.normalize('NFKD',h["text"]).encode('ascii','ignore').lower(), \
                                                     t["entities"]["hashtags"]))))\
                                                .flatMapValues(lambda x: x)
print tweets_hashtags.take(2)

[('@disneyspain @tinistoessel libera logooo \n#tini', 'tini'), ('rt @amctv_es: el grupo esta separado... conseguiran escapar? llegaran a baja? el proximo lunes nuevo episodio! #feartwd\nhttps://t.co/vqw...', 'feartwd')]


In [7]:
def HashtagSentiment(tweet):
    
    positive_count = 0
    negative_count = 0
    hashtag = tweet[1]
    words = tweet[0].split(" ")
    length = len(words)
    for word in words:
        if word in positive_words_list:
            positive_count += 1
        elif word in negative_words_list:
            negative_count += 1
    
    return (hashtag, (length, positive_count, negative_count))

In [8]:
hashtags_base_sentim = tweets_hashtags.map(lambda t: HashtagSentiment(t))\
                .reduceByKey(lambda a, b:(a[0]+b[0], a[1]+b[1], a[2]+b[2]))
print "Hasghtag base sentiment info: "
print hashtags_base_sentim.take(10)

Hasghtag base sentiment info: 
[('', (14, 0, 0)), ('mtvmatchcastel', (14, 0, 0)), ('barranquilla', (10, 0, 0)), ('asslickers', (15, 1, 1)), ('industria40', (17, 0, 1)), ('mamafrigilux', (21, 0, 2)), ('afterclasswqradio', (12, 0, 1)), ('wallapop', (9, 0, 1)), ('luchojara', (18, 1, 2)), ('bautisters', (19, 2, 0))]


In [9]:
hashtags_sentim = hashtags_base_sentim.map(lambda h: (h[0],float(h[1][1]-h[1][2])/h[1][0]))
print "Hashtag sentiments: "
print hashtags_sentim.take(10)

Hashtag sentiments: 
[('', 0.0), ('mtvmatchcastel', 0.0), ('barranquilla', 0.0), ('asslickers', 0.0), ('industria40', -0.058823529411764705), ('mamafrigilux', -0.09523809523809523), ('afterclasswqradio', -0.08333333333333333), ('wallapop', -0.1111111111111111), ('luchojara', -0.05555555555555555), ('bautisters', 0.10526315789473684)]
