![CMCC](http://cmcc.ufabc.edu.br/images/logo_site.jpg)

# **Representando atributos textuais por quantização**


### Faça upload do arquivo `moviereviews.tsv` (disponível no site do curso) para o mesmo diretório deste notebook

In [80]:
import os
import numpy as np

def parseRDD(point):
    """ Parse one tab-separated line of the dataset into a (label, text) pair.
    Args:
        point (str): raw input line with at least three tab-separated fields
    Returns:
        tuple: (int, str) holding the second field converted to int and the
            third field unchanged
    Raises:
        IndexError: if the line has fewer than three fields
        ValueError: if the second field cannot be converted to int
    """
    fields = point.split('\t')
    label = int(fields[1])
    text = fields[2]
    return (label, text)

def notempty(point):
    """ Tell whether the second element of a record has nonzero length.
    Args:
        point (sequence): record whose element at index 1 supports len();
            the pipeline passes the (label, text) pairs built by parseRDD
    Returns:
        bool: True if len(point[1]) is greater than zero
    """
    text = point[1]
    return len(text) != 0

# The review file is expected in the same directory as the notebook.
filename = os.path.join("moviereviews.tsv")
# Second argument of textFile is the requested minimum number of partitions.
rawRDD = sc.textFile(filename, 100)
# The first line is treated as the column header.
header = rawRDD.take(1)[0]

# Drop every line equal to the header, parse the rest into (label, text)
# pairs and discard records whose text is empty.  To experiment on a 10%
# subsample, insert .sample(False, 0.1, seed=42) into the chain.
dataRDD = (rawRDD
           .filter(lambda line: line != header)
           .map(parseRDD)
           .filter(notempty)
           )

# Single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('Read {} lines'.format(dataRDD.count()))
print('Sample line: {}'.format(dataRDD.takeSample(False, 1)[0]))

Read 4999 lines
Sample line: (1, u'"When I heard about \\"Hammerhead\\" being released on DVD and finally found it at my local DVD store, I thought \\"well, just another cheap monster movie from Nu Image\\". Those guys around Boaz Davidson and Avi Lerner produced cheap but very entertaining B - Pictures in the past few months but also some very disappointing movies. So I didn\'t expect much, especially after having watched the rather disappointing \\"Shark Zone\\" just a few days before. But \\"Hammerhead\\" turned out to be an excellent revival of the 1950s monster movies. We have a mad scientist, a group of people in a dangerous situation, screaming women and damsels in distress, man-eating plants and of course we have the creature, a huge mutant mix between a man and a hammerhead shark. Everything you need for an entertaining monster movie. The only thing missing are graphic sex scenes and nudity which you expect in movies of this kind, but since the movie was made for TV it\'s unde

In [81]:
import re

# Pattern used by tokenize() to split text on runs of non-word characters.
split_regex = r'\W+'

# Stop-word list: the file is read through Spark and collected on the driver,
# so each line of the file becomes one element of the set.
# NOTE(review): the path is relative to the working directory -- confirm that
# Data/Aula04/stopwords.txt exists where the notebook is run.
stopfile = os.path.join("Data","Aula04","stopwords.txt")
stopwords = set(sc.textFile(stopfile).collect())

def tokenize(string):
    """ Split a string into lowercase tokens, dropping short tokens and stopwords
    Args:
        string (str): input string
    Returns:
        list: tokens of the lowercased input, split on `split_regex`, that
            are longer than two characters and not in `stopwords`
    """
    # A list comprehension (instead of filter) guarantees a real list on both
    # Python 2 and Python 3; filter() would return a lazy iterator on Python 3.
    return [
        token
        for token in re.split(split_regex, string.lower())
        if len(token) > 2 and token not in stopwords
    ]

# Token list of every review; only the text (index 1) of each record is used.
wordsRDD = dataRDD.map(lambda rec: tokenize(rec[1]))

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(wordsRDD.take(1)[0])

[u'stuff', u'going', u'moment', u'started', u'listening', u'music', u'watching', u'odd', u'documentary', u'watched', u'wiz', u'watched', u'moonwalker', u'maybe', u'want', u'get', u'certain', u'insight', u'guy', u'thought', u'really', u'cool', u'eighties', u'maybe', u'make', u'mind', u'whether', u'guilty', u'innocent', u'moonwalker', u'part', u'biography', u'part', u'feature', u'film', u'remember', u'going', u'see', u'cinema', u'originally', u'released', u'subtle', u'messages', u'feeling', u'towards', u'press', u'also', u'obvious', u'message', u'drugs', u'bad', u'kay', u'visually', u'impressive', u'course', u'michael', u'jackson', u'unless', u'remotely', u'like', u'anyway', u'going', u'hate', u'find', u'boring', u'may', u'call', u'egotist', u'consenting', u'making', u'movie', u'fans', u'would', u'say', u'made', u'fans', u'true', u'really', u'nice', u'actual', u'feature', u'film', u'bit', u'finally', u'starts', u'minutes', u'excluding', u'smooth', u'criminal', u'sequence', u'joe', u'pesc

In [82]:
# TODO: Replace <FILL IN> with appropriate code
from pyspark.mllib.feature import Word2Vec

# Fit 5-dimensional word embeddings on the tokenized reviews (seed fixed at 42).
model = (Word2Vec()
         .setVectorSize(5)
         .setSeed(42)
         .fit(wordsRDD))

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(model.transform(u'entertaining'))
# list(...) keeps the output readable on Python 3, where findSynonyms may
# return a lazy iterable; on Python 2 the printed list is unchanged.
print(list(model.findSynonyms(u'entertaining', 2)))

[-0.430164247751,0.0665539652109,-0.30026063323,0.304091185331,0.192403480411]
[(u'thrills', 0.99504232406616211), (u'dynamite', 0.98769634962081909)]


In [83]:
# Vocabulary: tokens that occur at least 5 times over all reviews.
# NOTE(review): the threshold presumably mirrors Word2Vec's default minimum
# count, so that every kept token has an embedding -- confirm.
uniqueWords = (wordsRDD
               .flatMap(lambda tokens: [(w, 1) for w in tokens])
               .reduceByKey(lambda a, b: a + b)
               .filter(lambda kv: kv[1] >= 5)
               .map(lambda kv: kv[0])
               .collect()
               )

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('{} tokens únicos'.format(len(uniqueWords)))

# Driver-side lookup table token -> embedding vector, also published as a
# broadcast variable for use inside RDD transformations.
w2v = {word: np.array(model.transform(word)) for word in uniqueWords}
w2vb = sc.broadcast(w2v)

# Each review becomes the array of embeddings of its in-vocabulary tokens.
vectorsRDD = (wordsRDD
              .map(lambda tokens: np.array([w2vb.value[w] for w in tokens if w in w2vb.value]))
             )
recs = vectorsRDD.take(2)
firstRec, secondRec = recs[0], recs[1]

12540 tokens únicos


### **Baseline**

In [122]:
# TODO: Replace <FILL IN> with appropriate code
from  pyspark.mllib.clustering import KMeans

# Number of clusters, i.e. the length of the quantized feature vector.
ncluster = 200

# One row per vocabulary embedding, stored in a single partition.
# list(...) makes np.array receive a real sequence on Python 3, where
# dict.values() is a view (np.array would wrap it as a 0-d object array);
# on Python 2 the resulting array is identical.
vectors2RDD = sc.parallelize(np.array(list(w2v.values())), 1)

modelK = KMeans.train(vectors2RDD, ncluster, seed=42)

# Cluster index assigned to every vocabulary embedding.
clustersRDD = vectors2RDD.map(lambda vec: modelK.predict(vec))

In [123]:
from pyspark.mllib.regression import LabeledPoint

def quantizador(point, model, k, w2v):
    """ Quantize a labeled text into a histogram of word-cluster counts.
    Args:
        point (tuple): record whose first element is the label and whose
            second element is the text to tokenize
        model: object with a predict(vector) method returning a cluster
            index usable to index an array of length k
        k (int): number of clusters, i.e. length of the feature vector
        w2v (dict): mapping from token to its embedding vector
    Returns:
        LabeledPoint: the label together with a length-k vector counting,
            for each cluster, the known tokens of the text assigned to it
    """
    label = point[0]
    tokens = tokenize(point[1])
    # Tokens without an embedding are skipped.
    vectors = np.array([w2v[tok] for tok in tokens if tok in w2v])
    counts = np.zeros(k)
    for vec in vectors:
        counts[model.predict(vec)] += 1
    return LabeledPoint(label, counts)
    
# Quantize every (label, text) record with the KMeans model, then drop the
# records labeled 2.
# NOTE(review): the meaning of label 2 is not visible here -- confirm.
quantRDD = dataRDD.map(
    lambda rec: quantizador(rec, modelK, ncluster, w2v)
).filter(
    lambda lp: lp.label != 2
)

[LabeledPoint(1.0, [3.0,1.0,0.0,4.0,3.0,1.0,0.0,1.0,5.0,4.0,6.0,9.0,0.0,1.0,7.0,4.0,2.0,3.0,6.0,9.0,0.0,7.0,5.0,1.0,7.0,5.0,0.0,33.0,5.0,0.0,9.0,0.0,5.0,2.0,1.0,6.0,1.0,9.0,3.0,6.0,0.0,3.0,0.0,2.0,1.0,4.0,8.0,0.0,3.0,0.0])]


In [124]:
# Split weights for the training, validation and test sets (in that order);
# the fixed seed is passed to randomSplit together with the weights.
weights = [.8, .1, .1]
seed = 42
quantTrainData, quantValidationData, quantTestData = quantRDD.randomSplit(weights, seed)
# Cache the three splits
quantTrainData.cache()
quantValidationData.cache()
quantTestData.cache()

[LabeledPoint(1.0, [3.0,1.0,0.0,4.0,3.0,1.0,0.0,1.0,5.0,4.0,6.0,9.0,0.0,1.0,7.0,4.0,2.0,3.0,6.0,9.0,0.0,7.0,5.0,1.0,7.0,5.0,0.0,33.0,5.0,0.0,9.0,0.0,5.0,2.0,1.0,6.0,1.0,9.0,3.0,6.0,0.0,3.0,0.0,2.0,1.0,4.0,8.0,0.0,3.0,0.0])]


In [125]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.classification import SVMWithSGD, NaiveBayes

def calcAccuracy( predsAndVals ):
    """ Compute the fraction of pairs whose two elements are equal.
    Args:
        predsAndVals: RDD of pairs (in this notebook: prediction and label)
    Returns:
        the mean of the per-pair equality flags, as computed by the RDD's
            mean() method
    """
    hits = predsAndVals.map(lambda pair: pair[0] == pair[1])
    return hits.mean()

# fixed hyperparameters, passed to LogisticRegressionWithLBFGS.train below
numIters = 200  # used as iterations=
regParam = 0.5  # used as regParam=
regType = 'l2'  # used as regType=
includeIntercept = True  # used as intercept=

In [126]:
# Train logistic regression on the quantized training split.
modelLR = LogisticRegressionWithLBFGS.train(
    quantTrainData,
    iterations=numIters,
    regParam=regParam,
    regType=regType,
    intercept=includeIntercept
)
# Despite the variable name, each pair is (prediction, label); calcAccuracy
# only checks the two elements for equality, so the order does not matter.
labelsAndPreds = quantValidationData.map(
    lambda lp: (modelLR.predict(lp.features), lp.label)
)

accValLR = calcAccuracy(labelsAndPreds)
# format() is applied to the string *inside* print(...).  The original
# `print (...).format(x)` only worked as a Python 2 print statement; on
# Python 3 it would call .format on print's return value (None).
print('LR Validation Accuracy = {0:.3f}\n'.format(accValLR))

LR Validation Accuracy = 0.642



### Projeto em Grupo

#### Aplique o seu algoritmo de clusterização no RDD `vectors2RDD` com `ncluster` grupos.
#### Aplique a função `quantizador`, utilizando o seu modelo de clusterização, na base `dataRDD`, conforme o código de exemplo:

```
quantRDD = (dataRDD
            .map(lambda x: quantizador(x, modelK, ncluster, w2v))
            .filter(lambda x: x.label!=2)
            )
```

Note que `modelK` precisa ter um método `predict` que recebe um vetor e devolve o índice (inteiro entre 0 e `ncluster-1`) do cluster ao qual ele é alocado. Em seguida, treine a regressão logística com a base de dados resultante, conforme o exemplo acima, e calcule a acurácia na base de validação. O valor deve ser parecido com o obtido acima.