# Suite et fin du TME6: Classification de sentiments

In [43]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import string
import unicodedata
import re
import codecs
import nltk.corpus.reader as pt

## Preprocessing

In [44]:
# Corpus directories (named "pos" and "neg"); the labels attached later are
# +1 for documents read through rdr1 and -1 for those read through rdr2.
path1 = 'data/movies1000/pos'
path2 = 'data/movies1000/neg'

# Each reader selects every *.txt file of its directory. cat_pattern captures
# the file name without its extension, so every file gets its own category.
rdr1 = pt.CategorizedPlaintextCorpusReader(
    path1, r'.*\.txt', cat_pattern=r'(.*)\.txt')
rdr2 = pt.CategorizedPlaintextCorpusReader(
    path2, r'.*\.txt', cat_pattern=r'(.*)\.txt')

def make_training_data(rdr):
    """Lazily yield the raw text of each file exposed by a corpus reader.

    The reader is walked category by category; for every file id listed
    under a category, the file's raw content is produced as one item.

    Parameters
    ----------
    rdr : object
        Any object offering ``categories()``, ``fileids(category)`` and
        ``raw(fileids=[...])``.

    Yields
    ------
    The value returned by ``rdr.raw`` for a single file id.
    """
    for category in rdr.categories():
        file_ids = rdr.fileids(category)
        for file_id in file_ids:
            # raw() is given a one-element list so exactly one file is read.
            yield rdr.raw(fileids=[file_id])

### Train

In [107]:
# Materialise both corpora, then build one label per document:
# +1 for documents read through rdr1, -1 for documents read through rdr2.
docs1 = list(make_training_data(rdr1))
docs2 = list(make_training_data(rdr2))
y1 = [1] * len(docs1)
y2 = [-1] * len(docs2)

# Documents and labels are concatenated in the same order so they stay aligned.
X_str = docs1 + docs2
y = y1 + y2

In [108]:
from nltk import stem

# The stop-word file is read directly here rather than through readAFile():
# that helper is only defined in a later cell, so calling it at this point
# raises NameError when the notebook is run top to bottom on a fresh kernel.
# The file is decoded as UTF-8 and split on any whitespace, which yields a
# list with one entry per stop word.
with codecs.open('stopwords.txt', 'r', encoding='utf-8') as stopw_file:
    stopw = stopw_file.read().split()

# Characters turned into a space before tokenising: ASCII punctuation plus
# newline, carriage return, tab and backslash. Compiled once for all calls.
_SEPARATORS_RE = re.compile(
    u'[%s]' % re.escape(string.punctuation + u'\n\r\t\\'))

# One stemmer instance is shared by every call instead of being rebuilt
# for each document.
_STEMMER = stem.snowball.EnglishStemmer()


def process(txt,stopw=None):
    """Normalise a raw document into a space-separated string of stems.

    Steps, in order:
      1. NFKD-normalise and drop everything that is not ASCII (this removes
         accents and other special characters);
      2. replace punctuation, newlines, tabs and backslashes with spaces;
      3. lower-case and split on whitespace;
      4. optionally remove stop words;
      5. stem every remaining token with the Snowball English stemmer.

    Parameters
    ----------
    txt : unicode text
        The document to normalise.
    stopw : collection of str, optional
        Stop words to remove. A token is dropped when either its surface
        form or its stem appears in the collection. Matching the surface
        form matters because stemming can alter a stop word (for example
        "because" becomes "becaus"), which would otherwise let it through.
        When None or empty, nothing is filtered.

    Returns
    -------
    The stems joined by single spaces.
    """
    # The ASCII round-trip discards the combining marks that NFKD split off,
    # together with any other non-ASCII character.
    txt = unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode("ascii")
    txt = _SEPARATORS_RE.sub(u' ', txt).lower()

    tokens = txt.split()
    stop_set = set(stopw) if stopw else None
    if stop_set:
        # Filter on the surface form first, before stemming changes it.
        tokens = [w for w in tokens if w not in stop_set]

    stems = [_STEMMER.stem(w) for w in tokens]
    if stop_set:
        # Also filter on the stem, so a stop-word list written as stems
        # keeps working.
        stems = [s for s in stems if s not in stop_set]

    return ' '.join(stems)

# Always start from the raw documents (docs1 + docs2) rather than from X_str
# itself: re-running this cell would otherwise feed already-stemmed text
# back into process() and stem it a second time.
X_str=[process(x_str) for x_str in docs1+docs2]

### Test

In [109]:
def readAFile(nf):
    """Read a UTF-8 encoded file and return its lines joined by single spaces.

    The file is opened in binary mode and decoded line by line. Every line
    keeps its own line terminator, so in the returned text each line after
    the first is preceded by one extra space.

    Parameters
    ----------
    nf : str
        Path of the file to read.

    Returns
    -------
    The decoded content; an empty string for an empty file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If a line is not valid UTF-8.
    """
    # The context manager closes the file even when decoding fails.
    with open(nf, 'rb') as f:
        return ' '.join([raw_line.decode("utf-8") for raw_line in f])

path_test='data/movies1000/test/'
# One test document per line of the file.
docs_test = readAFile(path_test+"testSentiment.txt").split('\n')
# Splitting leaves an empty tail after the file's final newline; remove it.
# The tail is only dropped when it is blank, so a last document that is not
# newline-terminated is kept instead of being silently discarded.
if docs_test and not docs_test[-1].strip():
    docs_test.pop()
docs_test=[process(doc_test) for doc_test in docs_test]

## Modèles

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF weighted bag of words fitted on the processed training documents.
# max_df=0.55 ignores terms found in more than 55% of the documents;
# min_df=1 keeps every term that occurs at least once.
countVe = TfidfVectorizer(
    max_df=0.55,
    min_df=1,
    #max_features=1000,
)
# X and count are two names for the same document-term matrix.
X = count = countVe.fit_transform(X_str)

In [116]:
from sklearn import svm
C = 2.6  # SVM regularization parameter
# random_state is fixed because the LinearSVC solver uses a pseudo-random
# number generator while fitting; without a seed the learned weights, and
# therefore the predictions, can differ slightly from one run to the next.
# Note: despite its name, nb holds a linear SVM.
nb = svm.LinearSVC(C=C,max_iter=9000,random_state=0).fit(X, y)

In [117]:
# Project the test documents into the vocabulary learned on the training set
# (transform only, no refit), then predict one label per test document.
count_test = countVe.transform(docs_test)
pred_labels = nb.predict(count_test)

In [118]:
# Number of predictions produced (one per test document). The parenthesised
# form prints the same text on Python 2 and is also valid on Python 3.
print(len(pred_labels))

25000


In [119]:
# Write one prediction per line: 'C' when the predicted label is -1,
# 'M' otherwise. The context manager flushes and closes the file even if
# a write fails part-way through.
with open('sentim.txt', 'w') as f:
    for i in pred_labels:
        f.write('C\n' if i == -1 else 'M\n')

In [211]:
# Feature indices sorted by increasing weight in the fitted linear model.
w=nb.coef_[0].argsort()

neg=np.array(w[:10])   # indices of the 10 lowest weights
pos=np.array(w[-10:])  # indices of the 10 highest weights

# Indicator matrix with a single row: 1 on the columns listed in pos, 0
# elsewhere. inverse_transform is documented for input of shape
# (n_samples, n_features), so the row is built explicitly as 2-D.
test=np.zeros((1, len(w)))
test[0, pos]=1
# Map the selected columns back to the corresponding vocabulary terms.
ic=countVe.inverse_transform(test)
print(ic)

[array([u'enjoy', u'fun', u'great', u'hilari', u'job', u'matrix',
       u'perfect', u'perform', u'surpris', u'well'], 
      dtype='<U25')]


# TME8: word2vec

In [237]:
import gensim
import os

class MySentences(object):
    """Re-iterable stream of tokenised lines read from a directory.

    Every entry of ``dirname`` is opened as a text file and each of its
    lines is yielded as a list of whitespace-separated tokens. Nothing is
    kept in memory: each call to ``__iter__`` reads the files again, so the
    object can be iterated more than once.

    Parameters
    ----------
    dirname : str
        Directory whose entries are read. Every entry is passed to
        ``open``, so the directory is expected to contain only files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # os.listdir() returns entries in arbitrary order; sorting makes
        # the order of the stream the same on every pass and every machine.
        for fname in sorted(os.listdir(self.dirname)):
            # The context manager closes each file as soon as it has been
            # consumed instead of leaving the handle to the garbage collector.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line.split()

# Directory whose files feed the embedding model.
path1 = 'data/movies1000/all'

# MySentences reads the files lazily on every pass instead of loading the
# whole corpus into memory.
sentences = MySentences(path1)
model = gensim.models.Word2Vec(sentences)

In [241]:
# Ten nearest tokens to the vector "weekend" minus "sunday".
model.most_similar(
    positive=['weekend'],
    negative=['sunday'],
    topn=10,
)

[('season.', 0.646028459072113),
 ('night!', 0.6455487012863159),
 ('theater.', 0.6366403102874756),
 ('week.', 0.6299988031387329),
 ('weekend.', 0.6183199286460876),
 ('miniseries.', 0.6166147589683533),
 ('evening.', 0.614005982875824),
 ('day.', 0.6085823178291321),
 ("'70s.", 0.6062525510787964),
 ('list.', 0.5956918001174927)]