# Соревнование по сентимент-анализу

## Сентимент-анализ отзывов на товары (простая версия)

In [9]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import nltk
import re
import string
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lera\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

#### Список стоп-слов:

In [10]:
stop_words = nltk.corpus.stopwords.words('english')

#### Загрузим данные:

In [11]:
train = pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'target'])
test = pd.read_csv('products_sentiment_test.tsv', sep='\t', index_col='Id')

In [12]:
sample = pd.read_csv('products_sentiment_sample_submission.csv')

In [13]:
train.text.head()

0            2 . take around 10,000 640x480 pictures .
1    i downloaded a trial version of computer assoc...
2    the wrt54g plus the hga7t is a perfect solutio...
3    i dont especially like how music files are uns...
4    i was using the cheapie pail ... and it worked...
Name: text, dtype: object

#### Функция для преобразования текста отзыва. Удаляем все символы пунктуации, слова с цифрами, производим лемматизацию:

In [14]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [15]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

regex = re.compile('[%s]' % re.escape(string.punctuation))

def filt(sent):
    sent = regex.sub('', sent)
    words = [x.strip() for x in sent.split() if not hasNumbers(x)]
    result = []
    for word in words:
        word = wordnet_lemmatizer.lemmatize(word)
        if word in stop_words:
            continue
        result.append(word)
    return ' '.join(result)

In [16]:
train['text_filt'] = train.text.apply(filt)

In [17]:
test['text_filt'] = test.text.apply(filt)

#### Выберем классификатор:

In [18]:
def score(clf):
    scores = cross_val_score(clf, train.text_filt, train.target, cv=5)
    print("CLF score mean = {:.5f}, std = {:.5f}".format(scores.mean(), scores.std()))

In [19]:
for classf in [LogisticRegression, SGDClassifier, LinearSVC, MultinomialNB]:
    clf = make_pipeline(CountVectorizer(ngram_range=(1,1), stop_words='english'), classf())
    score(clf)

CLF score mean = 0.75198, std = 0.01793
CLF score mean = 0.71750, std = 0.01544




CLF score mean = 0.73398, std = 0.01360
CLF score mean = 0.75448, std = 0.01255


#### Посмотрим на предложения, которые мы плохо определяем:

In [20]:
from sklearn.cross_validation import train_test_split

ttrain, ttest = train_test_split(train)



In [21]:
clf = make_pipeline(CountVectorizer(ngram_range=(1,1)), MultinomialNB())
clf.fit(ttrain.text_filt, ttrain.target)
predictions = clf.predict(ttest.text_filt)

for x in ttest[ttest.target != predictions].iterrows():
    print("{}: {}".format(x[1]['target'], x[1]['text']))

1: their website had a download that helped remove those , although i had to manually remove a good number of files from the " programs " directory .
1: by the way , i use the norton 2003 corporate at work with absolutely no trouble .
1: multimedia functions are enhanced by multi-slide mms messages and a gallery for organizing digital content captured or downloaded with the phone .
0: i 'm giving it 1 star because as compared to other nokia phones , its a 1 , compared to other blunders by other companies ( such a samsung and motorrola ) it might be good .... but if you 're getting a nokia , don 't get this one .
1: less than a cheap cordless drill or a laminate trimmer .
0: i have read the installation instructions for both nis 2004 and nav 2004 prior to installation , but still ended up with the same result ... junk software .
0: the quick sync is n't any good becuase it either needs to be configured properly or just is n't very useful . 
0: this player is not worth any price and i re

#### Обучим классификатор:

In [22]:
clf = make_pipeline(CountVectorizer(ngram_range=(1,1)), MultinomialNB())
clf.fit(train.text_filt, train.target)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

#### Сохраним предсказания:

In [23]:
predictions = clf.predict(test.text_filt)

In [24]:
out = pd.DataFrame(predictions, index=test.index, columns=['y'])

In [25]:
out.to_csv('submission.csv')