In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import sklearn
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [2]:
# Text pre-processing helper:
# keep only alphabetic words and lemmatize them.
#
def text_prep(t_prep):
    """Normalize raw texts into space-joined, lower-cased lemmas.

    Each text is tokenized with ``word_tokenize``. Tokens that are not purely
    alphabetic (``str.isalpha``) are dropped; the remaining ones are
    lower-cased and passed through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    t_prep : iterable of str
        Texts to process (list, tuple, ``pandas.Series``, ...). Elements are
        visited in iteration order, so a Series whose index is not
        ``0..n-1`` (e.g. after filtering or shuffling) is handled correctly.

    Returns
    -------
    list of str
        One processed string per input text, in input order.
    """
    # Build the lemmatizer once: it does not depend on the text being processed.
    lemmatizer = WordNetLemmatizer()
    t_train = []
    # Iterate over the values themselves. Indexing with ``t_prep[i]`` is a
    # label lookup on a Series and fails when the index is not 0..n-1.
    for text in t_prep:
        words = [tok.lower() for tok in word_tokenize(text) if tok.isalpha()]
        t_train.append(' '.join(lemmatizer.lemmatize(w) for w in words))
    return t_train


In [3]:
# Load the training data and derive X / y for fitting.
# The file is read without a header row, so the columns are labelled 0 and 1:
# column 0 is used as the review text, column 1 as the target label.
df_train = pd.read_csv('products_sentiment_train.tsv', header=None, sep='\t')

# NOTE: a mid-cell `df_train.head()` used to sit here; only the last expression
# of a cell is displayed, so its result was computed and thrown away.
X_train = text_prep(df_train[0])
y_train = df_train[1].values

In [5]:
# Eyeball the first few pre-processed texts and their labels to make sure
# the input data was read and transformed as expected.
for preview in (X_train, y_train):
    print(preview[:5])

['take around picture', 'i downloaded a trial version of computer associate ez firewall and antivirus and fell in love with a computer security system all over again', 'the plus the is a perfect solution if you need wireless coverage in a wider area or for a house a wa my case', 'i dont especially like how music file are unstructured basically they are just dumped into one folder with no organization like you might have in window explorer folder and subfolders', 'i wa using the cheapie pail and it worked ok until the opening device fell apart']
[1 1 1 0 1]


In [6]:
# ================================ Main part ================================ #
# A few preliminary experiments showed that TfidfVectorizer combined with
# LogisticRegression gives better scores than the alternatives tried on
# this task.
#
# make_pipeline names each step after its lower-cased class name; the
# "<step>__<param>" keys of the grid below rely on those names.
pipe = make_pipeline(
    TfidfVectorizer(min_df=1),
    LogisticRegression(),
)

# Parameter ranges to explore
param_grid = {
    "logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# Exhaustive search over the grid with 5-fold cross-validation
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the best cross-validation score and the parameters that produced it
best_score, best_params = grid.best_score_, grid.best_params_
print("Наилучшее значение перекр проверки: {:.2f}".format(best_score))
print("Наилучшие параметры:\n{}".format(best_params))



Наилучшее значение перекр проверки: 0.79
Наилучшие параметры:
{'logisticregression__C': 100, 'tfidfvectorizer__ngram_range': (1, 3)}


In [7]:
# Prepare the test data: the first row of this file is read as the header,
# and the texts are taken from its 'text' column.
df_test = pd.read_csv('products_sentiment_test.tsv', sep='\t')
text_test = text_prep(df_test['text'])
print(text_test[:5])

# Run the fitted grid search on the pre-processed test texts
y = grid.predict(text_test)
print(y[:5])

['so why the small digital elph rather than one of the other camera with better resolution or picture quality size because unless it small i won cary it around', 'way through the first disk we played on it naturally on day after purchase the dvd player froze', 'better for the zen micro is outlook compatibility', 'play gameboy color game on it with goboy', 'likewise i heard norton professional version is fine too']
[1 0 1 1 0]


In [8]:
# Export the predictions for Kaggle: a single 'y' column, with the row index
# named 'Id' so that it is written out as the 'Id' column of the CSV.
result = pd.DataFrame(y, columns=['y']).rename_axis('Id')
result.to_csv('result.csv')