In [2]:
import numpy as np
import pandas as pd
import re
import bs4
import time
import requests
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Parsing

In [None]:
url_list = ['https://market.yandex.kz/catalog--smartfony-i-umnye-chasy-v-kapshagae/54437/list?hid=91461&show-reviews=1&page=' + str(n) for n in range(1, 100)]

In [None]:
final = pd.DataFrame(columns=['review', 'review_type', 'rate'])

for url in url_list:
	try:
		time.sleep(25)
		print(url)
		response = requests.get(url)
		soup = bs4.BeautifulSoup(response.text, 'lxml')
		reviews_types = soup.find_all('dt', attrs={'class':'n-product-review-item__title'})
		reviews_types = [reviews_type.text for reviews_type in reviews_types]
		print(len(reviews_types))
		reviews = soup.find_all('dd', attrs={'class':'n-product-review-item__text'})
		reviews = [review.text for review in reviews]
		print(len(reviews))
		ratings =  soup.find_all('div', attrs={'class':'rating__value'})
		ratings = [rating.text for rating in ratings]
		del ratings[::2]
		print(len(ratings))
		rates = []
		c = 0
		for i, t in enumerate(reviews_types):
			if t == ('Недостатки: '):
				rates.append('neg')
			elif t == ('Достоинства: '):
				rates.append('pos')
			elif float(ratings[c]) > 3:
				rates.append('pos')
				c += 1
			else:
				rates.append('neg')
				c += 1
		d = {'review': reviews, 'review_type': reviews_types, 'rate': rates}
		print(len(reviews), len(reviews_types), len(rates))
		data = pd.DataFrame(d, columns=['review', 'review_type', 'rate'])
		final = final.append(data, ignore_index=True)
	except:
		print('didnt')

# Test Data Upload

In [8]:
test = pd.read_csv('test.csv', encoding='utf-8', lineterminator='\t', sep='\n', header=None, names=['text'])

In [133]:
test.text[0]

'Ужасно слабый аккумулятор, это основной минус этого аппарата, разряжается буквально за пару часов при включенном wifi и на макс подсветке, например если играть или смотреть видео, следовательно использовать можно только если есть постоянная возможность подзарядиться. Качества звука через динамик далеко не на высоте.Наблюдаются незначительные тормоза в некоторых приложениях и вообще в меню. Очень мало встроенной памяти, а приложения устанавливаются именно туда, с этим связанны неудобства - нужно постоянно переносить их на карту памяти.\rНесколько неудобно что нету отдельной кнопки для фото. Подумываю купить батарею большей емкость мб что нибудь измениться.'

# Test Data Preprocessing

In [4]:
mystem = Mystem() 
swords = stopwords.words("russian") + ["еще", "ещё", "меж", "зато", "пусть", "ага", "этот", "это", "почему", 
                        "весь", "ты", "он", "она", "они", "оно", "мы", "вы", "кто", "что", 
                        "сам", "сама", "само", "свой", "наш", "ваш", "их", "тот", "та", "те", 
                        "то", "раз", "твой", "мой", "кой", "кое", "все", "весь", "всё", "быть", "тот", 
                        "таки", "такой", "какой", "каждый", "который", "и", "а", "в", "б", "д", 
                        "е", "ж", "з", "к", "л", "м", "н", "о", "п", "р", "с", "у", "ф", "ч", 
                        "ц", "ш", "щ", "ь", "ъ","э", "ю", "я"]
swords = [word for word in swords if word not in ['хорошо', 'лучше', 'может', 'никогда', 'нельзя', 'всегда']]
punctuation = [char for char in punctuation if char not in ['?', '!']]

  if __name__ == '__main__':


In [5]:
def preprocess_text(text):
    text = re.sub("\d+", "", text.lower())
    text = re.sub("\s+", " ", text)
    tokens = mystem.lemmatize(text)
    tokens = [token for token in tokens if token not in swords 
              and token != " " and token.strip() not in punctuation]
    text = " ".join(tokens)
    return text

In [79]:
test['preprocessed'] = test.text.apply(preprocess_text)

# Parsed Data Upload for Training

In [121]:
data = pd.read_csv('final.csv', sep='\t', encoding='utf-8')

In [123]:
data.review = data.review.astype(str)
data.prep_review = data.prep_review.astype(str)

# Vectorizer and Model Selection

## Preprocessed Data

In [40]:
print('Processed results:', '\n')


vectorizers = [CountVectorizer(), TfidfVectorizer()]
vnames = ['CountVectorizer', 'TfidfVectorizer'] 
models = [LinearSVC(), LogisticRegression()]
mnames = ['LinearSVC', 'LogisticRegression']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['prep_review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Processed results: 





CountVectorizer  and  LinearSVC :  0.8253479115231521 





CountVectorizer  and  LogisticRegression :  0.8428679397390034 

TfidfVectorizer  and  LinearSVC :  0.8500438353403537 





TfidfVectorizer  and  LogisticRegression :  0.8217927142641205 



In [41]:
print('Processed results:', '\n')

vectorizers = [CountVectorizer(analyzer='word', ngram_range=(1,2)), 
               TfidfVectorizer(analyzer='word', ngram_range=(1,2))]
vnames = ['CountVectorizer 1-2', 'TfidfVectorizer1-2'] 
models = [LinearSVC(), LogisticRegression()]
mnames = ['LinearSVC', 'LogisticRegression']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['prep_review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Processed results: 





CountVectorizer 1-2  and  LinearSVC :  0.8388088879931475 





CountVectorizer 1-2  and  LogisticRegression :  0.8500337582506173 

TfidfVectorizer1-2  and  LinearSVC :  0.8536222099057792 





TfidfVectorizer1-2  and  LogisticRegression :  0.8244591122083943 



In [31]:
count_vecs = CountVectorizer(analyzer='word', ngram_range=(1,2)).fit_transform(data['prep_review'])

In [32]:
tfidf_vecs =  TfidfVectorizer(analyzer='word', ngram_range=(1,2)).fit_transform(data['prep_review'])

In [46]:
param_grid = {'C': [0.5, 0.7, 0.8, 0.9]}
grid = GridSearchCV(LinearSVC(), param_grid, cv=5)
grid.fit(tfidf_vecs, data['rate'])
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_) 

Best cross-validation score: 0.86
Best parameters:  {'C': 0.7}
Best estimator:  LinearSVC(C=0.7, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [47]:
param_grid = {'C': [0.5, 0.6, 0.7, 0.8]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(count_vecs, data['rate'])
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_) 



Best cross-validation score: 0.85
Best parameters:  {'C': 0.6}
Best estimator:  LogisticRegression(C=0.6, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [48]:
print('Processed results:', '\n')
vectorizers = [CountVectorizer(analyzer='word', ngram_range=(1,2)), 
               TfidfVectorizer(analyzer='word', ngram_range=(1,2))]
vnames = ['CountVectorizer 1-2', 'TfidfVectorizer1-2'] 
models = [LinearSVC(C=0.7), LogisticRegression(C=0.6)]
mnames = ['LinearSVC C=0.7', 'LogisticRegression C=0.6']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['prep_review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Processed results: 





CountVectorizer 1-2  and  LinearSVC C=0.7 :  0.8392532876505265 





CountVectorizer 1-2  and  LogisticRegression C=0.6 :  0.8509326346551116 

TfidfVectorizer1-2  and  LinearSVC C=0.7 :  0.8545210863102735 





TfidfVectorizer1-2  and  LogisticRegression C=0.6 :  0.7997692346450345 



## Raw Data

In [51]:
print('Unprocessed results:', '\n')

vectorizers = [CountVectorizer(), 
               TfidfVectorizer()]
vnames = ['CountVectorizer', 'TfidfVectorizer'] 
models = [LinearSVC(), LogisticRegression()]
mnames = ['LinearSVC', 'LogisticRegression']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Unprocessed results: 





CountVectorizer  and  LinearSVC :  0.844651584622361 





CountVectorizer  and  LogisticRegression :  0.8585730840933138 

TfidfVectorizer  and  LinearSVC :  0.8653065954552325 





TfidfVectorizer  and  LogisticRegression :  0.833909406963269 



In [54]:
print('Unprocessed results:', '\n')

vectorizers = [CountVectorizer(analyzer='word', ngram_range=(1,2)), 
               TfidfVectorizer(analyzer='word', ngram_range=(1,2))]
vnames = ['CountVectorizer', 'TfidfVectorizer'] 
models = [LinearSVC(), LogisticRegression()]
mnames = ['LinearSVC', 'LogisticRegression']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Unprocessed results: 





CountVectorizer  and  LinearSVC :  0.8527434876807579 





CountVectorizer  and  LogisticRegression :  0.8612646747619287 

TfidfVectorizer  and  LinearSVC :  0.8702473925530307 





TfidfVectorizer  and  LogisticRegression :  0.8370333047815791 



In [58]:
param_grid = {'C': [0.2, 0.3, 0.5, 0.7]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(count_vecs, data['rate'])
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_) 



Best cross-validation score: 0.86
Best parameters:  {'C': 0.5}
Best estimator:  LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [56]:
param_grid = {'C': [0.7, 0.9, 1., 1.2, 1.5]}
grid = GridSearchCV(LinearSVC(), param_grid, cv=5)
grid.fit(tfidf_vecs, data['rate'])
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_) 

Best cross-validation score: 0.87
Best parameters:  {'C': 1.2}
Best estimator:  LinearSVC(C=1.2, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [59]:
print('Unprocessed results:', '\n')

vectorizers = [CountVectorizer(analyzer='word', ngram_range=(1,2)), 
               TfidfVectorizer(analyzer='word', ngram_range=(1,2))]
vnames = ['CountVectorizer', 'TfidfVectorizer'] 
models = [LinearSVC(C=1.2), LogisticRegression(C=0.5)]
mnames = ['LinearSVC C=1.2', 'LogisticRegression C=0.5']

for vi, vectorizer in enumerate(vectorizers):
    for mi, model in enumerate(models):
        pline = Pipeline([("vectorizer", vectorizer),("classifier", model)])
        res = cross_val_score(pline, data['review'], data['rate'], scoring='accuracy', cv=5).mean()
        print(vnames[vi], ' and ', mnames[mi], ': ', res, '\n')

Unprocessed results: 





CountVectorizer  and  LinearSVC C=1.2 :  0.8505003275054165 





CountVectorizer  and  LogisticRegression C=0.5 :  0.8617181438000705 

TfidfVectorizer  and  LinearSVC C=1.2 :  0.8715936917418251 





TfidfVectorizer  and  LogisticRegression C=0.5 :  0.8011286340504862 



# Saving the Best Model

In [130]:
count = CountVectorizer(analyzer='word', ngram_range=(1,2))
count_train = count.fit_transform(data['review'])
count_test = count.transform(test.text)
joblib.dump(count, 'count.pkl')

['count.pkl']

In [125]:
lr6 = LogisticRegression(C=0.6).fit(count_train, data['rate'])

In [129]:
from sklearn.externals import joblib
joblib.dump(lr6, 'lr6.pkl')

['lr6.pkl']

In [126]:
preds_lr6 = lr6.predict(count_test)

In [127]:
sample = pd.read_csv('sample_submission.csv')
sample['y'] = preds_lr6
sample.to_csv('count_lr_unp.csv', index=False)