##Импорты и считывание

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem.snowball import SnowballStemmer
from nltk import wordnet, pos_tag
from nltk import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.dummy import DummyRegressor

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("train.csv", encoding='CP1256')

In [3]:
df.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

In [4]:
df.Review_Title.fillna('', inplace=True)

In [5]:
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


Как я понимаю, это очень и очень творческая домашка, поэтому я буду объяснять +- каждый шаг, чтобы вам было удобно проверять

##Вспомогательные функции для очистки текста

In [6]:
def clean(text):
    #deleting everything besides words and numbers
    return ' '.join(re.findall(r'\w+', text)).lower()

In [7]:
clean(df.Review_Text[1547])

'i arrived late at night from thailand looking forward to a drink at the bar only to be told that it is a muslim hotel and alcohol is strictly forbidden other than that disappointment it was very clean good staff and good value probably a hotel that not too many westerners stay at but good location'

Нам не очень нужны окончания, чтобы все было понятно, поэтому применяем $SnowballStemmer$

In [8]:
def stem(text): #глупая версия, не всегда точная
    stemmer = SnowballStemmer(language='english')
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [9]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

def lem(sent): #мне кажется, более умная версия
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])




##Переходим к обучению моделек

Работаем с $CountVectorizer$

Попробуем с $stemming$

In [10]:
def normalize(text):
    return stem(clean(text))

In [11]:
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [12]:
X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

In [14]:
X_train.head()

Unnamed: 0,Review_Title,Review_Text
1547,clean and big room,i arriv late at night from thailand look forwa...
1917,low price hotel with high qualiti,notic mani chang in this hotel check in proces...
543,bad place to stay,too mani mosquito late check in warm room frid...
1535,it was near to place where we had to go for engag,stay in hotel was comfort transport was easili...
494,overal good stay with clean room,stay was good overal i stay there for three da...


In [15]:
CV = CountVectorizer()
X_train_counts = CV.fit_transform(X_train.Review_Text)
X_test_counts = CV.transform(X_test.Review_Text)



In [16]:
dr = DummyRegressor()
dr.fit(X_train_counts, y_train)
y_pred_dr = dr.predict(X_test_counts)

print(mse(y_test, y_pred_dr))

455.85922892683305


Посмотрим, какое дерево нам лучше подойдет

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

trees = {
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state = 42),
    'RandomForestRegressor': RandomForestRegressor(random_state = 42)
}

for name in trees:
    cur_mod = trees[name] 
    cur_mod.fit(X_train_counts, y_train)
    y_pred = cur_mod.predict(X_test_counts)
    print(f'Для модели {name} mse: {mse(y_test, y_pred)}')

Для модели DecisionTreeRegressor mse: 358.3306217895322
Для модели RandomForestRegressor mse: 208.6526009265245


Ура, наша новая рабочая модель - $RandomForestRegressor$

Теперь подумаю о том, что, может быть, стоило учить на названии отзыва или на (название + текст) отзыва

In [18]:
CV = CountVectorizer()
X_train_counts_title = CV.fit_transform(X_train.Review_Title)
X_test_counts_title = CV.transform(X_test.Review_Title)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_counts_title, y_train)
y_pred_title = rfr.predict(X_test_counts_title)
print(f'Для RFR MSE на только названиях: {mse(y_test, y_pred_title)}')


Для RFR MSE на только названиях: 278.51905082214216


In [19]:
CV = CountVectorizer()
X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название: 180.85505756832572


Теперь попробуем с $lemmatizer$

In [20]:
def normalize(text):
    return lem(clean(text))

In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

In [24]:
X_train.head()

Unnamed: 0,Review_Title,Review_Text
1547,clean and big room,i arrive late at night from thailand look forw...
1917,low price hotel with high quality,notice many change in this hotel check in proc...
543,bad place to stay,too many mosquito late check in warm room frid...
1535,it be near to place where we have to go for engag,stay in hotel be comfortable transport be easi...
494,overall good stay with clean room,stay be good overall i stay there for three da...


In [25]:
CV = CountVectorizer()
X_train_counts = CV.fit_transform(X_train.Review_Text)
X_test_counts = CV.transform(X_test.Review_Text)



In [26]:
dr = DummyRegressor()
dr.fit(X_train_counts, y_train)
y_pred_dr = dr.predict(X_test_counts)

print(mse(y_test, y_pred_dr))

455.85922892683305


In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

trees = {
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state = 42),
    'RandomForestRegressor': RandomForestRegressor(random_state = 42)
}

for name in trees:
    cur_mod = trees[name] 
    cur_mod.fit(X_train_counts, y_train)
    y_pred = cur_mod.predict(X_test_counts)
    print(f'Для модели {name} mse: {mse(y_test, y_pred)}')

Для модели DecisionTreeRegressor mse: 347.88122570456756
Для модели RandomForestRegressor mse: 206.26709980400636


In [28]:
CV = CountVectorizer()
X_train_counts_title = CV.fit_transform(X_train.Review_Title)
X_test_counts_title = CV.transform(X_test.Review_Title)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_counts_title, y_train)
y_pred_title = rfr.predict(X_test_counts_title)
print(f'Для RFR MSE на только названиях: {mse(y_test, y_pred_title)}')


Для RFR MSE на только названиях: 281.58361836394437


In [29]:
CV = CountVectorizer()
X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название: 184.05559002810276


В общем глобально круче всего работает всё на $lemmatizer$, модель - $RandomForestRegressor$

Попробуем еще подабовлять разные параметры

##Эксперименты с RandomForestRegressor

In [30]:
f = open('negative-words.txt', encoding='CP1256')
lines = f.readlines()
bad_words = [line[:-1] for line in lines]
print(bad_words)




In [31]:
f = open('positive-words.txt', encoding='CP1256')
lines = f.readlines()
good_words = [line[:-1] for line in lines]
print(good_words)

['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade', 'accolades', 'accommodative', 'accomodative', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accurate', 'accurately', 'achievable', 'achievement', 'achievements', 'achievible', 'acumen', 'adaptable', 'adaptive', 'adequate', 'adjustable', 'admirable', 'admirably', 'admiration', 'admire', 'admirer', 'admiring', 'admiringly', 'adorable', 'adore', 'adored', 'adorer', 'adoring', 'adoringly', 'adroit', 'adroitly', 'adulate', 'adulation', 'adulatory', 'advanced', 'advantage', 'advantageous', 'advantageously', 'advantages', 'adventuresome', 'adventurous', 'advocate', 'advocated', 'advocates', 'affability', 'affable', 'affably', 'affectation', 'affection', 'affectionate', 'affinity', 'affirm', 'affirmation', 'affirmative', 'affluence', 'affluent', 'afford', 'affordable', 'affordably', 'afordable', 'agile', 'agilely', 'agility', 'agreeable', 'ag

In [32]:
def normalize(text):
    cur = ' '.join([word for word in text.split() if not (word in bad_words)])
    return lem(clean(cur))

In [33]:
#попробуем удалить ВСЕ плохие слова

CV = CountVectorizer()

X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)


rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название с удаленными плохими словами: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название с удаленными плохими словами: 199.50551400359578


In [34]:
def normalize(text):
    cur = ' '.join([word for word in text.split() if  (word in good_words)])
    return lem(clean(cur))

In [35]:
#попробуем оставить только хорошие слова

CV = CountVectorizer()

X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)


rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название с только хорошими словами: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название с только хорошими словами: 340.0259490992173


In [36]:
def normalize(text):
    cur = ' '.join([word for word in text.split() if not (word in good_words)])
    return lem(clean(cur))

In [37]:
#попробуем удалить ВСЕ хорошие слова

CV = CountVectorizer()

X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)


rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название с удаленными хорошими словами: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название с удаленными хорошими словами: 191.5259047083004


In [38]:
def normalize(text):
    cur = ' '.join([word for word in text.split() if (word in bad_words)])
    return lem(clean(cur))

In [39]:
#попробуем оставить только плохие слова

CV = CountVectorizer()

X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)


rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название с только плохими словами: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название с только плохими словами: 354.9335509951479


In [40]:
def normalize(text):
    cur = ' '.join([word for word in text.split() if  ((word in bad_words) or (word in good_words))])
    return lem(clean(cur))

In [41]:
#попробуем оставить только плохие и хорошие слова

CV = CountVectorizer()

X, y = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X.Review_Text = X.Review_Text.apply(normalize)
X.Review_Title = X.Review_Title.apply(normalize)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)


rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
print(f'Для RFR MSE на текст + название с только хорошими и плохими словами: {mse(y_test, y_pred_glob)}')

Для RFR MSE на текст + название с только хорошими и плохими словами: 301.30301078492226


##Вывод

В общем-то MSE лучшего, чем  $184.05559002810276$ (для моего теста) я не получила.

Это достигается на $CountVectorizer(), RandomForestRegressor()$

In [42]:
def normalize(text):
    return lem(clean(text))

In [43]:
X_train, y_train = df.drop(columns=['Id', 'Hotel_name','Rating']), df.Rating 
X_train.Review_Text = X_train.Review_Text.apply(normalize)
X_train.Review_Title = X_train.Review_Title.apply(normalize)

In [46]:
X_test = pd.read_csv('test.csv', encoding= 'CP1256')
X_test.Review_Text.fillna('', inplace = True)
X_test.Review_Title.fillna('', inplace = True)
CV = CountVectorizer()
X_train_counts_glob = CV.fit_transform(X_train.Review_Title + ' ' + X_train.Review_Text)
X_test_counts_glob = CV.transform(X_test.Review_Title + ' ' + X_test.Review_Text)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train_counts_glob, y_train)
y_pred_glob = rfr.predict(X_test_counts_glob)
df_ans = pd.DataFrame()
df_ans['Id'] = X_test.Id
df_ans['Rating'] = y_pred_glob

df_ans.to_csv('ans.csv', index = False)