## Анализ тональности (обзор ресторанов)

### Подключение библиотек

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### Загрузка данных

In [3]:
df = pd.read_csv('Restaurant reviews.csv')
print(df.head())
print(df.shape)

        Restaurant              Reviewer  \
0  Beyond Flavours     Rusha Chakraborty   
1  Beyond Flavours  Anusha Tirumalaneedi   
2  Beyond Flavours       Ashok Shekhawat   
3  Beyond Flavours        Swapnil Sarkar   
4  Beyond Flavours                Dileep   

                                              Review Rating  \
0  The ambience was good, food was quite good . h...      5   
1  Ambience is too good for a pleasant evening. S...      5   
2  A must try.. great food great ambience. Thnx f...      5   
3  Soumen das and Arun was a great guy. Only beca...      5   
4  Food is good.we ordered Kodi drumsticks and ba...      5   

                  Metadata             Time  Pictures    7514  
0   1 Review , 2 Followers  5/25/2019 15:54         0  2447.0  
1  3 Reviews , 2 Followers  5/25/2019 14:20         0     NaN  
2  2 Reviews , 3 Followers  5/24/2019 22:54         0     NaN  
3    1 Review , 1 Follower  5/24/2019 22:11         0     NaN  
4  3 Reviews , 2 Followers  5/24/201

### Очистка данных

In [4]:
df = df.drop(['Restaurant', 'Reviewer', 'Metadata', 'Pictures', '7514'], axis=1)
df.head()

Unnamed: 0,Review,Rating,Time
0,"The ambience was good, food was quite good . h...",5,5/25/2019 15:54
1,Ambience is too good for a pleasant evening. S...,5,5/25/2019 14:20
2,A must try.. great food great ambience. Thnx f...,5,5/24/2019 22:54
3,Soumen das and Arun was a great guy. Only beca...,5,5/24/2019 22:11
4,Food is good.we ordered Kodi drumsticks and ba...,5,5/24/2019 21:37


### Преобразование времени

In [5]:
df['Time'] = list(map(lambda data: str(data).split()[0], df['Time']))
df['Time'] = list(map(lambda data: str(data).split('/')[-1], df['Time']))
df.head()

Unnamed: 0,Review,Rating,Time
0,"The ambience was good, food was quite good . h...",5,2019
1,Ambience is too good for a pleasant evening. S...,5,2019
2,A must try.. great food great ambience. Thnx f...,5,2019
3,Soumen das and Arun was a great guy. Only beca...,5,2019
4,Food is good.we ordered Kodi drumsticks and ba...,5,2019


In [6]:
df['Time'].unique()

array(['2019', '2018', '2017', '2016', 'nan'], dtype=object)

In [7]:
df['Time'] = np.where(df['Time'] == 'nan', df['Time'].value_counts().idxmax(), df['Time'])
df['Time'].unique()

array(['2019', '2018', '2017', '2016'], dtype=object)

In [10]:
mmsTime = MinMaxScaler()
mmsTime.fit(df[['Time']])
df['Time'] = mmsTime.transform(df[['Time']])
df['Time'].unique()

array([1.        , 0.66666667, 0.33333333, 0.        ])

In [11]:
df.head()

Unnamed: 0,Review,Rating,Time
0,"The ambience was good, food was quite good . h...",5,1.0
1,Ambience is too good for a pleasant evening. S...,5,1.0
2,A must try.. great food great ambience. Thnx f...,5,1.0
3,Soumen das and Arun was a great guy. Only beca...,5,1.0
4,Food is good.we ordered Kodi drumsticks and ba...,5,1.0


### Работа с рейтингом

In [12]:
df['Rating'].unique()

array(['5', '4', '1', '3', '2', '3.5', '4.5', '2.5', '1.5', 'Like', nan],
      dtype=object)

In [13]:
df['Rating'].isna().sum()

38

In [14]:
df['Rating'].value_counts()

Rating
5       3832
4       2373
1       1735
3       1193
2        684
4.5       69
3.5       47
2.5       19
1.5        9
Like       1
Name: count, dtype: int64

In [15]:
df['Rating'] = np.where(df['Rating'] == 'Like', df['Rating'].value_counts().idxmax(), df['Rating'])

In [16]:
df['Rating'].unique()

array(['5', '4', '1', '3', '2', '3.5', '4.5', '2.5', '1.5', nan],
      dtype=object)

In [17]:
df['Rating'] = list(map(lambda data: float(data) >= 3, df['Rating']))
df['Rating'].unique()

array([ True, False])

### Работа с текстом

In [18]:
df['Review'].isnull().sum()

45

In [19]:
df['Review'] = df['Review'].fillna('Nothing')
df['Review'].isnull().sum()

0

### Обработка естественного языка

In [20]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
ps = PorterStemmer()

In [22]:
reviewList = list(map(lambda data: word_tokenize(data), df['Review']))

In [24]:
for index, word_list in enumerate(reviewList):
    reviewList[index] = ' '.join([ps.stem(str(word)) for word in word_list if not word in stopwords.words('english') and word not in ['.', ',', '?', '@', '$', '/'] and not word.isspace()])

In [25]:
reviewList

['the ambienc good food quit good saturday lunch cost effect good place sate brunch one also chill friend parent waiter soumen da realli courteou help',
 'ambienc good pleasant even servic prompt food good over good experi soumen da - kudo servic',
 'a must tri .. great food great ambienc thnx servic pradeep subroto my person recommend penn alfredo pasta : ) ....... also music background amaz',
 'soumen da arun great guy onli behavior sincereti and good food cours i would like visit place',
 'food good.w order kodi drumstick basket mutton biryani all good thank pradeep he serv well we enjoy ambienc also good',
 'ambianc good servic good food apradeecp subro best servic food good papiya good hostess ur caption good thi 4star restaur',
 'it nice place ambienc differ food order tasti servic also gud worth visit it reason well realli must visit place',
 'well read mani review final visit place ... ambienc good come food crispi corn nice tawa fish ok ... basket biryani disappoint us ... bir

In [26]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF" 
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)

regexPuct = r"[\s\w\d]"

In [27]:
	
for index, word_list in enumerate(reviewList):
    reviewList[index] = re.sub(emoji_pattern, r'', word_list)
    reviewList[index] = ''.join(re.findall(regexPuct, reviewList[index], re.MULTILINE))
    reviewList[index] = ' '.join(reviewList[index].split())

In [29]:
reviewList

['the ambienc good food quit good saturday lunch cost effect good place sate brunch one also chill friend parent waiter soumen da realli courteou help',
 'ambienc good pleasant even servic prompt food good over good experi soumen da kudo servic',
 'a must tri great food great ambienc thnx servic pradeep subroto my person recommend penn alfredo pasta also music background amaz',
 'soumen da arun great guy onli behavior sincereti and good food cours i would like visit place',
 'food goodw order kodi drumstick basket mutton biryani all good thank pradeep he serv well we enjoy ambienc also good',
 'ambianc good servic good food apradeecp subro best servic food good papiya good hostess ur caption good thi 4star restaur',
 'it nice place ambienc differ food order tasti servic also gud worth visit it reason well realli must visit place',
 'well read mani review final visit place ambienc good come food crispi corn nice tawa fish ok basket biryani disappoint us biryani ok flatter claim staff po

In [28]:
df['Review'] = reviewList
df.head()

Unnamed: 0,Review,Rating,Time
0,the ambienc good food quit good saturday lunch...,True,1.0
1,ambienc good pleasant even servic prompt food ...,True,1.0
2,a must tri great food great ambienc thnx servi...,True,1.0
3,soumen da arun great guy onli behavior sincere...,True,1.0
4,food goodw order kodi drumstick basket mutton ...,True,1.0


### Разделение данных

In [30]:
X = df['Review']
y = df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23, stratify=y)

In [31]:
X_train.iloc[0]

'one word yukk uncook smelli grill prawn tasteless chicken biriyani kunda dum serv dumbest waiter b narasimha when report qualiti gave irrespons respons serv bother anyth els'

In [32]:
y_train.iloc[0]

False

### Построение модели

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [35]:
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1,3))), ('classifier', SVC())
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.8975

### Тестирование модели

In [36]:
positive_sentence = "I absolutely love this product! It has exceeded all my expectations. It's well-designed, durable, and incredibly useful. I can't imagine my life without it now. I would highly recommend it to anyone looking for a high-quality, reliable product"
negative_sentence = "I bought this product, and it was a complete waste of money. It stopped working after just a week, and the customer service was unhelpful and rude. I will never purchase anything from this company again."
sentence = "I purchased this product, and it has been nothing but a disappointment. The quality is abysmal, and it didn't even last a week before falling apart. It's clear that the manufacturer prioritized cutting costs over producing a reliable item. I would strongly advise against buying this product – it's a complete waste of money."
sentence_2 = "I hate this product"

In [37]:
pipeline.predict([positive_sentence, negative_sentence, sentence, sentence_2])

array([ True,  True,  True,  True])

In [38]:
def preprocess(text):
    text = word_tokenize(text)
    text = " ".join([ps.stem(str(word)) for word in text if not word in stopwords.words("english") and word not in [".", ",", "?", "@", "$", "/"] and not word.isspace()])
    text = re.sub(emoji_pattern, r'', text)
    text = "".join(re.findall(regexPuct, text, re.MULTILINE))
    text = " ".join(text.split())

    return text

In [39]:
positive_sentence = preprocess(positive_sentence)
negative_sentence = preprocess(negative_sentence)
sentence = preprocess(sentence)
sentence_2 = preprocess(sentence_2)

In [41]:
sentence_2

'i hate product'

In [40]:
pipeline.predict([positive_sentence, negative_sentence, sentence, sentence_2])

array([ True, False, False,  True])