## Работа с набором данных Amazon Alexa

### EDA

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
import plotly.figure_factory as ff

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
df = pd.read_csv("amazon_alexa.tsv", sep="\t")

In [3]:
df.head(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [5]:
df["variation"].value_counts()

Black  Dot                      516
Charcoal Fabric                 430
Configuration: Fire TV Stick    350
Black  Plus                     270
Black  Show                     265
Black                           261
Black  Spot                     241
White  Dot                      184
Heather Gray Fabric             157
White  Spot                     109
White                            91
Sandstone Fabric                 90
White  Show                      85
White  Plus                      78
Oak Finish                       14
Walnut Finish                     9
Name: variation, dtype: int64

In [6]:
df.groupby("variation").agg({"rating":"mean"}).sort_values("rating", ascending=False).head()

Unnamed: 0_level_0,rating
variation,Unnamed: 1_level_1
Walnut Finish,4.888889
Oak Finish,4.857143
Charcoal Fabric,4.730233
Heather Gray Fabric,4.694268
Configuration: Fire TV Stick,4.591429


In [7]:
df["feedback"].value_counts().head()

1    2893
0     257
Name: feedback, dtype: int64

### Построение изображений с помощью plotly

In [10]:
trace0 = go.Bar(
    x = df[df["feedback"]==1]["feedback"].value_counts().index.values,
    y = df[df["feedback"]==1]["feedback"].value_counts().values,
    name = "Positive Feedback")

trace1 = go.Bar(
    x = df[df["feedback"]==0]["feedback"].value_counts().index.values,
    y = df[df["feedback"]==0]["feedback"].value_counts().values,
    name = "Negative Feedback")

data = [trace0, trace1]

layout = go.Layout(yaxis=dict(title="Count"), xaxis=dict(title="Feedback"), title="Feedback Distribution")
fig = go.Figure(data=data, layout=layout)
fig.data[0].marker.line.width = 1
fig.data[0].marker.line.color = "black"
fig.data[1].marker.line.width = 1
fig.data[1].marker.line.color = "black"
py.iplot(fig)


In [11]:
trace0 = go.Bar(
    x = df[df["feedback"]==1]["variation"].value_counts().index.values,
    y = df[df["feedback"]==1]["variation"].value_counts().values,
    name = "Positive Feedback")

trace1 = go.Bar(
    x = df[df["feedback"]==0]["variation"].value_counts().index.values,
    y = df[df["feedback"]==0]["variation"].value_counts().values,
    name = "Negative Feedback")

data = [trace0, trace1]

layout = go.Layout(yaxis=dict(title="Count"), xaxis=dict(title="Feedback"), title="Feedback Distribution")
fig = go.Figure(data=data, layout=layout)
fig.data[0].marker.line.width = 1
fig.data[0].marker.line.color = "black"
fig.data[1].marker.line.width = 1
fig.data[1].marker.line.color = "black"
py.iplot(fig)

In [14]:
rating = df.groupby('variation').agg({"rating":"mean"})
rating['variation'] = rating.index
rating.reset_index(drop=True)

trace = go.Bar(x=rating['variation'], y=rating['rating'])

layout = go.Layout(yaxis=dict(title='Average Rating'),
                   xaxis=dict(title='Alexa Product'),title='Product - Avarage Rating Distribution')

fig = go.Figure(data=trace, layout=layout)
fig.data[0].marker.line.width = 1
fig.data[0].marker.line.color = "black"
py.iplot(fig)

In [15]:
trace = go.Bar(
            x = df["rating"].value_counts().index.values,
            y = df["rating"].value_counts().values,
            name='Quantity')

layout = go.Layout(yaxis=dict(title='Quantity'),
                   xaxis=dict(title='Ratings'),title='# of Votes Quantity')

fig = go.Figure(data=trace, layout=layout)
fig.data[0].marker.line.width = 2
fig.data[0].marker.line.color = "black"
py.iplot(fig)

### Очистка данных и извлечение признаков

In [17]:
df = df.drop(["date", "rating"], axis=1)
df.head()

Unnamed: 0,variation,verified_reviews,feedback
0,Charcoal Fabric,Love my Echo!,1
1,Charcoal Fabric,Loved it!,1
2,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,Charcoal Fabric,Music,1


In [18]:
variation_dummies = pd.get_dummies(df["variation"], drop_first=True)
variation_dummies.head()

Unnamed: 0,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [19]:
df.drop(["variation"], axis=1, inplace=True)
df.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [20]:
df = pd.concat([df, variation_dummies], axis=1)
df.head()

Unnamed: 0,verified_reviews,feedback,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot
0,Love my Echo!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,Loved it!,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,"Sometimes while playing a game, you can answer...",1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,I have had a lot of fun with this thing. My 4 ...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,Music,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [21]:
vectorizer = CountVectorizer()
alexa_countvectorizer = vectorizer.fit_transform(df["verified_reviews"])
alexa_countvectorizer.shape

(3150, 4044)

In [22]:
alexa_countvectorizer.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
df.drop(["verified_reviews"], axis=1, inplace=True)
df.head()

Unnamed: 0,feedback,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,Walnut Finish,White,White Dot,White Plus,White Show,White Spot
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [24]:
encoded_reviews = pd.DataFrame(alexa_countvectorizer.toarray())
encoded_reviews.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4034,4035,4036,4037,4038,4039,4040,4041,4042,4043
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df = pd.concat([df, encoded_reviews], axis=1)
df.head()

Unnamed: 0,feedback,Black Dot,Black Plus,Black Show,Black Spot,Charcoal Fabric,Configuration: Fire TV Stick,Heather Gray Fabric,Oak Finish,Sandstone Fabric,...,4034,4035,4036,4037,4038,4039,4040,4041,4042,4043
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Построение и настройка модели

In [27]:
X = df.drop(["feedback"], axis=1)
y = df["feedback"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [28]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
X.columns = X.columns.astype(str)

In [30]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2362, 4059), (788, 4059), (2362,), (788,))

In [32]:
rf_params = {"n_estimators": [100,200,500,1000],
"max_features": [3,5,7],
"min_samples_split": [2,5,10,30],
"max_depth": [3,5,8,None]}

rf_model = RandomForestClassifier(random_state=42)

rf_cv_model = GridSearchCV(rf_model, rf_params, cv=10, n_jobs=-1, verbose=1).fit(X_train, y_train)




Fitting 10 folds for each of 192 candidates, totalling 1920 fits


NameError: name 'fit' is not defined

In [33]:
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_).fit(X,y)

In [34]:
tuned = [rf_tuned]
print("Best Model Parameters and Scores:")
for i in tuned:
    score = cross_val_score(i, X_train, y_train, cv = 10).mean()
    print(i,"score:",score)

Best Model Parameters and Scores:
RandomForestClassifier(max_features=7) score: 0.9309912036043768
