In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import re

## データの前処理

In [6]:
# Load the raw review data from CSV and preview the first rows
df = pd.read_csv("Musical_instruments_reviews.csv")
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [7]:
# Count missing values per column
df.isnull().sum()

reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         7
overall            0
summary            0
unixReviewTime     0
reviewTime         0
dtype: int64

In [8]:
# Check the dtype of each column
df.dtypes

reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
dtype: object

In [9]:
# Show the rows whose reviewText is missing
df[df["reviewText"].isnull()]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
2927,AF7AQHLX1UV1Z,B0002H0H4A,Smoothbassman,"[101, 105]",,5.0,The Pop Rocks with the Yeti,1342656000,"07 19, 2012"
4401,AU3GYRAKBUAEU,B000CD1R7K,Jon Norris,"[189, 192]",,4.0,great foot rest,1258416000,"11 17, 2009"
8739,A3N93PUHEMQU34,B004OU2IQG,"L. Su ""Flying train""","[3, 3]",,5.0,To make you sound like a pro,1388102400,"12 27, 2013"
9175,AMEZJVXMSEPY0,B005FKF1PY,M. Luke,"[1, 1]",,5.0,Mini tech for musicians,1319500800,"10 25, 2011"
9306,A3ABAA36CZVYNX,B005M0TKL8,Dave,"[31, 32]",,5.0,"No power = No Sound, But It Sounds GREAT!",1342569600,"07 18, 2012"
9313,A3OEBPVLF35YQL,B005M0TKL8,StratDude,"[20, 21]",,5.0,This would be a must-have at twice the price.,1350172800,"10 14, 2012"
9342,A3OA4DV4L81N1D,B005NACC6M,"P.K. Frary ""Gochugogi""","[4, 4]",,5.0,Great sound and features for the price!,1387411200,"12 19, 2013"


In [10]:
# Replace missing reviewText values with an empty string
df["reviewText"] = df["reviewText"].fillna('')

In [12]:
# Re-count missing values per column after the fill
df.isnull().sum()

reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         0
overall            0
summary            0
unixReviewTime     0
reviewTime         0
dtype: int64

## 絵文字・url・html・改行・半角スペース・全角スペースが２つ以上連続で含まれている可能性があるので削除する

## reviewTextに適用

In [14]:
def clean_text(text):
  """
  Strip URLs, HTML tags, punctuation/symbols (emoji included) and line breaks.

  URLs, tags and line breaks are each replaced by a single space so that the
  words on either side are not fused together. Punctuation and symbols are
  deleted outright (for example "It's" becomes "Its"). Runs of spaces left
  behind are not collapsed here.

  Args:
    text (str): Text to clean.

  Returns:
    str: The cleaned text.
  """

  # Drop URLs. Match up to the next whitespace, double quote or angle bracket
  # so that query strings containing "&", "+", "," etc. are removed in full
  # instead of leaving a fragment behind.
  text = re.sub(r'https?://[^\s<>"]+', " ", text)

  # Drop HTML tags, leaving a space so "good<br>sound" stays two words.
  text = re.sub(r"<[^>]+>", " ", text)

  # Drop every character that is neither a word character nor whitespace.
  # This removes emoji, but also all punctuation and other symbols.
  text = re.sub(r"[^\w\s]", "", text)

  # Turn line breaks into a space rather than deleting them, so the last word
  # of one line is not glued to the first word of the next.
  text = re.sub(r"[\r\n]+", " ", text)

  return text

In [15]:
# Apply clean_text to every reviewText value. Series.map replaces the
# row-by-row df.at loop and does not depend on the index being 0..n-1.
df["reviewText"] = df["reviewText"].map(clean_text)

In [16]:
# Inspect reviewText after clean_text has been applied
df["reviewText"]

0        Not much to write about here but it does exact...
1        The product does exactly as it should and is q...
2        The primary job of this device is to block the...
3        Nice windscreen protects my MXL mic and preven...
4        This pop filter is great It looks and performs...
                               ...                        
10256                 Great just as expected  Thank to all
10257    Ive been thinking about trying the Nanoweb str...
10258    I have tried coated strings in the past  inclu...
10259    Well MADE by Elixir and DEVELOPED with Taylor ...
10260    These strings are really quite good but I woul...
Name: reviewText, Length: 10261, dtype: object

In [23]:
def remove_extra_spaces(text):
  """
  Normalise whitespace in text.

  Every run of whitespace characters (full-width spaces included) becomes one
  half-width space, and any space left at the very start or end is dropped.

  Args:
    text (str): Text to normalise.

  Returns:
    str: Text with single spaces between tokens and no surrounding space.
  """
  # After collapsing, the only whitespace that can remain at either edge is a
  # single " ", so stripping that one character trims both ends.
  single_spaced = re.sub(r"\s+", " ", text)
  return single_spaced.strip(" ")

In [24]:
# Apply remove_extra_spaces to every reviewText value. Series.map replaces the
# row-by-row df.at loop and does not depend on the index being 0..n-1.
df["reviewText"] = df["reviewText"].map(remove_extra_spaces)

In [25]:
# Inspect reviewText after whitespace normalisation
df["reviewText"]

0        Not much to write about here but it does exact...
1        The product does exactly as it should and is q...
2        The primary job of this device is to block the...
3        Nice windscreen protects my MXL mic and preven...
4        This pop filter is great It looks and performs...
                               ...                        
10256                  Great just as expected Thank to all
10257    Ive been thinking about trying the Nanoweb str...
10258    I have tried coated strings in the past includ...
10259    Well MADE by Elixir and DEVELOPED with Taylor ...
10260    These strings are really quite good but I woul...
Name: reviewText, Length: 10261, dtype: object

## summaryに適用

In [27]:
# Inspect summary before cleaning
df["summary"]

0                                                     good
1                                                     Jake
2                                     It Does The Job Well
3                            GOOD WINDSCREEN FOR THE MONEY
4                    No more pops when I record my vocals.
                               ...                        
10256                                           Five Stars
10257    Long life, and for some players, a good econom...
10258                                     Good for coated.
10259                                          Taylor Made
10260    These strings are really quite good, but I wou...
Name: summary, Length: 10261, dtype: object

In [28]:
# Apply clean_text to every summary value. Series.map replaces the
# row-by-row df.at loop and does not depend on the index being 0..n-1.
df["summary"] = df["summary"].map(clean_text)

In [30]:
# Apply remove_extra_spaces to every summary value. Series.map replaces the
# row-by-row df.at loop and does not depend on the index being 0..n-1.
df["summary"] = df["summary"].map(remove_extra_spaces)

In [31]:
# Inspect summary after cleaning and whitespace normalisation
df["summary"]

0                                                     good
1                                                     Jake
2                                     It Does The Job Well
3                            GOOD WINDSCREEN FOR THE MONEY
4                     No more pops when I record my vocals
                               ...                        
10256                                           Five Stars
10257    Long life and for some players a good economic...
10258                                      Good for coated
10259                                          Taylor Made
10260    These strings are really quite good but I woul...
Name: summary, Length: 10261, dtype: object

## 不要なカラムを削除する

In [33]:
# Drop the columns that the rest of the notebook does not use
drop_col = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime']
df = df.drop(columns=drop_col)
df

Unnamed: 0,reviewText,overall,summary
0,Not much to write about here but it does exact...,5.0,good
1,The product does exactly as it should and is q...,5.0,Jake
2,The primary job of this device is to block the...,5.0,It Does The Job Well
3,Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY
4,This pop filter is great It looks and performs...,5.0,No more pops when I record my vocals
...,...,...,...
10256,Great just as expected Thank to all,5.0,Five Stars
10257,Ive been thinking about trying the Nanoweb str...,5.0,Long life and for some players a good economic...
10258,I have tried coated strings in the past includ...,4.0,Good for coated
10259,Well MADE by Elixir and DEVELOPED with Taylor ...,4.0,Taylor Made


In [34]:
# Join reviewText and summary with a single half-width space to form `text`.
# Strip the result so rows whose reviewText (missing values were filled
# with '') or summary is empty do not get a stray leading/trailing space.
df["text"] = (df["reviewText"] + ' ' + df["summary"]).str.strip()
df

Unnamed: 0,reviewText,overall,summary,text
0,Not much to write about here but it does exact...,5.0,good,Not much to write about here but it does exact...
1,The product does exactly as it should and is q...,5.0,Jake,The product does exactly as it should and is q...
2,The primary job of this device is to block the...,5.0,It Does The Job Well,The primary job of this device is to block the...
3,Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,Nice windscreen protects my MXL mic and preven...
4,This pop filter is great It looks and performs...,5.0,No more pops when I record my vocals,This pop filter is great It looks and performs...
...,...,...,...,...
10256,Great just as expected Thank to all,5.0,Five Stars,Great just as expected Thank to all Five Stars
10257,Ive been thinking about trying the Nanoweb str...,5.0,Long life and for some players a good economic...,Ive been thinking about trying the Nanoweb str...
10258,I have tried coated strings in the past includ...,4.0,Good for coated,I have tried coated strings in the past includ...
10259,Well MADE by Elixir and DEVELOPED with Taylor ...,4.0,Taylor Made,Well MADE by Elixir and DEVELOPED with Taylor ...


In [35]:
# Drop reviewText and summary now that they are merged into `text`.
# Reassign rather than using inplace=True, matching the earlier column drop.
df = df.drop(columns=["reviewText", "summary"])
df

Unnamed: 0,overall,text
0,5.0,Not much to write about here but it does exact...
1,5.0,The product does exactly as it should and is q...
2,5.0,The primary job of this device is to block the...
3,5.0,Nice windscreen protects my MXL mic and preven...
4,5.0,This pop filter is great It looks and performs...
...,...,...
10256,5.0,Great just as expected Thank to all Five Stars
10257,5.0,Ive been thinking about trying the Nanoweb str...
10258,4.0,I have tried coated strings in the past includ...
10259,4.0,Well MADE by Elixir and DEVELOPED with Taylor ...


In [36]:
# List the distinct rating values in overall
df["overall"].unique()

array([5., 3., 4., 2., 1.])

In [37]:
# 1, 2・・negative
# 3・・・ neutral
# 4, 5をpositiveとする

def assign_label(overall):
  """
  Map an overall rating to a sentiment label.

  Args:
    overall (float): Rating value.

  Returns:
    str: "positive" for ratings of 4.0 or more, "neutral" for exactly 3.0,
      and "negative" for anything else.
  """
  # Handle the single neutral value first; everything else is decided by the
  # 4.0 threshold.
  if overall == 3.0:
    return "neutral"
  return "positive" if overall >= 4.0 else "negative"

In [38]:
# Derive the sentiment label for each row from its overall rating
df["label"] = df["overall"].map(assign_label)

In [40]:
# Drop overall now that label has been derived from it.
# Reassign rather than using inplace=True, matching the earlier column drop.
df = df.drop(columns=["overall"])

In [41]:
# Preview the remaining text/label columns
df.head()

Unnamed: 0,text,label
0,Not much to write about here but it does exact...,positive
1,The product does exactly as it should and is q...,positive
2,The primary job of this device is to block the...,positive
3,Nice windscreen protects my MXL mic and preven...,positive
4,This pop filter is great It looks and performs...,positive


## 訓練データ・検証データ・テストデータを作成

In [42]:
# Split df into training data (80%) and a hold-out set (20%), shuffled with a
# fixed seed and stratified by label
train_data, test_data = train_test_split(df, test_size=0.2, shuffle=True, random_state=0, stratify=df["label"])
train_data

Unnamed: 0,text,label
9519,Ive tried so many other strings over the years...,positive
7527,This cuts strings with no problems It doesnt t...,positive
3784,Probably the best capo I have ever had Its eas...,positive
8492,im not a profesional player but i like to play...,positive
1549,These strings are great for a 12 string I star...,positive
...,...,...
9017,i use thin picks 38 46 and 50 so this pack get...,neutral
9976,I had been using a full boom mic stand at my s...,neutral
1904,I needed a solid throne to replace my IKEA com...,positive
4021,What can I say about a guitar strap It fits th...,positive


In [43]:
# Split the hold-out set in half into validation and test data, again
# stratified by label with a fixed seed
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=0, stratify=test_data["label"])
valid_data

Unnamed: 0,text,label
2163,Ive been using the DAddario strings for about ...,positive
7020,I ordered this product to review for possible ...,positive
9769,This is a good budget uke Im not a ukulele exp...,positive
3558,Useful for connecting mics to a guitar amp or ...,positive
5833,My guitars largely stay at home so they dont r...,positive
...,...,...
6260,Great deal for the price and very sturdily mad...,positive
338,Excellent simple product Fits all of my guitar...,positive
8056,I have 6 amps One of which is a good tube amp ...,positive
6622,I originally bought several of these cables to...,positive


In [44]:
# Show the test split
test_data

Unnamed: 0,text,label
3750,Doesnt color the sound depending on how youre ...,positive
1246,This tool does exactly what it is supposed to ...,positive
538,Whats great about the Dunlop designThey work T...,positive
4760,Unit works fine with my digital piano Its ligh...,positive
5837,Not as good as the headstock tuner I purchased...,neutral
...,...,...
6227,Not sure how to rate this tube Replaced a Chin...,neutral
3131,These are the original perfect fit on my ameri...,positive
7302,I really like this foot controller because you...,positive
4074,fits just fineI have two electric guitars and ...,positive


In [45]:
# List the distinct labels present in the test split
test_data.label.unique()

array(['positive', 'neutral', 'negative'], dtype=object)

In [46]:
# List the distinct labels present in the training split
train_data.label.unique()

array(['positive', 'neutral', 'negative'], dtype=object)

In [47]:
# List the distinct labels present in the validation split
valid_data.label.unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [48]:
# Save each split as a CSV file, without the index column or a header row
train_data.to_csv("train_data.csv", index=False, header=False)
valid_data.to_csv("valid_data.csv", index=False, header=False)
test_data.to_csv("test_data.csv", index=False, header=False)