In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import nltk

In [2]:
# 1. 資料前處理
## (a)讀取csv檔僅保留"text"、"stars"兩個欄位
df = pd.read_csv('./archive/yelp.csv')
cols = ['text','stars']
df = df[cols]

In [3]:
## 將stars欄位內值大於等於4的轉成1，其餘轉成0
df['stars'] = [0 if star < 4 else 1 for star in df['stars']]
df

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,1
1,I have no idea why some people give bad review...,1
2,love the gyro plate. Rice is so good and I als...,1
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1
4,General Manager Scott Petello is a good egg!!!...,1
...,...,...
9995,First visit...Had lunch here today - used my G...,0
9996,Should be called house of deliciousness!\n\nI ...,1
9997,I recently visited Olive and Ivy for business ...,1
9998,My nephew just moved to Scottsdale recently so...,0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
value_list = [row[0] for row in df.itertuples(index=False, name=None)]
cv = CountVectorizer()
X_train = cv.fit_transform(value_list)
X_train.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [5]:
# 資料前處理 
# 1.將text欄位內的文字利用分割符號切割
# 2.去除停頓詞 stop_words
# 3.更新成詞幹
import nltk
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
def clean(text):
    lemmas_sent = []
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words(
        'english')  + ['super', 'duper', 'place']  # 自訂 word2Vec
    tokens = nltk.word_tokenize(text)
    lower = [word.lower() for word in tokens]
    no_stopwords = [word for word in lower if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    taged_sent = nltk.pos_tag(no_alpha)
    for tag in taged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmas_sent.append(wn.lemmatize(tag[0], pos=wordnet_pos))
    clean_text = lemmas_sent    
    return clean_text

In [6]:
df_clean = df
for i, line in enumerate(df['text']):
    df_clean['text'][i] = ' '.join(clean(line))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['text'][i] = ' '.join(clean(line))


In [7]:
print(df_clean)

                                                   text  stars
0     wife take birthday breakfast excellent weather...      1
1     idea people give bad review go show please eve...      1
2     love gyro plate rice good also dig candy selec...      1
3     rosie dakota love chaparral dog park convenien...      1
4     general manager scott petello good egg go deta...      1
...                                                 ...    ...
9995  first visit lunch today use groupon order brus...      0
9996  call house deliciousness could go item item bl...      1
9997  recently visit olive ivy business last week vi...      1
9998  nephew move scottsdale recently bunch friend b...      0
9999  location star average think arizona really fan...      1

[10000 rows x 2 columns]


In [17]:
# # 文字探勘前處理，將文字轉換成向量 : tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer


def vectorize(data, tfidf_vect_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

tfidf_vect = TfidfVectorizer()  
tfidf_vect_fit = tfidf_vect.fit(df_clean['text'])
X_tfidf = vectorize(df_clean["text"], tfidf_vect_fit)
y = df["stars"]

In [12]:
import gensim
tokenized_tweet = df_clean.text.apply(lambda x: x.split())  # tokenizing
model_w2v = gensim.models.Word2Vec(
    tokenized_tweet,
    vector_size=200,  # desired no. of features/independent variables
    window=5,  # context window size
    min_count=2,  # Ignores all words with total frequency lower than 2.
    sg=1,  # 1 for skip-gram model
    hs=0,
    negative=10,  # for negative sampling
    workers=32,  # no.of cores
    seed=34
)

model_w2v.train(tokenized_tweet, total_examples=len(
    df_clean.text), epochs=20)




(11570157, 12676700)

In [13]:
model_w2v.wv.most_similar(positive="burger")

[('delux', 0.5371038913726807),
 ('rally', 0.5282623171806335),
 ('smashfries', 0.5274894833564758),
 ('unremarkable', 0.5255366563796997),
 ('charburger', 0.5223527550697327),
 ('shoestring', 0.5182007551193237),
 ('patty', 0.5118928551673889),
 ('texan', 0.5118187665939331),
 ('watercress', 0.5062969923019409),
 ('blu', 0.4975314140319824)]

In [14]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec


In [15]:
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i, :] = word_vector(tokenized_tweet[i], 200)
X_word2vec_df = pd.DataFrame(wordvec_arrays)


In [22]:
print(X_train)

  (0, 17130)	2
  (0, 28506)	1
  (0, 26448)	1
  (0, 16151)	1
  (0, 12364)	2
  (0, 17983)	3
  (0, 3059)	1
  (0, 10407)	1
  (0, 3643)	1
  (0, 1453)	8
  (0, 13790)	9
  (0, 28165)	8
  (0, 9339)	3
  (0, 26035)	10
  (0, 28264)	1
  (0, 19025)	1
  (0, 28411)	1
  (0, 15613)	2
  (0, 23477)	1
  (0, 18269)	1
  (0, 18342)	1
  (0, 26049)	4
  (0, 11710)	1
  (0, 1429)	1
  (0, 668)	1
  :	:
  (9999, 9776)	1
  (9999, 9700)	1
  (9999, 18088)	1
  (9999, 26468)	1
  (9999, 5113)	1
  (9999, 2196)	1
  (9999, 281)	1
  (9999, 16519)	1
  (9999, 15182)	2
  (9999, 13793)	1
  (9999, 21936)	1
  (9999, 13811)	1
  (9999, 24585)	1
  (9999, 15277)	1
  (9999, 10089)	1
  (9999, 3329)	1
  (9999, 16570)	1
  (9999, 291)	1
  (9999, 24311)	1
  (9999, 3865)	1
  (9999, 18711)	1
  (9999, 863)	1
  (9999, 14632)	1
  (9999, 8386)	1
  (9999, 1303)	1


In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
tokenizer = Tokenizer(num_words = 3800)
tokenizer.fit_on_texts(df['text'])

X_keras = tokenizer.texts_to_sequences(df['text'])
X_keras = sequence.pad_sequences(X_keras, maxlen=380)
y = df["stars"]


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
rf = RandomForestClassifier()


def K_fold_CV(k, X, y):
    #設定subset size 即data長度/k
    #設定Accuracy初始值
    X_train_folds = np.array_split(X, k)
    y_train_folds = np.array_split(y, k)
    Accuracy = 0
    Precision = 0
    Recall = 0
    F1score = 0
    for i in range(k):
        # 設定testing set與training set的資料起始點與結束點
        # 例如資料有100筆，testing set在本次iteration取第1到25筆，則training set為第26到100筆；下次testing set為26~50，training set為1~25 & 51~100
        X_test = X_train_folds[i]
        y_test = y_train_folds[i]
        X_train_k = np.concatenate(X_train_folds[:i] + X_train_folds[i+1:])
        y_train_k = np.concatenate(y_train_folds[:i] + y_train_folds[i+1:])
        # 利用training set建立模型
        rf.fit(X_train_k, y_train_k)
        y_val_pred = rf.predict(X_test)
        # testing set計算出Accuracy累加
        eachAccuracy = accuracy_score(y_test, y_val_pred)
        Accuracy += eachAccuracy
        eachPrecision = precision_score(y_test,y_val_pred)
        Precision += eachPrecision
        eachRecall = recall_score(y_test, y_val_pred)
        Recall += eachRecall
        eachF1score = f1_score(y_test,y_val_pred)
        F1score += eachF1score
    return Accuracy/k, F1score/k, Precision/k, Recall/k



accuracy_idf, f1score_idf, precision_idf, recall_idf = K_fold_CV(4,X_tfidf, y)
print("TF_IDF :4-fold_CV accuracy: {}".format(accuracy_idf))
print("TF_IDF :4-fold_CV f1score: {}".format(f1score_idf))
print("TF_IDF :4-fold_CV precision: {}".format(precision_idf))
print("TF_IDF :4-fold_CV recall: {}".format(recall_idf))
# accuracy_word2vec,f1score_word2vec, precision_word2vec, recall_word2vec = K_fold_CV(4,X_word2vec_df, y)
# print("word2Vec : 4-fold_CV accuracy: {}".format(accuracy_word2vec))
# print("word2Vec : 4-fold_CV f1score: {}".format(f1score_word2vec))
# print("word2Vec : 4-fold_CV precision: {}".format(precision_word2vec))
# print("word2Vec : 4-fold_CV recall: {}".format(recall_word2vec))
# print("word2Vec : 4-fold_CV accuracy: {}".format(accuracy_keras))/
# accuracy_keras = K_fold_CV(4,X_keras,y)
