In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb

### top 300 features

In [2]:
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\feature_importances.pkl"
feature_importances = pd.read_pickle(path)
feature_importances = pd.DataFrame(feature_importances.reset_index())
top300 = feature_importances['index'][:300].tolist()

In [3]:
top300[:10]

['sentiment_lexicon_score_summary',
 'sentiment_lexicon_score_review',
 'summary_embeddings_330',
 'summary_embeddings_272',
 'topic_similarity_35',
 'topic_similarity_6',
 'summary_embeddings_295',
 'review_embeddings_319',
 'summary_embeddings_300',
 'summary_embeddings_57']

### prepare dataset

In [4]:
dataset = pd.read_csv(r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\dataset.csv")
additional_features = pd.read_pickle(r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\additional_features.pkl")
sentiment_lexicon_score = pd.read_pickle(r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\sentiment_lexicon_score.pkl")
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\summary_embeddings.npy"
with open(path, 'rb') as f:
    summary_embeddings = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\review_embeddings.npy"
with open(path, 'rb') as f:
    review_embeddings = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\keyword_embeddings.npy"
with open(path, 'rb') as f:
    keyword_embeddings = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\tfidf.npy"
with open(path, 'rb') as f:
    tfidf = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\tfidf_names.npy"
with open(path, 'rb') as f:
    tfidf_names = np.load(f, allow_pickle=True)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\topic_words.npy"
with open(path, 'rb') as f:
    topic_words = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\word_scores.npy"
with open(path, 'rb') as f:
    word_scores = np.load(f)
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\topic_similarity_scores.npy"
with open(path, 'rb') as f:
    topic_similarity_scores = np.load(f)

In [5]:
dataset['response'] = dataset.sentiment.map({"POS":1, "NEU":0,"NEG":-1})
summary_embeddings_df = pd.DataFrame(summary_embeddings, columns = list(map(lambda value: f"summary_embeddings_{value}", range(summary_embeddings.shape[1]))))
review_embeddings_df = pd.DataFrame(review_embeddings, columns = list(map(lambda value: f"review_embeddings_{value}", range(review_embeddings.shape[1]))))
keyword_embeddings_df = pd.DataFrame(keyword_embeddings, columns = list(map(lambda value: f"keyword_embeddings_{value}", range(keyword_embeddings.shape[1]))))
tfidf_df = pd.DataFrame(tfidf, columns = "tfidf_"+tfidf_names)
tfidf_cols = [col for col in tfidf_df.columns if col[6:] in topic_words.flatten()]
topic_similarity_scores_df = pd.DataFrame(topic_similarity_scores, columns = list(map(lambda value: f"topic_similarity_{value}", range(topic_similarity_scores.shape[1]))))
features = pd.concat([additional_features.merge(sentiment_lexicon_score, on = "reviewId"),summary_embeddings_df,review_embeddings_df,keyword_embeddings_df,tfidf_df[tfidf_cols],topic_similarity_scores_df], axis = 1)
features = features.drop("reviewId", axis = 1)
punc_feature_names = {'punc_!': 'punc_0', 'punc_"': 'punc_1', 'punc_#': 'punc_2', 'punc_$': 'punc_3', 'punc_%': 'punc_4', 'punc_&': 'punc_5', "punc_'": 'punc_6', 'punc_(': 'punc_7', 'punc_)': 'punc_8', 'punc_*': 'punc_9', 'punc_+': 'punc_10', 'punc_,': 'punc_11', 'punc_-': 'punc_12', 'punc_.': 'punc_13', 'punc_/': 'punc_14', 'punc_:': 'punc_15', 'punc_;': 'punc_16', 'punc_<': 'punc_17', 'punc_=': 'punc_18', 'punc_>': 'punc_19', 'punc_?': 'punc_20', 'punc_@': 'punc_21', 'punc_[': 'punc_22', 'punc_\\': 'punc_23', 'punc_]': 'punc_24', 'punc_^': 'punc_25', 'punc__': 'punc_26', 'punc_`': 'punc_27', 'punc_{': 'punc_28', 'punc_|': 'punc_29', 'punc_}': 'punc_30', 'punc_~': 'punc_31'}
features.rename(columns = punc_feature_names, inplace = True)

In [6]:
X = features[top300]
y = dataset['response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
X	

Unnamed: 0,sentiment_lexicon_score_summary,sentiment_lexicon_score_review,summary_embeddings_330,summary_embeddings_272,topic_similarity_35,topic_similarity_6,summary_embeddings_295,review_embeddings_319,summary_embeddings_300,summary_embeddings_57,...,summary_embeddings_130,summary_embeddings_61,summary_embeddings_217,summary_embeddings_269,summary_embeddings_241,keyword_embeddings_294,keyword_embeddings_36,keyword_embeddings_199,summary_embeddings_73,summary_embeddings_1
0,0.045455,0.053025,0.042440,0.008029,0.163936,-0.009753,0.050670,-5.386195e-08,0.017819,-0.019801,...,-0.096059,-0.016484,0.048990,0.005784,-0.095429,0.008493,0.019787,-0.004579,0.062055,-0.064515
1,0.285714,0.086788,0.052484,0.024354,0.136973,0.031459,0.001173,-5.037876e-08,0.019186,-0.035315,...,-0.060381,0.036702,0.037481,0.032187,-0.113303,-0.063210,-0.067895,0.002200,0.129004,0.009287
2,0.000000,0.105263,-0.001403,-0.006016,0.240325,0.086877,0.095317,-5.776995e-08,-0.016994,-0.012606,...,-0.040394,-0.110922,-0.050359,0.041080,-0.049910,-0.040518,-0.045201,-0.053216,0.020363,-0.101141
3,0.666667,0.058333,0.056525,-0.016757,0.178283,0.027040,-0.000302,-6.009654e-08,-0.005012,0.001925,...,0.162581,-0.044337,-0.000521,-0.019479,-0.019074,-0.011995,0.035094,-0.084640,0.043612,0.036232
4,0.400000,0.062500,0.045518,0.023466,0.198467,0.102425,0.025269,-4.335865e-08,0.030786,-0.048000,...,0.002908,-0.069371,-0.060756,0.039053,0.008632,-0.021469,0.094012,0.044318,0.071187,-0.018752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4435,0.000000,1.000000,-0.007891,-0.092562,0.029032,-0.004933,0.022041,-1.448907e-08,-0.111145,-0.056205,...,0.102875,0.003152,-0.041810,0.003943,-0.008710,-0.092448,0.015061,-0.007283,-0.020109,0.028218
4436,0.083333,0.016949,0.031087,-0.027294,0.118289,0.278527,0.030602,-4.413177e-08,-0.006407,-0.016710,...,-0.001544,-0.046202,0.009896,0.003050,-0.023742,0.009109,-0.025144,-0.046671,0.096399,-0.007330
4437,0.400000,0.127660,0.025872,-0.085862,0.222701,0.084716,0.071619,-4.731149e-08,0.144268,-0.058413,...,-0.006275,0.011016,0.072200,0.018580,-0.081254,-0.078389,0.004795,-0.034365,0.075592,0.018067
4438,0.000000,0.139706,0.029618,0.073468,0.110183,0.055998,0.013738,-4.689916e-08,-0.040872,0.026115,...,0.025791,-0.006235,0.047045,-0.062403,0.047743,-0.015030,0.013991,-0.007235,0.122253,-0.049531


### model training

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier

In [68]:
models = ["Naive Bayes","SVM", "Extra Trees", "Random Forest", "LightGBM"]

In [11]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

In [61]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))

In [14]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))

In [64]:
etc_model = ExtraTreesClassifier()
etc_model.fit(X_train, y_train)
y_pred = etc_model.predict(X_test)
accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))

In [65]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
y_pred = rfc_model.predict(X_test)
accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))

In [67]:
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train,y_train, eval_set = (X_test, y_test), early_stopping_rounds = 10)
y_pred = lgb_model.predict(X_test)
accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))



[1]	valid_0's multi_logloss: 1.05006
[2]	valid_0's multi_logloss: 1.00876
[3]	valid_0's multi_logloss: 0.974248
[4]	valid_0's multi_logloss: 0.945331
[5]	valid_0's multi_logloss: 0.921384
[6]	valid_0's multi_logloss: 0.900344
[7]	valid_0's multi_logloss: 0.881848
[8]	valid_0's multi_logloss: 0.862436
[9]	valid_0's multi_logloss: 0.848086
[10]	valid_0's multi_logloss: 0.830149
[11]	valid_0's multi_logloss: 0.816026
[12]	valid_0's multi_logloss: 0.803636
[13]	valid_0's multi_logloss: 0.792211
[14]	valid_0's multi_logloss: 0.782818
[15]	valid_0's multi_logloss: 0.771553
[16]	valid_0's multi_logloss: 0.763342
[17]	valid_0's multi_logloss: 0.756085
[18]	valid_0's multi_logloss: 0.747924
[19]	valid_0's multi_logloss: 0.741794
[20]	valid_0's multi_logloss: 0.736829
[21]	valid_0's multi_logloss: 0.732257
[22]	valid_0's multi_logloss: 0.727119
[23]	valid_0's multi_logloss: 0.721609
[24]	valid_0's multi_logloss: 0.716983
[25]	valid_0's multi_logloss: 0.713638
[26]	valid_0's multi_logloss: 0.7103

In [8]:
params = {
    "n_estimators":1000,
    'num_leaves': 163,
    'bagging_fraction': 0.80,
    'lambda_l1': 0.00019163693019842484,
    'lambda_l2': 0.0027009967266884796,
    'feature_fraction': 0.60,
    'boosting': 'gbdt',
    "learning_rate":0.01,
    "objective": "multiclass",
    "max_depth":  -1,
    "bagging_freq": 5 ,                  # resamples rows at every k-th iteration
    "force_col_wise":  True  ,                 # reduce memory cost
}

In [9]:
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_train,y_train, eval_set = (X_test, y_test), early_stopping_rounds = 10)
y_pred = lgb_model.predict(X_test)
# accuracy_scores.append(metrics.accuracy_score(y_test,y_pred))
# precision_scores.append (metrics.precision_score(y_test,y_pred, average = "macro"))
# recall_scores.append(metrics.recall_score(y_test,y_pred, average = "macro"))
# f1_scores.append (metrics.f1_score(y_test,y_pred, average = "macro"))



[1]	valid_0's multi_logloss: 1.09388
[2]	valid_0's multi_logloss: 1.08866
[3]	valid_0's multi_logloss: 1.08344
[4]	valid_0's multi_logloss: 1.07775
[5]	valid_0's multi_logloss: 1.07243
[6]	valid_0's multi_logloss: 1.06746
[7]	valid_0's multi_logloss: 1.06249
[8]	valid_0's multi_logloss: 1.05804
[9]	valid_0's multi_logloss: 1.05317
[10]	valid_0's multi_logloss: 1.04827
[11]	valid_0's multi_logloss: 1.04349
[12]	valid_0's multi_logloss: 1.03891
[13]	valid_0's multi_logloss: 1.03413
[14]	valid_0's multi_logloss: 1.02944
[15]	valid_0's multi_logloss: 1.02465
[16]	valid_0's multi_logloss: 1.02041
[17]	valid_0's multi_logloss: 1.01607
[18]	valid_0's multi_logloss: 1.01162
[19]	valid_0's multi_logloss: 1.00747
[20]	valid_0's multi_logloss: 1.00369
[21]	valid_0's multi_logloss: 0.999182
[22]	valid_0's multi_logloss: 0.995012
[23]	valid_0's multi_logloss: 0.990903
[24]	valid_0's multi_logloss: 0.986919
[25]	valid_0's multi_logloss: 0.982972
[26]	valid_0's multi_logloss: 0.979094
[27]	valid_0's 

[195]	valid_0's multi_logloss: 0.706859
[196]	valid_0's multi_logloss: 0.706127
[197]	valid_0's multi_logloss: 0.705416
[198]	valid_0's multi_logloss: 0.704591
[199]	valid_0's multi_logloss: 0.703892
[200]	valid_0's multi_logloss: 0.703302
[201]	valid_0's multi_logloss: 0.70259
[202]	valid_0's multi_logloss: 0.702033
[203]	valid_0's multi_logloss: 0.701389
[204]	valid_0's multi_logloss: 0.700742
[205]	valid_0's multi_logloss: 0.70013
[206]	valid_0's multi_logloss: 0.699344
[207]	valid_0's multi_logloss: 0.698686
[208]	valid_0's multi_logloss: 0.697817
[209]	valid_0's multi_logloss: 0.697205
[210]	valid_0's multi_logloss: 0.696597
[211]	valid_0's multi_logloss: 0.695918
[212]	valid_0's multi_logloss: 0.695209
[213]	valid_0's multi_logloss: 0.694554
[214]	valid_0's multi_logloss: 0.693842
[215]	valid_0's multi_logloss: 0.69313
[216]	valid_0's multi_logloss: 0.692691
[217]	valid_0's multi_logloss: 0.69207
[218]	valid_0's multi_logloss: 0.691443
[219]	valid_0's multi_logloss: 0.690887
[220

[401]	valid_0's multi_logloss: 0.634803
[402]	valid_0's multi_logloss: 0.634781
[403]	valid_0's multi_logloss: 0.634636
[404]	valid_0's multi_logloss: 0.634495
[405]	valid_0's multi_logloss: 0.634516
[406]	valid_0's multi_logloss: 0.6344
[407]	valid_0's multi_logloss: 0.634047
[408]	valid_0's multi_logloss: 0.63385
[409]	valid_0's multi_logloss: 0.633572
[410]	valid_0's multi_logloss: 0.633437
[411]	valid_0's multi_logloss: 0.633374
[412]	valid_0's multi_logloss: 0.633134
[413]	valid_0's multi_logloss: 0.632971
[414]	valid_0's multi_logloss: 0.632859
[415]	valid_0's multi_logloss: 0.632724
[416]	valid_0's multi_logloss: 0.632547
[417]	valid_0's multi_logloss: 0.632259
[418]	valid_0's multi_logloss: 0.632131
[419]	valid_0's multi_logloss: 0.631994
[420]	valid_0's multi_logloss: 0.631864
[421]	valid_0's multi_logloss: 0.631715
[422]	valid_0's multi_logloss: 0.631663
[423]	valid_0's multi_logloss: 0.631545
[424]	valid_0's multi_logloss: 0.631554
[425]	valid_0's multi_logloss: 0.631573
[42

In [17]:
path = r"C:\Users\tanch\Documents\NTU\NTU Year 4\Semester 1\CS4022 - Social Media Mining\Assignments\Assignment 2 - Customer Review Mining Project\data\predictions.csv"

In [18]:
pd.DataFrame({"true" : list(map(lambda value: {1: 'POS', 0: 'NEU', -1: 'NEG'}.get(value), y_test)),
"pred" : list(map(lambda value: {1: 'POS', 0: 'NEU', -1: 'NEG'}.get(value), y_pred))})

In [None]:
X = features[["sentiment_lexicon_score_summary", "sentiment_lexicon_score_review"]]
y = dataset['response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [36]:
tmp = pd.DataFrame({"true" : list(map(lambda value: {1: 'POS', 0: 'NEU', -1: 'NEG'}.get(value), y_test)),
"pred" : list(map(lambda value: {1: 'POS', 0: 'NEU', -1: 'NEG'}.get(value), y_pred)),
"index":y_test.index})

In [41]:
tmp[tmp.apply(lambda row: row["true"]=="NEU" and row["pred"]=="POS", axis = 1)]

Unnamed: 0,true,pred,index
27,NEU,POS,1621
36,NEU,POS,2646
69,NEU,POS,2886
71,NEU,POS,1747
85,NEU,POS,721
...,...,...,...
1269,NEU,POS,2389
1302,NEU,POS,2379
1308,NEU,POS,4122
1320,NEU,POS,2992


In [40]:
dataset.iloc[1378].to_dict()

{'summary': "I'm a bit leery",
 'reviewText': 'I\'m a bit leery of this product. I\'ve been a Norton customer since it first started providing security services. The product detail says 25GB of Online Backup, but I receive a 5GB Online Back Up card via Amazon Vine. Where my skepticism comes in is the product is an annual product like all other Norton products. In addition to this product being comparible to services like DropBox, Box, and SkyDrive, I looked through the TOS and the product paperwork that came in the package for what happens to my back ups provided I do NOT renew this service on an annual basis. Do I lose access to what I\'ve already backed up? If this is the case, this could be detrimental and I would suffer a loss of my personal files. I currently have a ticket into Norton to see what happens to my files provided I do not renew. I will post their response when I receive an answer, but I am not going to back up anything until I know the answer. One would thing that this

In [42]:
dataset.iloc[1621].to_dict()

{'summary': 'Easy download - good program',
 'reviewText': "Used H&R Block tax software for the first time last year - it was cheaper than what I had been using.Since they didn't constantly bug me to buy it again this year, and it worked well last year, I purchased again.I'm very glad that I'm not getting constant pleas to buy their software.  And, it loaded last year's info just fine!I haven't efiled, yet - but that went well last year & I'm expecting it to go fine this year, too.Update:  E-filing is great  Having the name of tax professionals behind it is comforting  But, downgraded to 3-stars because I had difficulty understanding some tax issues. Had to resort to the H&R block website (which was very good, and has helpful comments).  But still, if links to IRS rules were available for each item I wouldn't have had to search as much.",
 'asin': 'B004A7Y0UK',
 'brand': 'H&R Block',
 'sentiment': 'NEU',
 'reviewId': 1622,
 'response': 0}