# DSE220 Final Model Analysis Notebook

In [1]:
import gzip
from collections import defaultdict, Counter
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import pca
from sklearn.metrics import mean_absolute_error
%matplotlib inline

### read files

In [2]:
# read original files
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

orig_train_df = getDF('train.json.gz')
orig_test_df = getDF('test_Helpful.json.gz')

In [3]:
# split outOf and nHelpful out as separate columns
train_outOf = np.array([x['outOf'] for x in orig_train_df.helpful.values])
train_nHelpful = np.array([x['nHelpful'] for x in orig_train_df.helpful.values])
orig_train_df['outOf'] = train_outOf
orig_train_df['nHelpful'] = train_nHelpful
orig_train_df['pctHelpful'] = train_nHelpful / train_outOf

test_outOf = np.array([x['outOf'] for x in orig_test_df.helpful.values])
orig_test_df['outOf'] = test_outOf



In [4]:
# read preprocessed csv files
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')
test = pd.read_csv('test.csv')

In [5]:
print(train.shape)
print(val.shape)
print(test.shape)

(50529, 229)
(12486, 229)
(4400, 227)


###### prepare files to use for modeling

In [6]:
X_train_orig, X_val_orig = train_test_split(orig_train_df, test_size=0.20, random_state=55)
print(X_train_orig.shape, X_val_orig.shape, orig_test_df.shape)

X_train_sub_template = X_train_orig.loc[:,['reviewerID','itemID']]
X_val_sub_template = X_val_orig.loc[:,['reviewerID','itemID']]
X_test_sub_template = orig_test_df.loc[:,['reviewerID','itemID']]
print(X_train_sub_template.shape, X_val_sub_template.shape, X_test_sub_template.shape)

(160000, 15) (40000, 15) (14000, 13)
(160000, 2) (40000, 2) (14000, 2)


In [7]:
# prepare train dataset prediction dataframe
X_train_pred = train.loc[:,['reviewerID','itemID','outOf','nHelpful']]

# extract info
y_train_pctHelpful = train.loc[:,'pctHelpful']
y_train_nHelpful = train.loc[:,'nHelpful']
y_train_outOf = train.loc[:,'outOf']

# drop categorical columns
X_train_num = train.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)

In [8]:
# prepare validation dataset predicion dataframe
X_val_pred = val.loc[:,['reviewerID','itemID','outOf','nHelpful']]

# extract info
y_val_pctHelpful = val.loc[:,'pctHelpful']
y_val_nHelpful = val.loc[:,'nHelpful']
y_val_outOf = val.loc[:,'outOf']

# drop categorical columns and answers
X_val_num = val.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)

In [9]:
# prepare testidation dataset predicion dataframe
X_test_pred = test.loc[:,['reviewerID','itemID','outOf']]

# extract info
y_test_outOf = test.loc[:,'outOf']

# drop categorical columns
X_test_num = test.drop(labels=['reviewerID','itemID'], axis=1)

In [10]:
# check shapes
print(orig_train_df.shape, orig_test_df.shape)
print(X_train_sub_template.shape, X_val_sub_template.shape, X_test_sub_template.shape)

print(train.shape, val.shape, test.shape)
print(X_train_num.shape, X_val_num.shape, X_test_num.shape)
print(X_train_pred.shape, X_val_pred.shape, X_test_pred.shape)
print(y_train_pctHelpful.shape, y_train_nHelpful.shape, y_train_outOf.shape)
print(y_val_pctHelpful.shape, y_val_nHelpful.shape, y_val_outOf.shape)
print(y_test_outOf.shape)

(200000, 15) (14000, 13)
(160000, 2) (40000, 2) (14000, 2)
(50529, 229) (12486, 229) (4400, 227)
(50529, 225) (12486, 225) (4400, 225)
(50529, 4) (12486, 4) (4400, 3)
(50529,) (50529,) (50529,)
(12486,) (12486,) (12486,)
(4400,)


##### ElasticNet regression model

In [11]:
from sklearn import linear_model

In [12]:
best_err = 99
best_a = 0.1
for a in [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,1.0]:
    el = linear_model.ElasticNet(alpha=a, l1_ratio=0.01, fit_intercept=True)

    el.fit(X_train_num, y_train_pctHelpful)

    val_pred_pctHelpful = el.predict(X_val_num)

    val_pred_pctHelpful[val_pred_pctHelpful < 0.0] = 0.0
    val_pred_pctHelpful[val_pred_pctHelpful > 1.0] = 1.0

    # convert predictions to nHelpful predictions
    val_pred_nHelpful = val_pred_pctHelpful * y_val_outOf.values

    # training errors
    err = (mean_absolute_error(val_pred_nHelpful.round(0), y_val_nHelpful.values))
    if err < best_err:
        best_err = err
        best_a = a

print(best_a, best_err)

0.25 0.641198141919


###### split regression experiment

In [13]:
best_mae = 99 #0.606599391318 #0.604629841852 # 0.604859041944
best_thresh = 15 #23 #22

for thresh in [15, 18, 19, 20, 21,22,23,24,25,26,27,28,29,35,40,50]:
    y_val_pctHelpful = val.loc[:,'pctHelpful']
    y_val_nHelpful = val.loc[:,'nHelpful']
    y_val_outOf = val.loc[:,'outOf']
    
    # split data
    train_1 = train.loc[train['outOf']<thresh,:]
    train_2 = train.loc[train['outOf']>=thresh,:]
    val_1 = val.loc[val['outOf']<thresh,:]
    val_2 = val.loc[val['outOf']>=thresh,:]
    
    # extract info needed to evaluate mae
    y_train_1_pctHelpful = train_1.loc[:,'pctHelpful']
    y_train_1_nHelpful = train_1.loc[:,'nHelpful']
    y_train_1_outOf = train_1.loc[:,'outOf']
    y_train_2_pctHelpful = train_2.loc[:,'pctHelpful']
    y_train_2_nHelpful = train_2.loc[:,'nHelpful']
    y_train_2_outOf = train_2.loc[:,'outOf']

    y_val_1_pctHelpful = val_1.loc[:,'pctHelpful']
    y_val_1_nHelpful = val_1.loc[:,'nHelpful']
    y_val_1_outOf = val_1.loc[:,'outOf']
    y_val_2_pctHelpful = val_2.loc[:,'pctHelpful']
    y_val_2_nHelpful = val_2.loc[:,'nHelpful']
    y_val_2_outOf = val_2.loc[:,'outOf']

    # drop categorical columns
    X_train_num_1 = train_1.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)
    X_train_num_2 = train_2.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)

    X_val_num_1 = val_1.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)
    X_val_num_2 = val_2.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)
    
    # regression models
    el1 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
    el1.fit(X_train_num_1, y_train_1_pctHelpful)
    val_1_pred_pctHelpful = el1.predict(X_val_num_1)
    val_1_pred_pctHelpful[val_1_pred_pctHelpful < 0.0] = 0.0
    val_1_pred_pctHelpful[val_1_pred_pctHelpful > 1.0] = 1.0

    el2 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
    el2.fit(X_train_num_2, y_train_2_pctHelpful)
    val_2_pred_pctHelpful = el1.predict(X_val_num_2)
    val_2_pred_pctHelpful[val_2_pred_pctHelpful < 0.0] = 0.0
    val_2_pred_pctHelpful[val_2_pred_pctHelpful > 1.0] = 1.0
    
    # convert predictions to nHelpful predictions
    #train_pred_nHelpful = train_pred_pctHelpful * y_train_outOf.values
    val_1_pred_nHelpful = val_1_pred_pctHelpful * y_val_1_outOf.values
    val_2_pred_nHelpful = val_2_pred_pctHelpful * y_val_2_outOf.values
    
    # add predictions back to val_dfs
    val_1['prediction'] = val_1_pred_nHelpful.round(0)
    val_2['prediction'] = val_2_pred_nHelpful.round(0)
    
    # concatenate results
    val_results = pd.concat([val_1, val_2])

    # val errors
    err = mean_absolute_error(val_results.prediction, val_results.nHelpful)
    err1 = mean_absolute_error(val_1.prediction, val_1.nHelpful)
    err2 = mean_absolute_error(val_2.prediction, val_2.nHelpful)
    if err <= best_mae:
        best_mae = err
        best_thresh = thresh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
print(best_thresh, best_mae)

21 0.606679481019


###### regression model ablation experiment

In [15]:
thresh = best_thresh

# split data according to best split of outOf
y_val_pctHelpful = val.loc[:,'pctHelpful']
y_val_nHelpful = val.loc[:,'nHelpful']
y_val_outOf = val.loc[:,'outOf']

# split data
train_1 = train.loc[train['outOf']<thresh,:]
train_2 = train.loc[train['outOf']>=thresh,:]
val_1 = val.loc[val['outOf']<thresh,:]
val_2 = val.loc[val['outOf']>=thresh,:]

# extract info needed to evaluate mae
y_train_1_pctHelpful = train_1.loc[:,'pctHelpful']
y_train_1_nHelpful = train_1.loc[:,'nHelpful']
y_train_1_outOf = train_1.loc[:,'outOf']
y_train_2_pctHelpful = train_2.loc[:,'pctHelpful']
y_train_2_nHelpful = train_2.loc[:,'nHelpful']
y_train_2_outOf = train_2.loc[:,'outOf']

y_val_1_pctHelpful = val_1.loc[:,'pctHelpful']
y_val_1_nHelpful = val_1.loc[:,'nHelpful']
y_val_1_outOf = val_1.loc[:,'outOf']
y_val_2_pctHelpful = val_2.loc[:,'pctHelpful']
y_val_2_nHelpful = val_2.loc[:,'nHelpful']
y_val_2_outOf = val_2.loc[:,'outOf']

# drop categorical columns
X_train_num_1 = train_1.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)
X_train_num_2 = train_2.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)

X_val_num_1 = val_1.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)
X_val_num_2 = val_2.drop(labels=['reviewerID','itemID','nHelpful','pctHelpful'], axis=1)

In [16]:
# get list of features in our dataset
features = X_train_num.columns.values

# change below to run on different subsets of data split by 'outOf'
X_train_temp = X_train_num_1
X_val_temp = X_val_num_1

y_train_pctHelpful = y_train_1_pctHelpful
y_val_pctHelpful = y_val_1_pctHelpful
y_val_nHelpful = y_val_1_nHelpful
y_val_outOf = y_val_1_outOf

cur_best_mae = 100.0
removed_features_list_1 = []
best_maes = []

for i in range(len(features)):
    print(i)
    mae = []
    
    feat = X_train_temp.columns.values
    for f in feat:
        X_train_abl = X_train_temp.drop(labels=[f],axis=1)
        X_val_abl = X_val_temp.drop(labels=[f],axis=1)
        
        # build regression model without feature
        el1 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
        el1.fit(X_train_abl, y_train_pctHelpful)
        val_pred_pctHelpful = el1.predict(X_val_abl)
        val_pred_pctHelpful[val_pred_pctHelpful < 0.0] = 0.0
        val_pred_pctHelpful[val_pred_pctHelpful > 1.0] = 1.0

        # convert predictions to nHelpful predictions
        val_pred_nHelpful = val_pred_pctHelpful * y_val_outOf.values
        
        # val errors
        mae.append(mean_absolute_error(val_pred_nHelpful.round(0), y_val_nHelpful.values))

    # find best feature to remove
    min_mae = np.min(np.array(mae))
    min_mae_feat = feat[np.where(mae == min_mae)[0]][0]
    
    # if removing feature improves current best error
    if min_mae <= cur_best_mae:
        # update current best error
        cur_best_mae = min_mae
        best_maes.append(min_mae)
        
        # add feature to removed features list
        removed_features_list_1.append(min_mae_feat)
        
        # update X_train_temp and X_val_temp by dropping feature
        X_train_temp = X_train_temp.drop(labels=[min_mae_feat],axis=1)
        X_val_temp = X_val_temp.drop(labels=[min_mae_feat],axis=1)
    else:
        break

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213


In [17]:
# write removed features to csv file for reading in later
rf1 = pd.DataFrame(removed_features_list_1)
rf1.to_csv('removed_features_list_1.csv')

print(cur_best_mae)
removed_features_list_1

0.489175342919


['avgUserHelpfulness',
 'itemRatedReviews',
 'category_Men',
 'reviewTextNumWords',
 'unixReviewTime',
 'category_Plus-Size',
 'category_Athletic',
 'pctUserReviewsRated',
 'pctItemReviewsRated',
 'numCategories',
 'category_Clothing, Shoes & Jewelry',
 'category_Clothing',
 'category_Shoes',
 'category_Shoes & Accessories: International Shipping Available',
 'category_Novelty, Costumes & More',
 'category_Novelty',
 'category_Lingerie, Sleep & Lounge',
 'category_Jewelry',
 'category_Intimates',
 'category_Accessories',
 'category_Sports & Outdoors',
 'category_Comfort Shoes',
 'category_Tops & Tees',
 'category_Active',
 'category_Bras',
 'category_Jewelry: International Shipping Available',
 'category_Sandals',
 'category_C',
 'category_Big & Tall',
 'category_Boots',
 'category_Dresses',
 'category_Everyday Bras',
 'category_Casual',
 'category_Watches',
 'category_Wrist Watches',
 'category_Fashion',
 'category_Boot Shop',
 'category_Earrings',
 'category_Blouses & Button-Down Shi

In [18]:
# get list of features in our dataset
features = X_train_num.columns.values

# change below to run on different subsets of data split by 'outOf'
X_train_temp = X_train_num_2
X_val_temp = X_val_num_2

y_train_pctHelpful = y_train_2_pctHelpful
y_val_pctHelpful = y_val_2_pctHelpful
y_val_nHelpful = y_val_2_nHelpful
y_val_outOf = y_val_2_outOf

cur_best_mae = 100.0
removed_features_list_2 = []
best_maes = []

for i in range(len(features)):
    print(i)
    mae = []
    
    feat = X_train_temp.columns.values
    for f in feat:
        X_train_abl = X_train_temp.drop(labels=[f],axis=1)
        X_val_abl = X_val_temp.drop(labels=[f],axis=1)
        
        # build regression model without feature
        el1 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
        el1.fit(X_train_abl, y_train_pctHelpful)
        val_pred_pctHelpful = el1.predict(X_val_abl)
        val_pred_pctHelpful[val_pred_pctHelpful < 0.0] = 0.0
        val_pred_pctHelpful[val_pred_pctHelpful > 1.0] = 1.0

        # convert predictions to nHelpful predictions
        val_pred_nHelpful = val_pred_pctHelpful * y_val_outOf.values
        
        # val errors
        mae.append(mean_absolute_error(val_pred_nHelpful.round(0), y_val_nHelpful.values))

    # find best feature to remove
    min_mae = np.min(np.array(mae))
    min_mae_feat = feat[np.where(mae == min_mae)[0]][0]
    
    # if removing feature improves current best error
    if min_mae <= cur_best_mae:
        # update current best error
        cur_best_mae = min_mae
        best_maes.append(min_mae)
        
        # add feature to removed features list
        removed_features_list_2.append(min_mae_feat)
        
        # update X_train_temp and X_val_temp by dropping feature
        X_train_temp = X_train_temp.drop(labels=[min_mae_feat],axis=1)
        X_val_temp = X_val_temp.drop(labels=[min_mae_feat],axis=1)
    else:
        break

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216


In [19]:
# write list to csv
rf2 = pd.DataFrame(removed_features_list_2)
rf2.to_csv('removed_features_list_2.csv')

print(cur_best_mae)
removed_features_list_2

3.3515625


['reviewTextFKscore',
 'reviewTextFscore',
 'avgItemHelpfulness',
 'reviewTextNumSents',
 'reviewTextNumWords',
 'category_Men',
 'userTotReviews',
 'userRatedReviews',
 'pctUserReviewsRated',
 'pctItemReviewsRated',
 'numCategories',
 'category_Clothing, Shoes & Jewelry',
 'category_Clothing',
 'category_Shoes',
 'category_Shoes & Accessories: International Shipping Available',
 'category_Novelty, Costumes & More',
 'category_Novelty',
 'category_Petite',
 'category_Lingerie, Sleep & Lounge',
 'category_Jewelry',
 'category_Intimates',
 'category_Accessories',
 'category_Sports & Outdoors',
 'category_Comfort Shoes',
 'category_Tops & Tees',
 'category_Plus-Size',
 'category_Active',
 'category_Athletic',
 'category_Bras',
 'category_Jewelry: International Shipping Available',
 'category_Sandals',
 'category_C',
 'category_Big & Tall',
 'category_Boots',
 'category_Dresses',
 'category_Everyday Bras',
 'category_Casual',
 'category_Watches',
 'category_Wrist Watches',
 'category_Fashi

###### create split regression model after removing ablated features

In [20]:
removed_feat_1 = removed_features_list_1
removed_feat_2 = removed_features_list_2

###### Regression model for split data with removed features

In [21]:
# remove ablated features from each split of data
X_train_abl_1 = train_1.drop(labels=removed_feat_1, axis=1)
X_val_abl_1 = val_1.drop(labels=removed_feat_1, axis=1)
X_train_abl_2 = train_2.drop(labels=removed_feat_2, axis=1)
X_val_abl_2 = val_2.drop(labels=removed_feat_2, axis=1)

# drop categorical inputs
X_train_abl_1_num = X_train_abl_1.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)
X_train_abl_2_num = X_train_abl_2.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)
X_val_abl_1_num = X_val_abl_1.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)
X_val_abl_2_num = X_val_abl_2.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)

# fit elasticnet model
el1 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
el2 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)

el1.fit(X_train_abl_1_num, y_train_1_pctHelpful)
el2.fit(X_train_abl_2_num, y_train_2_pctHelpful)

val_1_pred_pctHelpful = el1.predict(X_val_abl_1_num)
val_1_pred_pctHelpful[val_pred_pctHelpful < 0.0] = 0.0
val_1_pred_pctHelpful[val_pred_pctHelpful > 1.0] = 1.0

val_2_pred_pctHelpful = el2.predict(X_val_abl_2_num)
val_2_pred_pctHelpful[val_pred_pctHelpful < 0.0] = 0.0
val_2_pred_pctHelpful[val_pred_pctHelpful > 1.0] = 1.0

# convert predictions to nHelpful predictions
val_1_pred_nHelpful = val_1_pred_pctHelpful * y_val_1_outOf.values
val_2_pred_nHelpful = val_2_pred_pctHelpful * y_val_2_outOf.values



In [22]:
val_1_pred = val_1.loc[:,['reviewerID','itemID','outOf','nHelpful']]
val_1_pred['prediction'] = val_1_pred_nHelpful

val_2_pred = val_2.loc[:,['reviewerID','itemID','outOf','nHelpful']]
val_2_pred['prediction'] = val_2_pred_nHelpful

val_pred = pd.concat([val_1_pred, val_2_pred])

val_pred.shape

(12486, 5)

In [23]:
X_val_submission = pd.merge(X_val_sub_template, val_pred, how='left', on=['reviewerID','itemID'])

In [24]:
X_val_submission.head()

Unnamed: 0,reviewerID,itemID,outOf,nHelpful,prediction
0,U232545388,I919490106,1.0,1.0,0.770983
1,U629278857,I317532014,,,
2,U772466163,I022571677,,,
3,U649013417,I313758331,,,
4,U709855261,I128829618,1.0,1.0,0.83408


In [25]:
X_val_submission.fillna(0,inplace=True)

In [41]:
print (mean_absolute_error(X_val_submission.nHelpful.values, X_val_submission.prediction.values))
print (mean_absolute_error(X_val_submission.nHelpful.values, X_val_submission.prediction.values.round(0)))

0.204213157693
0.1807


###### train regression model using combined training + validation data

In [27]:
# remove outliers
outlier_train_idx_t1 = train_1.loc[(train_1.outOf > 80) & (train_1.pctHelpful < 0.2),:].index.values
train_1 = train_1.drop(outlier_train_idx_t1)
outlier_train_idx_t2 = train_2.loc[(train_2.outOf > 80) & (train_2.pctHelpful < 0.2),:].index.values
train_2 = train_2.drop(outlier_train_idx_t2)

outlier_val_idx_v1 = val_1.loc[(val_1.outOf > 80) & (val_1.pctHelpful < 0.2),:].index.values
val_1 = val_1.drop(outlier_val_idx_v1)
outlier_val_idx_v2 = val_2.loc[(val_2.outOf > 80) & (val_2.pctHelpful < 0.2),:].index.values
val_2 = val_2.drop(outlier_val_idx_v2)

# extract info needed for prediction
y_train_1_pctHelpful = train_1.loc[:,'pctHelpful']
y_train_1_nHelpful = train_1.loc[:,'nHelpful']
y_train_1_outOf = train_1.loc[:,'outOf']
y_train_2_pctHelpful = train_2.loc[:,'pctHelpful']
y_train_2_nHelpful = train_2.loc[:,'nHelpful']
y_train_2_outOf = train_2.loc[:,'outOf']

y_val_1_pctHelpful = val_1.loc[:,'pctHelpful']
y_val_1_nHelpful = val_1.loc[:,'nHelpful']
y_val_1_outOf = val_1.loc[:,'outOf']
y_val_2_pctHelpful = val_2.loc[:,'pctHelpful']
y_val_2_nHelpful = val_2.loc[:,'nHelpful']
y_val_2_outOf = val_2.loc[:,'outOf']

# remove ablated features from each split of data
X_train_abl_1 = train_1.drop(labels=removed_feat_1, axis=1)
X_val_abl_1 = val_1.drop(labels=removed_feat_1, axis=1)
X_train_abl_2 = train_2.drop(labels=removed_feat_2, axis=1)
X_val_abl_2 = val_2.drop(labels=removed_feat_2, axis=1)

# train model using all data
X_both_1 = pd.concat([X_train_abl_1, X_val_abl_1])
X_both_2 = pd.concat([X_train_abl_2, X_val_abl_2])

y_both_1_pctHelpful = np.concatenate((y_train_1_pctHelpful,y_val_1_pctHelpful))
y_both_2_pctHelpful = np.concatenate((y_train_2_pctHelpful,y_val_2_pctHelpful))

# drop categorical features
X_both_1_num = X_both_1.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)
X_both_2_num = X_both_2.drop(labels=['itemID','reviewerID','nHelpful','pctHelpful'], axis=1)

# fit elasticnet model
el1 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)
el2 = linear_model.ElasticNet(alpha=best_a, l1_ratio=0.01, fit_intercept=True)

el1.fit(X_both_1_num, y_both_1_pctHelpful)
el2.fit(X_both_2_num, y_both_2_pctHelpful)

# split test data
test_1 = test.loc[test['outOf']<thresh,:]
test_2 = test.loc[test['outOf']>=thresh,:]

# extract data needed for prediction
y_test_1_outOf = test_1.loc[:,'outOf']
y_test_2_outOf = test_2.loc[:,'outOf']

# remove ablated features
test_1 = test_1.drop(labels=removed_feat_1, axis=1)
test_2 = test_2.drop(labels=removed_feat_2, axis=1)

# drop categorical columns
X_test_num_1 = test_1.drop(labels=['reviewerID','itemID'], axis=1)
X_test_num_2 = test_2.drop(labels=['reviewerID','itemID'], axis=1)

# make predictions on test data
test_1_pred_pctHelpful = el1.predict(X_test_num_1)
test_1_pred_pctHelpful[test_1_pred_pctHelpful < 0.0] = 0.0
test_1_pred_pctHelpful[test_1_pred_pctHelpful > 1.0] = 1.0

test_2_pred_pctHelpful = el2.predict(X_test_num_2)
test_2_pred_pctHelpful[test_2_pred_pctHelpful < 0.0] = 0.0
test_2_pred_pctHelpful[test_2_pred_pctHelpful > 1.0] = 1.0

# convert predictions to nHelpful predictions
test_1_pred_nHelpful = test_1_pred_pctHelpful * y_test_1_outOf.values
test_2_pred_nHelpful = test_2_pred_pctHelpful * y_test_2_outOf.values

test_1_pred = test_1.loc[:,['reviewerID','itemID','outOf']]
test_1_pred['prediction'] = test_1_pred_nHelpful.round(0)

test_2_pred = test_2.loc[:,['reviewerID','itemID','outOf']]
test_2_pred['prediction'] = test_2_pred_nHelpful.round(0)

X_test_pred = pd.concat([test_1_pred, test_2_pred])

X_test_reg_submission_el = pd.merge(X_test_sub_template, X_test_pred, how='left', on=['reviewerID','itemID'])
X_test_reg_submission_el.fillna(0, inplace=True)

In [28]:
# write elasticnet regression results to file
predictions = open("predictions_Helpful_el0611.txt", 'w')
for l in open("pairs_Helpful.txt"):
  if l.startswith("userID"):
    #header
    predictions.write(l)
    continue
  u,i,outOf = l.strip().split('-')
  outOf = int(outOf)
  
  pred = X_test_reg_submission_el.loc[(X_test_reg_submission_el['reviewerID']==u) &
                                  (X_test_reg_submission_el['itemID']==i)].prediction.values[0]
  predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(pred) + '\n')

predictions.close()

# Scratch area

In [34]:
print(X_train_abl_1_num.shape, X_train_abl_2_num.shape, X_train_num.shape)

(49046, 12) (1483, 9) (50529, 225)


In [29]:
X_train_abl_1.head()

Unnamed: 0,itemID,reviewerID,rating,price,outOf,nHelpful,pctHelpful,userTotReviews,itemTotReviews,avgItemHelpfulness,userRatedReviews,category_Women,category_Petite,reviewTextNumSents,reviewTextFscore,reviewTextFKscore
0,I500768895,U021132998,4.0,-1.0,4,4,1.0,17,13,0.8,9,1.0,0.0,4,76.005682,6.757121
1,I243311479,U482369528,4.0,9.99,2,2,1.0,3,7,1.0,2,1.0,0.0,4,70.63,4.65
2,I496891609,U737837172,5.0,19.99,7,7,1.0,7,23,0.958333,5,1.0,0.0,4,91.434643,3.114524
3,I725702722,U866103305,5.0,-1.0,2,1,0.5,1,5,0.5,1,0.0,0.0,7,84.032353,4.555126
4,I716842709,U701505038,5.0,10.0,1,1,1.0,4,32,0.5,3,0.0,0.0,3,83.598333,6.070741


In [30]:
X_train_abl_2.head()

Unnamed: 0,itemID,reviewerID,rating,unixReviewTime,price,outOf,nHelpful,pctHelpful,itemTotReviews,avgUserHelpfulness,itemRatedReviews,category_Women,summaryTextNumWords
43,I134326011,U100860173,5.0,1231632000,53.04,21,19,0.904762,4,0.891534,2,0.0,4
53,I486821863,U071062576,3.0,1330992000,-1.0,25,24,0.96,12,0.970149,11,1.0,1
56,I133591235,U233308558,2.0,1390521600,-1.0,23,22,0.956522,18,0.909091,8,1.0,6
80,I399679500,U373444291,4.0,1260835200,39.48,51,45,0.882353,10,0.882353,3,0.0,21
136,I360662505,U248389958,3.0,1388016000,-1.0,40,39,0.975,12,0.98913,10,1.0,5


In [31]:
X_train_abl_1_num.head()

Unnamed: 0,rating,price,outOf,userTotReviews,itemTotReviews,avgItemHelpfulness,userRatedReviews,category_Women,category_Petite,reviewTextNumSents,reviewTextFscore,reviewTextFKscore
0,4.0,-1.0,4,17,13,0.8,9,1.0,0.0,4,76.005682,6.757121
1,4.0,9.99,2,3,7,1.0,2,1.0,0.0,4,70.63,4.65
2,5.0,19.99,7,7,23,0.958333,5,1.0,0.0,4,91.434643,3.114524
3,5.0,-1.0,2,1,5,0.5,1,0.0,0.0,7,84.032353,4.555126
4,5.0,10.0,1,4,32,0.5,3,0.0,0.0,3,83.598333,6.070741


In [32]:
X_train_abl_2_num.head()

Unnamed: 0,rating,unixReviewTime,price,outOf,itemTotReviews,avgUserHelpfulness,itemRatedReviews,category_Women,summaryTextNumWords
43,5.0,1231632000,53.04,21,4,0.891534,2,0.0,4
53,3.0,1330992000,-1.0,25,12,0.970149,11,1.0,1
56,2.0,1390521600,-1.0,23,18,0.909091,8,1.0,6
80,4.0,1260835200,39.48,51,10,0.882353,3,0.0,21
136,3.0,1388016000,-1.0,40,12,0.98913,10,1.0,5


In [38]:
print("final features for model fit to low 'outOf' values:")
for c in X_train_abl_1_num.columns:
    print(c)

final features for model fit to low 'outOf' values:
rating
price
outOf
userTotReviews
itemTotReviews
avgItemHelpfulness
userRatedReviews
category_Women
category_Petite
reviewTextNumSents
reviewTextFscore
reviewTextFKscore


In [39]:
print("final features for model fit to high 'outOf' values:")
for c in X_train_abl_2_num.columns:
    print(c)

final features for model fit to high 'outOf' values:
rating
unixReviewTime
price
outOf
itemTotReviews
avgUserHelpfulness
itemRatedReviews
category_Women
summaryTextNumWords
