In [1]:
import gzip 
from collections import defaultdict
import string 
import random 
from sklearn import linear_model

In [4]:
# Gzipped, tab-separated review dump; the path is relative to the notebook's working directory.
path = 'amazon_reviews_us_Gift_Card_v1_00.tsv.gz'

In [5]:
# 'rt' opens the gzip stream in text mode (decoded as UTF-8), so iterating yields str lines.
file = gzip.open(path, 'rt', encoding='utf8')

In [6]:
# Consume the first line (used as the column names); the handle is then positioned at the first data row.
header = file.readline()

In [7]:
# Turn the raw header line into a list of column names by splitting on tabs.
header = header.strip().split('\t')

In [8]:
# Collects one dict per parsed review row.
dataset = []

In [10]:
# Parse every remaining row into a dict keyed by column name.
for line in file:
    # Remove only the line terminator. A bare strip() would also eat leading or
    # trailing tabs, so a row whose first or last field is empty would lose that
    # field and be misaligned/truncated when zipped with `header`.
    fields = line.rstrip('\r\n').split('\t')
    d = dict(zip(header, fields))
    # These columns arrive as text; convert them to int once at load time.
    for column in ('star_rating', 'helpful_votes', 'total_votes'):
        d[column] = int(d[column])
    dataset.append(d)
# The stream is fully consumed here; close it rather than leaking the handle
# for the lifetime of the kernel.
file.close()

In [11]:
# Maps token -> occurrence count; defaultdict(int) makes unseen tokens start at 0.
word_count = defaultdict(int)

In [12]:
# Characters from string.punctuation, held in a set for fast membership tests when filtering text.
punctuation = set(string.punctuation)

In [13]:
# Tally token frequencies over all reviews (lower-cased, punctuation removed,
# split on whitespace).
for d in dataset:
    r = ''.join(c for c in d['review_body'].lower() if c not in punctuation)
    for w in r.split():
        word_count[w] += 1

# (count, word) pairs ordered from most to least frequent.
counts = sorted(((total, token) for token, total in word_count.items()), reverse=True)

# Keep the 100 most frequent words and give each a column index equal to its rank.
words = [token for _, token in counts[:100]]
word_id = {token: position for position, token in enumerate(words)}
word_set = set(words)

In [14]:
def feature(datum, vocabulary=None, strip_chars=None):
    """Build a bag-of-words count vector for a single review.

    The review text is lower-cased, stripped of the characters in
    `strip_chars`, and split on whitespace; each token found in
    `vocabulary` increments that word's column.

    Args:
        datum: mapping with a 'review_body' string.
        vocabulary: optional mapping word -> column index, with indices
            expected to cover 0 .. len(vocabulary) - 1. Defaults to the
            notebook-level `word_id`.
        strip_chars: optional collection of characters to remove before
            tokenising. Defaults to the notebook-level `punctuation`.

    Returns:
        A list of len(vocabulary) + 1 ints: the per-word counts followed
        by a constant 1.
    """
    if vocabulary is None:
        vocabulary = word_id
    if strip_chars is None:
        strip_chars = punctuation

    feat = [0] * len(vocabulary)
    r = ''.join(c for c in datum['review_body'].lower() if c not in strip_chars)
    for w in r.split():
        # Dict lookup is O(1) per token; the earlier `w in words` test scanned
        # the whole word list for every token of every review.
        index = vocabulary.get(w)
        if index is not None:
            feat[index] += 1
    # Constant column so a model fitted with fit_intercept=False can still learn an offset.
    feat.append(1)
    return feat

In [15]:
# Shuffle with a dedicated, seeded generator so the train/validation/test split
# (and every error computed from it) is the same after a kernel restart.
# A private Random instance leaves the global `random` state untouched.
random.Random(0).shuffle(dataset)

In [16]:
# One feature vector per review, in the current order of `dataset`.
X = [feature(d) for d in dataset]

In [17]:
# Regression targets: each review's star rating, in the same order as X.
y = [d['star_rating'] for d in dataset]

In [34]:
# Positional split: first 50% train, next 25% validation, final 25% test.
N = len(X)
train_end, valid_end = N // 2, 3 * N // 4
X_train, y_train = X[:train_end], y[:train_end]
X_valid, y_valid = X[train_end:valid_end], y[train_end:valid_end]
X_test, y_test = X[valid_end:], y[valid_end:]

In [35]:
# Sanity check: total number of rows, then the train / validation / test sizes.
len(X), len(X_train), len(X_valid), len(X_test)

(149086, 74543, 37271, 37272)

In [36]:
def MSE(model, X, y):
    """Return the mean squared error of `model` on (X, y).

    Calls `model.predict(X)` and averages the squared difference between
    each prediction and the matching entry of `y`. Pairs are formed with
    zip, so any surplus entries on the longer side are ignored.
    """
    predicted = model.predict(X)
    squared_errors = []
    for estimate, target in zip(predicted, y):
        residual = estimate - target
        squared_errors.append(residual ** 2)
    return sum(squared_errors) / len(squared_errors)

In [37]:
#train the model for a range of regularization parameters
best_model = None
best_MSE = None

In [38]:
# Fit one ridge regressor per regularization strength and keep whichever
# achieves the lowest validation MSE.
for lamb in [0.01, 0.1, 1, 10, 100]:
    # fit_intercept=False because every feature vector already ends with a
    # constant 1 column that plays the role of the offset.
    model = linear_model.Ridge(alpha=lamb, fit_intercept=False)
    model.fit(X_train, y_train)

    mse_train = MSE(model, X_train, y_train)
    mse_valid = MSE(model, X_valid, y_valid)

    print('lambda = ' + str(lamb) + ' training---validation error = '+str(mse_train) + '---' + str(mse_valid))
    # Compare against None explicitly: truth-testing an estimator object relies
    # on it not defining __bool__/__len__, which is not guaranteed in general.
    if best_model is None or mse_valid < best_MSE:
        best_model = model
        best_MSE = mse_valid

lambda = 0.01 training---validation error = 0.5301094588998126---0.5418490681191773
lambda = 0.1 training---validation error = 0.5301094590024961---0.5418488082281372
lambda = 1 training---validation error = 0.5301094692676435---0.5418462189102249
lambda = 10 training---validation error = 0.5301104943077638---0.541821283478632
lambda = 100 training---validation error = 0.5302115624881009---0.5416662305952524


In [40]:
# Score the model selected on the validation set against the held-out test split.
mse_test = MSE(best_model, X_test, y_test)
print('test error = '+ str(mse_test))

test error = 0.524367932606683
