In [1]:
import gzip
from collections import defaultdict
import string
import random
from nltk.stem.porter import PorterStemmer
import numpy

In [2]:
path = r"C:\Users\TaMeR\Desktop\Data SCIENCE\coursera\Recomendation System\amazon_reviews_us_Gift_Card_v1_00.tsv.gz"

In [3]:
f = gzip.open(path, "rt", encoding="utf8")

In [4]:
f

<_io.TextIOWrapper name='C:\\Users\\TaMeR\\Desktop\\Data SCIENCE\\coursera\\Recomendation System\\amazon_reviews_us_Gift_Card_v1_00.tsv.gz' encoding='utf8'>

In [5]:
header = f.readline()

In [6]:
header = header.strip().split("\t")

In [7]:
dataset = []
for line in f:
    fields = line.strip().split("\t")
    d = dict(zip(header, fields))
    d["star_rating"] = int(d["star_rating"])
    d["helpful_votes"] = int(d["helpful_votes"])
    d["total_votes"] = int(d["total_votes"])
    dataset.append(d)

In [8]:
# How many words?
wordCount = defaultdict(int)
for d in dataset:
    for w in d["review_body"].split():
        wordCount[w] += 1
print (len(wordCount))

97289


In [9]:
# What if we ignore capitalization and punctuation?
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = "".join([c for c in d["review_body"].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
print (len(wordCount))

46283


In [10]:
#Stemming
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in dataset:
    r = "".join([c for c in d["review_body"].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1
        
print (len(wordCount))
    

37480


In [11]:
#what are the most popular 1000 words? Sorting words by popularity and keeping first 1000
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]] 
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [12]:
counts[10]

(48064, 'my')

In [13]:
#Extracting features from the most popular words
def feature(datum):
    feat = [0]*len(words)
    r = "".join([c for c in datum["review_body"].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] +=1 #adding each time we see that word in text
    feat.append(1) #Appending the offset feature to the end
    return feat

In [14]:
# WE can examine which words have most positive/negative sentiment by looking at their coef.
random.shuffle(dataset)

In [15]:
X = [feature(d) for d in dataset]
y = [d["star_rating"] for d in dataset]

In [16]:
from sklearn import linear_model
model = linear_model.Ridge(1.0, fit_intercept=False) #1.0 regularization strength like lambda
model.fit(X,y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
theta = model.coef_

In [18]:
wordWeights = list(zip(theta,words + ["offset"]))

In [19]:
wordWeights.sort()

In [20]:
wordWeights[:10]

[(-1.2822495123210373, 'reciev'),
 (-1.1530865104813663, 'guarante'),
 (-1.012026834981086, 'avail'),
 (-0.6884807929713764, 'cancel'),
 (-0.6325854621975856, 'twice'),
 (-0.5995169256896313, 'australia'),
 (-0.5945297034564829, 'fix'),
 (-0.558526274628698, 'wont'),
 (-0.5376681254518147, 'cannot'),
 (-0.5219196565515758, 'never')]

In [21]:
wordWeights[-10:]

[(0.37175886768049043, 'leav'),
 (0.38195008158508653, 'impress'),
 (0.4149658112989477, 'parti'),
 (0.4279194329325023, 'rang'),
 (0.4589016203407775, 'oneday'),
 (0.6414192046902523, 'excel'),
 (0.7239784579814638, 'serv'),
 (0.7747363294678746, 'attract'),
 (0.901333385975219, 'instruct'),
 (4.740065953935519, 'offset')]

In [22]:
predictions = model.predict(X)
differences = [(x-y)**2 for (x,y) in zip(predictions,y)]

In [23]:
MSE = sum(differences)/len(differences)
print ("MSE = " + str(MSE))

MSE = 0.4673208770748789


In [24]:
FVU = MSE / numpy.var(y)
R2 = 1-FVU
print ("R2 = " + str(R2))

R2 = 0.3205023016877865


In [25]:
y_class = [(rating > 3) for rating in y] #True or False yaptık, 3den büyükse True

In [26]:
model = linear_model.LogisticRegression()
model.fit(X,y_class)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
predictions = model.predict(X)

In [28]:
correct = predictions == y_class #list of True/False 

In [29]:
accuracy = sum(correct)/len(correct)
print ("Accuracy = " + str(accuracy))

Accuracy = 0.9549857129442068


In [30]:
TP = sum([(p and l) for (p,l) in zip(predictions,y_class)])
FP = sum([(p and not l) for (p,l) in zip(predictions,y_class)])
TN = sum([(not p and not l) for (p,l) in zip(predictions,y_class)])
FN = sum([(not p and l) for (p,l) in zip(predictions,y_class)])

In [31]:
print ( "TP = " + str(TP))
print ( "FP = " + str(FP))
print ( "TN = " + str(TN))
print ( "FN = " + str(FN))

TP = 138419
FP = 5562
TN = 3956
FN = 1149


In [32]:
(TP+TN)/(TP+FP+TN+FN)

0.9549857129442068

In [33]:
TPR = TP /(TP+FN) #True Pozitive Rate
TNR = TN /(TN+FP)

In [34]:
BER = 1-1/2*(TPR+TNR)
print ("Balanced error rate = " + str(BER))

Balanced error rate = 0.2962995048425716


In [35]:
precision = TP / (TP + FP)

In [36]:
recall = TP / (TP + FN)

In [38]:
F1 = 2*(precision*recall) / (precision + recall)

In [39]:
F1

0.9763321330704745

In [40]:
confidences = model.decision_function(X)

In [41]:
confidences

array([6.63807111, 6.39024595, 7.05738601, ..., 3.77042612, 1.31410576,
       3.42196401])

In [42]:
confidencesAndLabels = list(zip(confidences,y_class))

In [43]:
confidencesAndLabels

[(6.638071111285742, True),
 (6.390245946240506, True),
 (7.057386014078799, True),
 (6.360881543506584, True),
 (4.448413606990894, True),
 (-7.383782620160887, False),
 (5.637245037938867, True),
 (2.7713220560379614, True),
 (6.215662971752723, True),
 (2.7226627511162347, True),
 (2.911277117801889, True),
 (4.4651454329091935, True),
 (3.7534245427842796, True),
 (2.911277117801889, True),
 (3.541858133539664, True),
 (2.899774962057265, True),
 (4.255298569919799, True),
 (2.5692426259940966, False),
 (-1.6757374628578445, False),
 (5.7329288830923355, True),
 (8.05919458002211, True),
 (8.923951582695906, True),
 (5.3868373843728845, True),
 (4.65683935609088, True),
 (4.028921888053484, True),
 (7.24455892292921, True),
 (5.311516562013387, True),
 (3.253509310763153, True),
 (2.911277117801889, True),
 (2.195089375601846, True),
 (4.077477651405506, True),
 (4.634786176249816, True),
 (4.3568901752970755, True),
 (3.540763077976804, True),
 (4.701541227736509, True),
 (3.27967

In [44]:
labelsRankedByConfidence = [z[1] for z in confidencesAndLabels]

In [45]:
labelsRankedByConfidence

[True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 Tru

In [46]:
def precisionAtK(K,y_sorted):
    return sum(y_sorted[:K])/K

In [47]:
def recallAtK(K,y_sorted):
    return sum(y_sorted[:K]) / sum(y_sorted)

In [48]:
precisionAtK(50, labelsRankedByConfidence)

0.92

In [49]:
precisionAtK(1000, labelsRankedByConfidence)

0.941

In [50]:
precisionAtK(10000, labelsRankedByConfidence)

0.935