In [1]:
import pandas as pd
import numpy as np

# Load the Amazon unlocked-mobile reviews from a CSV in the working directory.
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

# The 10% down-sampling call is disabled here; the same call runs in the
# next cell, so this cell previews the full, unsampled frame.
#df = df.sample(frac=0.1, random_state=10)

# Show the first rows of the full dataset.
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [2]:
# Keep a reproducible 10% random sample of the rows to speed up computation.
# NOTE: this rebinds `df` to a sample of itself, so running the cell twice
# samples the sample; re-run the load cell first to start from the full data.
df = df.sample(frac=0.1, random_state=10)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [3]:
# Build the labelled modelling frame in a single chained expression. Each step
# returns a new DataFrame, so the label column is added to a fresh frame rather
# than assigned onto a filtered slice (which triggers SettingWithCopyWarning),
# and nothing is mutated in place.
df = (
    df.dropna()                                  # drop rows with any missing value
      .loc[lambda frame: frame['Rating'] != 3]   # drop 3-star reviews before labelling
      # 1 when the rating is above 3, otherwise 0
      .assign(**{'Positively Rated': lambda frame: np.where(frame['Rating'] > 3, 1, 0)})
)
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.00,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.00,5,I fell in love with this phone because it did ...,0.0,1
...,...,...,...,...,...,...,...
30001,Apple iPhone 5c 32GB (Blue) - AT&T,Apple,274.95,5,What an upgrade compared to the iPhone 4. Goin...,7.0,1
313198,Samsung Galaxy Grand Prime DUOS G531H/DS - Gra...,Samsung,179.99,4,I liked it at first but is starting to lag alr...,0.0,1
138219,"BLU Studio 5.0 C HD Unlocked Cellphone, White",BLU,2000.00,4,very nice,0.0,1
66571,Apple iPhone 6s 64 GB International Warranty U...,Apple,689.95,1,It is not a new one. The tagboard on the box w...,0.0,0


In [4]:
# Mean of the 0/1 label column = fraction of the remaining reviews that are
# positively rated. Most ratings are positive, so the classes are imbalanced.
df['Positively Rated'].mean()

0.7471776686078667

In [5]:
from sklearn.model_selection import train_test_split
# Feature: the raw review text. Label: the binary "Positively Rated" flag.
X, y = df["Reviews"], df["Positively Rated"]

# Hold out part of the data for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [6]:
# Sanity check: show one raw training document and how many there are.
print('X_train first entry:', X_train.iloc[0])
print('X_train shape: ', X_train.shape)

X_train first entry: Everything about it is awesome!
X_train shape:  (23052,)


In [7]:
# The label vector should have one entry per training document.
y_train.shape

(23052,)

In [8]:
# Count vectorization represents each document by the number of times each
# vocabulary word occurs in it.
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data only (the test set is not used here).
vect = CountVectorizer().fit(X_train)

In [9]:
# Look at every 2000th feature name: [::2000] is a slice with step 2000 that
# starts at index 0.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() - confirm the installed
# version before re-running this notebook.
vect.get_feature_names()[::2000]

['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']

In [10]:
# Size of the fitted vocabulary, i.e. the number of columns in the
# document-term matrix built from it.
len(vect.get_feature_names())

19601

In [11]:
# Transform the training documents into a document-term matrix, then encode
# the test documents with the same fitted vectorizer (it is not refit on the
# test data).
X_train_vectorized = vect.transform(X_train)
X_test_vectorized=vect.transform(X_test)

# Display the sparse matrix summary (shape, dtype, stored elements).
X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier with default settings on the
# count-vectorized training data. fit() is the last expression in the cell,
# so the fitted estimator is displayed as the cell output.
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
from sklearn.metrics import roc_auc_score

# Predict class labels for the vectorized test documents.
predictions = model.predict(X_test_vectorized)

# Area under the ROC curve, used here to check model performance.
# NOTE(review): roc_auc_score is documented to take scores (probabilities or
# decision_function values); here it receives hard 0/1 labels from predict(),
# so the value reflects a single threshold. Consider passing
# model.decision_function(X_test_vectorized) - confirm before changing, since
# the later AUC cells use the same convention.
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8974332776669326


In [14]:
# Score on the data the model was trained on (for scikit-learn classifiers
# score() is documented as mean accuracy); compare with the test score below
# to gauge overfitting.
model.score(X_train_vectorized,y_train)

0.9736682283532883

In [15]:
# Same score computed on the held-out test split.
model.score(X_test_vectorized,y_test)

0.93181522446324

In [16]:
# Vocabulary as a numpy array so it can be indexed with an array of positions.
feature_names = np.array(vect.get_feature_names())

# Feature positions ordered from the lowest coefficient to the highest.
sorted_coef_index = model.coef_[0].argsort()

# The first ten positions belong to the smallest coefficients. The slice
# [:-11:-1] walks the array backwards from its end (negative step), so it
# yields the ten largest coefficients ordered largest to smallest.
lowest_ten = sorted_coef_index[:10]
highest_ten = sorted_coef_index[:-11:-1]

print('Smallest Coefs:\n{}\n'.format(feature_names[lowest_ten]))
print('Largest Coefs: \n{}'.format(feature_names[highest_ten]))

# The lookups above rely on the coefficients being in the same column order as
# the vectorizer's features; print the full ordering of feature positions.
print(sorted_coef_index)


Smallest Coefs:
['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
 'waste' 'disappointed']

Largest Coefs: 
['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']
[19319 17262 15968 ...  6690  6682  6681]


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: term frequency-inverse document frequency.

# Features with high tf-idf are used often in particular documents but rarely across all documents.
# Features with low tf-idf are generally used across all documents in the corpus.

# Fit the TfidfVectorizer to the training data, specifying a minimum document frequency of 5:
# each token needs to appear in at least 5 documents to become part of the vocabulary,
# so terms that appear in fewer than 5 documents are ignored.
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

# CountVectorizer tokenizes the text and stores raw counts for every vocabulary word.
# TfidfVectorizer stores tf-idf weights instead of counts; the smaller vocabulary here
# comes from the min_df=5 cut-off, not from the tf-idf weighting itself.

5442

In [18]:
# Re-encode BOTH splits with the TF-IDF vectorizer. Rebinding the test matrix
# as well keeps X_test_vectorized in step with the current vectorizer and
# model; otherwise it would still hold the CountVectorizer matrix from the
# earlier cell, with a different number of columns.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Fit a fresh logistic regression on the TF-IDF features.
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(X_test_vectorized)

# Same metric convention as the other AUC cells (hard labels from predict()).
print('AUC: ', roc_auc_score(y_test, predictions))



AUC:  0.889951006492175


In [19]:
# Vocabulary of the TF-IDF vectorizer as an array, for positional indexing.
feature_names = np.array(vect.get_feature_names())

# max(0) takes the maximum over axis 0, i.e. each feature's largest tf-idf value
# across the training documents. toarray() converts that sparse result to a
# dense array holding the same data, [0] takes its single row, and argsort()
# orders the feature positions from the lowest maximum to the highest.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

# First ten positions = smallest values; [:-11:-1] = ten largest, largest first.
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))


Smallest tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Largest tfidf: 
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']


In [20]:
# Feature positions ordered from the lowest coefficient to the highest.
sorted_coef_index = model.coef_[0].argsort()

# Ten lowest coefficients, then the ten highest (walked backwards from the
# end of the ordering, so largest first).
lowest_ten = sorted_coef_index[:10]
highest_ten = sorted_coef_index[:-11:-1]

print('Smallest Coefs:\n{}\n'.format(feature_names[lowest_ten]))
print('Largest Coefs: \n{}'.format(feature_names[highest_ten]))

Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome'
 'far' 'perfectly']


In [21]:
# These two reviews contain exactly the same words in a different order, and
# the current model gives them the same prediction even though their meanings
# are opposite.
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))
# Adding n-gram features (next section) is used to overcome this.

[0 0]


In [22]:
# 0 means a negative rating
# 1 means a positive rating

In [23]:
# An n-gram is a run of n adjacent tokens. The features produced below are word
# n-grams (character n-grams, by contrast, would split "fox" into "fo" and "ox").
# n=2: two-word sequences such as "please turn", "turn your" or "your homework".
# n=3: three-word sequences such as "please turn your" or "turn your homework".
# Fit the CountVectorizer to the training data specifying a minimum
# document frequency of 5 and extracting 1-grams and 2-grams.
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

# Re-encode both splits with the new unigram + bigram vocabulary.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized=vect.transform(X_test)

# Number of unigram and bigram features kept.
len(vect.get_feature_names())

29072

In [24]:
# Fit a fresh logistic regression on the unigram + bigram count features.
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(X_test_vectorized)

# NOTE(review): as in the earlier AUC cells, roc_auc_score receives hard 0/1
# labels from predict() rather than scores; consider
# model.decision_function(X_test_vectorized) - confirm and change all AUC cells
# together so the numbers stay comparable.
print('AUC: ', roc_auc_score(y_test, predictions))



AUC:  0.9110661794597458


In [25]:
# Unigram + bigram vocabulary as an array, for positional indexing.
feature_names = np.array(vect.get_feature_names())

# Feature positions ordered from the lowest coefficient to the highest.
sorted_coef_index = model.coef_[0].argsort()

# Ten lowest coefficients, then the ten highest (walked backwards from the
# end of the ordering, so largest first).
lowest_ten = sorted_coef_index[:10]
highest_ten = sorted_coef_index[:-11:-1]

print('Smallest Coefs:\n{}\n'.format(feature_names[lowest_ten]))
print('Largest Coefs: \n{}'.format(feature_names[highest_ten]))

Smallest Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Largest Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']


In [26]:
# These reviews are now correctly identified.
# The raw strings must go through the fitted vectorizer first: the model was
# trained on the document-term matrix, not on text. The call is also closed
# with balanced parentheses so the cell parses.
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]
