# Text Mining Classifiers

In [27]:
import pandas as pd
import numpy as np

## Prepare Data
### Using Amazon_Unlocked_Mobile dataset from kaggle.

In [2]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


### Remove null value rows and rating=3 to prepare data for sentiment analysis(positive or not)

In [5]:
df.dropna(inplace=True)   # Drop any rwos with null value
df = df[df['Rating'] != 3]   # Assume rating 3 is neutral and ignore considering that
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)  # Create column for positive sentiment
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0,0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0,0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0,0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0,1
11,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,This is a great product it came after two days...,0.0,1


In [6]:
df['Positively Rated'].mean()

0.748269374249846

#### It shows we have imbalanced classes.

## Model Selection
### Split the data to train and test set. Reviews assumed as input and Positively Rated is the target.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df['Reviews']
y = df['Positively Rated']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print('X_train first entery:\n\n', X_train[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entery:

 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


X_train shape:  (231202,)


### Vectorize data to use classifier

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
vect = CountVectorizer().fit(X_train)

In [19]:
print('Vocab list contains: ', len(vect.get_feature_names_out()))
vect.get_feature_names_out()[::2000]     # Every 2000th feature of vocabluary

Vocab list contains:  53271


array(['00', '4h', 'adpaters', 'assembly', 'blasts', 'cashiers',
       'condidtion', 'debit', 'domestic', 'estimates', 'flawlessy',
       'gothrough', 'hui', 'irritating', 'lighting5', 'microcomputer',
       'nigeria', 'p7_l00', 'poorer', 'quirkyness', 'responsibility',
       'sens', 'sorrow', 'synch', 'trace', 'usvi', 'within3'],
      dtype=object)

In [22]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231202x53271 sparse matrix of type '<class 'numpy.int64'>'
	with 6113585 stored elements in Compressed Sparse Row format>

## Logistic Regression
### Using this model which is suitable for high dimentional data and binary classification.

In [23]:
from sklearn.linear_model import LogisticRegression

In [25]:
model = LogisticRegression(max_iter=1000).fit(X_train_vectorized, y_train)

## Model Evaluation
### Using roc area under the curve method to evaluate our model performance.

In [26]:
from sklearn.metrics import roc_auc_score

In [28]:
y_pred = model.predict(vect.transform(X_test))   # Predictions: using vectorized test data

print('AUC: ', roc_auc_score(y_test, y_pred))

AUC:  0.9303790943660453


In [29]:
feature_names = np.array(vect.get_feature_names_out())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))      # Connected to negative reviews
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))    # Connected to positive reviews

Smallest Coefs:
['mony' 'worst' 'false' 'horribly' 'dissatisfied' 'worthless'
 'unsatisfied' 'junk' 'blacklist' 'garbage']

Largest Coefs: 
['excelent' 'excelente' '4eeeks' 'efficient' 'excellent' 'loving'
 'exelente' 'lovely' 'loves' 'mn8k2ll']


## Part 2

## Tfidf Vectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
vect = TfidfVectorizer(min_df=5).fit(X_train)   # Vocab list with words that appear at least 5 times in the document.
print('Tfidf vocab list contains: ', len(vect.get_feature_names_out()))

Tfidf vocab list contains:  18024


In [32]:
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

model = LogisticRegression(max_iter=1000).fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)

print('AUC: ', roc_auc_score(y_test, y_pred))

AUC:  0.9279682537578712


#### Smallest Tfidf: List of words which Commonly appeared across all reviews, or only appeared rarely in very long reviews.
#### Largest Tfidf: List of words which appeared frequently in a review, but did not appear commonly across all reviews.

In [37]:
feature_names = np.array(vect.get_feature_names_out())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest Tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest Tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest Tfidf:
['storageso' 'warmness' 'aggregration' 'commenter' 'pthalo' '1300' '34ghz'
 'bridging' 'srgb' 'seizing']

Largest Tfidf: 
['too' 'malo' 'true' 'bjvjjbkvjvj' 'satisfied' 'problems' 'satisfecho'
 'malfunction' 'satisfactory' 'horrible']


In [38]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'terrible' 'useless' 'waste' 'disappointed' 'poor' 'return'
 'horrible' 'returning']

Largest Coefs: 
['love' 'great' 'excellent' 'amazing' 'perfect' 'easy' 'perfectly'
 'awesome' 'loves' 'best']


#### Problems

In [39]:
X_pred = vect.transform(['not an issue, phone is working', 'an issue, phone is not working'])   # Sample input
print(model.predict(X_pred))

[0 0]


#### The first sentence is classified as negative! Using n-grams to handle it.
## n-grams

In [42]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

len(vect.get_feature_names_out())

200192

In [43]:
model = LogisticRegression(max_iter=1000).fit(X_train_vectorized, y_train)

y_pred = model.predict(X_test_vectorized)

print('AUC: ', roc_auc_score(y_test, y_pred))

AUC:  0.9674179850589075


#### With bigrams auc of the model is improved.

In [44]:
feature_names = np.array(vect.get_feature_names_out())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'junk' 'worst' 'not good' 'horrible' 'terrible' 'not happy'
 'not very' 'looks ok' 'garbage']

Largest Coefs: 
['not bad' 'excelent' 'excelente' 'excellent' 'no problems' 'perfect'
 'no issues' 'exelente' 'awesome' 'amazing']


In [45]:
X_pred = vect.transform(['not an issue, phone is working', 'an issue, phone is not working'])   # Sample input
print(model.predict(X_pred))

[1 0]


#### The problem is solved!