Andrew Marshall
INFS 770
04/19/2019
Assignment 4

In [None]:
# Built-in/Generic Imports
import os
import sys
#

# Libraries
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import gensim

#import matplotlib.pyplot as plt
#import mglearn.plots
#

# Modules
from nltk import word_tokenize 
from sklearn.cluster import KMeans
from gensim.models import LdaModel, LsiModel
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#from nltk.stem import WordNetLemmatizer 
#from sklearn import tree

#from sklearn.datasets import load_breast_cancer
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.datasets import make_blobs

#from sklearn.svm import SVC
#from sklearn.metrics import classification_report,confusion_matrix
#from sklearn.metrics import precision_score
#

# Suppress scientific notation when NumPy arrays are printed
np.set_printoptions(suppress=True)



Q1

In [None]:
# Load the review dataset into a DataFrame and preview the first rows
reviews = pd.read_csv('amazon_review_texts.csv')
reviews.head()


In [None]:
# Number of reviews per score value
reviews['score'].value_counts()

In [None]:
# Number of reviews per product category
reviews['category'].value_counts()

Q2

In [None]:
# English stopwords, stored as a set for fast membership tests in preprocess()
stopwords = set(nltk.corpus.stopwords.words("english"))

def before_token(documents):
    """Normalize raw documents before tokenization.

    Each document is lower-cased, reduced to its word tokens of two or more
    characters (which drops punctuation and one-character words), and then
    stripped of tokens that consist only of digits.

    Parameters
    ----------
    documents : iterable
        Raw texts. Entries that are ``None`` or a float NaN (how pandas
        represents a missing cell) are treated as empty documents; any other
        non-string entry is converted with ``str()``.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. Removed
        numbers leave their surrounding spaces behind, so callers should
        split on whitespace rather than on a single space.
    """
    # compile once instead of re-parsing the patterns for every document
    word_pattern = re.compile(r'\b\w\w+\b')
    number_pattern = re.compile(r'\b[0-9]+\b')

    cleaned = []
    for doc in documents:
        if isinstance(doc, str):
            text = doc
        elif doc is None or (isinstance(doc, float) and doc != doc):
            # missing value (NaN is the only float not equal to itself)
            text = ''
        else:
            text = str(doc)
        # keep only word tokens (removes punctuation), then drop pure numbers
        words_only = " ".join(word_pattern.findall(text.lower()))
        cleaned.append(number_pattern.sub('', words_only))
    return cleaned

# Porter stemmer shared by preprocess()
stemmer = nltk.stem.PorterStemmer()

# empty frequency distribution used as a placeholder
# NOTE(review): fdist is rebound once the token frequencies are computed, so this object is never filled
fdist = nltk.FreqDist()


# Turn one cleaned document into its list of stemmed, non-stopword tokens
def preprocess(doc):
    """Split ``doc`` on whitespace, drop stopwords, and stem the remaining tokens."""
    return [stemmer.stem(word) for word in doc.split() if word not in stopwords]
            
# normalize every review text, then tokenize/stem each one
processed = [preprocess(text) for text in before_token(reviews['text'])]
print(processed[0])

In [None]:
# Count how often each stemmed token occurs across the whole corpus.
# FreqDist consumes an iterable of tokens and maps each unique token to its frequency.
fdist = nltk.FreqDist(token for tokens in processed for token in tokens)

In [None]:
# corpus-level vocabulary statistics, one per line
print(
    "Unique tokens: %d" % fdist.B(),
    "Total tokens: %d" % fdist.N(),
    "Tokens occurred only once: %d" % len(fdist.hapaxes()),
    sep="\n",
)

In [None]:
# Show the 10 most frequent tokens together with their counts
fdist.tabulate(10)

Based on the results above, the word "would" is the least useful of the ten most frequent tokens, since it has the lowest frequency among them.

Q3

In [None]:
# re-join each token list into one space-separated string for the vectorizer
processed_doc = [" ".join(tokens) for tokens in processed]

In [None]:
# Vectorize the stemmed documents as L2-normalized TF-IDF features.
# Terms that appear in more than 80% of the documents are dropped (max_df=0.8).
# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter type and reject a set.
vectorizer = TfidfVectorizer(max_df=0.8, norm="l2", stop_words=sorted(stopwords))
corpus_vect = vectorizer.fit_transform(processed_doc)
print(corpus_vect) # sparse matrix

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; call whichever this installation provides.
_get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
df_vect = pd.DataFrame(corpus_vect.toarray(), columns=_get_feature_names())
print(df_vect)

In [None]:
# Examine the mapping of each term to its feature (column) index
vectorizer.vocabulary_

Q4

In [None]:
# corpus_vect.shape is a (rows, columns) tuple, which fills both %d placeholders
print("n_samples: %d, n_features: %d" % corpus_vect.shape)

In [None]:
# Use one cluster per distinct product category
categories = len(reviews['category'].unique())

# fixed random_state keeps the centroid initialisation repeatable
km = KMeans(n_clusters=categories, max_iter=100, random_state=42)
km.fit(corpus_vect)

In [None]:
# Map every document into cluster-distance space (one column per cluster)
km.transform(corpus_vect)

In [None]:
# Cluster assigned to the first document
km.predict(corpus_vect[0])

In [None]:
# Examine the representative words for each cluster: the ten terms with the
# largest weights in each centroid.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; call whichever this installation provides.
terms = (getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names)()
for i in range(categories):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    # blank line between clusters (a bare `print` without parentheses is a no-op in Python 3)
    print()

Three of the clusters appear to be accurate, but Cluster 0 does not accurately describe the category "automotive". I believe the word "bed", which most likely refers to a truck bed, led to the inclusion of "mattress" and "sleep", which shows that context is a factor.

Q5

In [None]:
# Wrap the TF-IDF matrix as a gensim corpus (documents are rows, hence
# documents_columns=False) and invert the vectorizer's term -> index mapping
# into the index -> term lookup passed to the topic model.
corpus = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
print(id2word)

In [None]:
# Build the LDA model. LDA training is stochastic, so a fixed random_state is
# supplied to make the reported topics reproducible from run to run.
lda = LdaModel(corpus, num_topics=4, id2word=id2word, passes=10, random_state=42)
print(lda.print_topics())

The top-weighted words appear to describe their respective topics, though not as well as the clustering method did.
Also, LDA results can vary from run to run depending on which random seed is used. It would also require a larger sample size
to generate more accurate results.

Q6

In [None]:
# 5-fold stratified cross validation: predict the product category from TF-IDF features
docs = np.array(processed_doc)  # converted once instead of on every fold
labels = reviews.category
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(skf.split(docs, labels), start=1):
    print("Fold %d" % fold)
    # partition: the splitter yields positional indices, so select labels by position
    train_x, test_x = docs[train_index], docs[test_index]
    train_y, test_y = labels.iloc[train_index], labels.iloc[test_index]
    # vectorize: fit on the training fold only so the test fold does not shape the vocabulary or IDF weights
    # (stop_words is a list because recent scikit-learn releases reject a set)
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=sorted(stopwords))
    X = vectorizer.fit_transform(train_x)
    print("Number of features: %d" % len(vectorizer.vocabulary_))
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier(random_state=fold, max_iter=1000, tol=1e-3)
    clf.fit(X, train_y)
    # predict
    pred = clf.predict(X_test)
    # classification results
    print(metrics.classification_report(test_y, pred))

Q7

In [None]:
# Score groups: 4-5 are treated as satisfied, 1-3 as not satisfied
scores= reviews['score']
sat = [4,5]
notsat = [1,2,3]

In [None]:
# Binary satisfaction label per review: 1 for a score in `sat`, 0 for a score in `notsat`.
# A score found in neither list used to be skipped silently, which shortened the
# list and shifted every later label onto the wrong review; fail loudly instead.
unexpected = ~scores.isin(sat + notsat)
if unexpected.any():
    raise ValueError(
        "Unexpected score values: %s" % sorted(scores[unexpected].astype(str).unique())
    )
satisfaction = scores.isin(sat).astype(int).tolist()


#satisfaction = np.array(satisfaction)   

In [None]:
# Attach the binary satisfaction labels to the reviews as a regular column.
# The Series (not the one-column DataFrame) is assigned so the intent is explicit;
# values are aligned on the index.
sat_df = pd.DataFrame({'satisfaction': satisfaction})
reviews["satisfaction"] = sat_df["satisfaction"]


In [None]:
# 5-fold stratified cross validation: predict satisfaction from TF-IDF features
docs = np.array(processed_doc)  # converted once instead of on every fold
labels = reviews.satisfaction
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(skf.split(docs, labels), start=1):
    print("Fold %d" % fold)
    # partition: the splitter yields positional indices, so select labels by position
    train_x, test_x = docs[train_index], docs[test_index]
    train_y, test_y = labels.iloc[train_index], labels.iloc[test_index]
    # vectorize: fit on the training fold only so the test fold does not shape the vocabulary or IDF weights
    # (stop_words is a list because recent scikit-learn releases reject a set)
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=sorted(stopwords))
    X = vectorizer.fit_transform(train_x)
    print("Number of features: %d" % len(vectorizer.vocabulary_))
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier(random_state=fold, max_iter=1000, tol=1e-3)
    clf.fit(X, train_y)
    # predict
    pred = clf.predict(X_test)
    # classification results
    print(metrics.classification_report(test_y, pred))

Q8

In [None]:
# read the lexicon: word -> polarity (-1 for negative, +1 for positive)
def _read_lexicon_words(path):
    """Return the words listed in one opinion-lexicon file.

    Lines starting with ";" and lines that are empty after stripping
    whitespace are skipped.
    """
    # The encoding is pinned so the result does not depend on the platform default.
    # NOTE(review): the lexicon files are presumably Latin-1 encoded - confirm for your copy.
    with open(path, "r", encoding="latin-1") as in_file:
        return [
            line.strip()
            for line in in_file
            if line.strip() and not line.startswith(";")
        ]


lexicon = dict()
# os.path.join keeps the paths usable on every operating system
lexicon_dir = "opinion-lexicon-English"

# read negative words
for word in _read_lexicon_words(os.path.join(lexicon_dir, "negative-words.txt")):
    lexicon[word] = -1

# read positive words (read second, so a word present in both files ends up positive)
for word in _read_lexicon_words(os.path.join(lexicon_dir, "positive-words.txt")):
    lexicon[word] = 1

# print the first 5 entries
for i, (k, v) in enumerate(lexicon.items()):
    if i >= 5:
        break
    print(k, v)

In [None]:
# The documents were Porter-stemmed during preprocessing, so stem the lexicon
# entries as well; otherwise an inflected lexicon word (e.g. "amazing") can never
# match its stemmed token (e.g. "amaz"). The set removes duplicates created by
# stemming, which TfidfVectorizer does not accept in a vocabulary, and sorting
# gives a stable column order.
vocabulary = sorted({stemmer.stem(word) for word in lexicon})

In [None]:
# 5-fold stratified cross validation: predict satisfaction using only the
# opinion-lexicon terms as TF-IDF features
docs = np.array(processed_doc)  # converted once instead of on every fold
labels = reviews.satisfaction
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(skf.split(docs, labels), start=1):
    print("Fold %d" % fold)
    # partition: the splitter yields positional indices, so select labels by position
    train_x, test_x = docs[train_index], docs[test_index]
    train_y, test_y = labels.iloc[train_index], labels.iloc[test_index]
    # vectorize with the fixed lexicon vocabulary; scikit-learn ignores max_df/min_df
    # when a vocabulary is supplied, so the feature count equals the vocabulary size
    # (stop_words is a list because recent scikit-learn releases reject a set)
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=sorted(stopwords), vocabulary=vocabulary)
    X = vectorizer.fit_transform(train_x)
    print("Number of features: %d" % len(vectorizer.vocabulary_))
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier(random_state=fold, max_iter=1000, tol=1e-3)
    clf.fit(X, train_y)
    # predict
    pred = clf.predict(X_test)
    # classification results
    print(metrics.classification_report(test_y, pred))

The average F1-score has in fact decreased. Context can sometimes factor into how words are interpreted during the analysis.

Q9

In [None]:
# PCA-reduced TF-IDF features evaluated on a single stratified hold-out split.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=sorted(stopwords))
# toarray() yields a plain ndarray; todense() yields np.matrix, which newer scikit-learn rejects
X = vectorizer.fit_transform(processed_doc).toarray()
y = reviews.satisfaction
print(X.shape[1])  # number of TF-IDF features

# Split first so the test documents influence neither the scaling nor the PCA
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# PCA is sensitive to the relative scaling of the original variables, so standardize
# using statistics from the training partition only
scaler = StandardScaler().fit(train_x)
train_std, test_std = scaler.transform(train_x), scaler.transform(test_x)

# Fit a full PCA to find how many leading components explain at least 90% of the variance
pca = PCA(svd_solver='randomized', whiten=True, random_state=42).fit(train_std)
print(pca.explained_variance_ratio_)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# first position where the running total reaches 0.9; capped in case it never does
n_components = min(int(np.searchsorted(cumulative_variance, 0.9)) + 1, len(cumulative_variance))
print(n_components)

# Refit with only that many components and actually project both partitions
# (previously the reduced PCA was fitted but never applied to the data)
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True, random_state=42).fit(train_std)
train_x, test_x = pca.transform(train_std), pca.transform(test_std)

# explicit seed rather than a loop variable left over from an earlier cell
clf = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
print(metrics.classification_report(test_y, pred_y))



PCA stands for Principal Component Analysis.
It reduces the number of features in the data to a smaller set of principal components, and the model then uses those components to make
its predictions. It is recommended that the independent variables be
standardized, because PCA is sensitive to the relative scaling of the original variables.

In [None]:
# 5-fold stratified cross validation on PCA-reduced TF-IDF features.
# The classifier must be trained on the projected documents (one row per review);
# pca.components_ holds the component loadings, not the samples, so the whole
# vectorize -> standardize -> PCA pipeline is rebuilt inside each fold and fitted
# on the training partition only.
docs = np.array(processed_doc)
y = reviews.satisfaction
skf = StratifiedKFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(skf.split(docs, y), start=1):
    print("Fold %d" % fold)
    # partition: the splitter yields positional indices, so select labels by position
    train_docs, test_docs = docs[train_index], docs[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]
    # vectorize (dense arrays are required by StandardScaler with mean centering and by PCA)
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=sorted(stopwords))
    train_tfidf = vectorizer.fit_transform(train_docs).toarray()
    test_tfidf = vectorizer.transform(test_docs).toarray()
    # standardize, since PCA is sensitive to the relative scaling of the variables
    scaler = StandardScaler().fit(train_tfidf)
    train_std, test_std = scaler.transform(train_tfidf), scaler.transform(test_tfidf)
    # reuse the component count chosen from the explained-variance threshold,
    # capped at what this fold's training matrix can support
    fold_components = min(n_components, *train_std.shape)
    fold_pca = PCA(n_components=fold_components, svd_solver='randomized', whiten=True, random_state=fold).fit(train_std)
    train_x, test_x = fold_pca.transform(train_std), fold_pca.transform(test_std)
    print("Number of features: %d" % fold_components)
    # train model
    clf = SGDClassifier(random_state=fold, max_iter=1000, tol=1e-3)
    clf.fit(train_x, train_y)
    # predict
    pred = clf.predict(test_x)
    # classification results
    print(metrics.classification_report(test_y, pred))

The average F1-score has in fact decreased after the cross-validation.
A possible reason for this is that the latent concepts captured by the components cannot always be interpreted accurately.