In [33]:

import string
from os import listdir
from os.path import isfile, join

import nltk
from nltk import word_tokenize
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [34]:
# import ssl
#
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

nltk.download('opinion_lexicon')
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
punctuation = string.punctuation.replace("!", "")
positive_dict = set(opinion_lexicon.positive())
negative_dict = set(opinion_lexicon.negative())
positive_dict_stemmed = [PorterStemmer().stem(word) for word in positive_dict]
negative_dict_stemmed = [PorterStemmer().stem(word) for word in negative_dict]

def get_files_from_dir(directory):
    return [f for f in listdir(directory) if isfile(join(directory, f))]

def process_string_sentence(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        stemmed = [PorterStemmer().stem(word) for word in removed]
        stemmed_sentence = " ".join(stemmed)
        return stemmed_sentence

def process_string(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        #stemmed = [PorterStemmer().stem(word) for word in removed]
        return removed


def tokenize_files(files, dir):
        cleaned_positive_files = []
        for file in files:
            file_path = str.format("{}/{}", dir, file)
            with open(file_path) as f:
                raw_text = f.read()
                cleaned_positive_files.append(process_string(raw_text))
        return cleaned_positive_files

def is_word_positive(word):
        if word in positive_dict or word in positive_dict_stemmed:
            return True
        return False

def is_word_negative(word):
    if word in negative_dict or word in negative_dict_stemmed:
        return True
    return False



def get_word_occurrences(tokenized_files):
        word_occurrences = {}
        word_occurrences["positive"] = 0
        word_occurrences["negative"] = 0
        total_num_words = 0
        for file in tokenized_files:
            # calc number exclams
            # calc number pos/neg/words
            for word in file:
                if is_word_positive(word):
                    word_occurrences["positive"] += 1
                if is_word_negative(word):
                    word_occurrences["negative"] += 1
                if word not in word_occurrences:
                    word_occurrences[word] = 0
                word_occurrences[word] += 1
                total_num_words += 1
        return word_occurrences, total_num_words

In [36]:
import numpy as np
neg_data = np.array(tokenize_files(get_files_from_dir("./data/neg"), "data/neg"))
pos_data = np.array(tokenize_files(get_files_from_dir("./data/pos"), "data/pos"))
allData = np.concatenate((neg_data, pos_data), axis=0)
print(allData)

  neg_data = np.array(tokenize_files(get_files_from_dir("./data/neg"), "data/neg"))


[list(['ordered', 'hp4705', 'bluetooth', 'keyboard', 'sent', 'two', 'nonbluetooth', 'item', 'send', 'back', 'hope', 'get', 'credit'])
 list(['hoping', 'buy', 'small', 'handheld', 'vacuum', 'office', 'computer', 'hardware', 'keyboads', 'air', 'inlets', 'however', 'datavac', 'little', 'power', 'even', 'full', 'charge', 'next', 'useless', 'cant', 'recommend'])
 list(['bought', '3', 'vtech', 'phones', 'house', 'since', 'begining', 'battery', 'problems', 'really', 'annoying', 'make', 'phone', 'call', 'phone', 'doesnt', 'work', 'battery', 'point', 'time', 'least', 'one', 'phones', 'constantly', 'reports', 'low', 'charge', 'battery', 'message', 'means', 'cant', 'use', 'phone', 'course', 'hear', 'ring', 'phone', 'never', 'room', 'run', 'pick', 'phone', 'since', 'one', 'location', 'battery', 'problem', 'reviewers', 'noted', 'spite', 'fact', 'make', 'minimal', 'use', 'phones', 'leave', 'charger', 'time', 'beginning', 'used', 'take', 'battery', 'phone', 'put', 'back', 'doesnt', 'quite', 'work', '

  pos_data = np.array(tokenize_files(get_files_from_dir("./data/pos"), "data/pos"))


In [37]:
allData[1999]

['fabulous',
 'product',
 'store',
 '700',
 'photos',
 '8',
 'mega',
 'pixel',
 'range',
 'happy',
 'purchased',
 'one',
 'trouble',
 'free']

In [38]:
# making labels for the data, the first
neg_labels = np.fromiter([0 for i in range(len(neg_data))], int)  # create negative labels
pos_labels = np.fromiter([1 for i in range(len(pos_data))], int)  # create positive labels
allLabels = np.concatenate((neg_labels, pos_labels), axis=0)

In [39]:

# Gensim is an open-source library for unsupervised topic modeling
import gensim
from gensim.models.keyedvectors import BaseKeyedVectors
import gensim.downloader as api
#  Fast Sentence Embeddings serves as an addition to Gensim.
#  it compute sentence vectors for large collections of sentences.
from fse import IndexedList
glove = api.load("glove-wiki-gigaword-100")
from fse.models import uSIF
#Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model.
# Performs a weighted averaging operation over all words in a sentences.
# After training, the model removes a number of weighted singular vectors.


In [40]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_precision_recall_curve

kf = KFold(n_splits=5, shuffle=True)
uSIF_model = uSIF(glove, workers=2, lang_freq="en")
#Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model.


for trainingIndex, testingIndex in kf.split(allData):
    trainingData, testingData = allData[trainingIndex], allData[testingIndex]
    trainingLabels, testingLabels = allLabels[trainingIndex], allLabels[testingIndex]
    # print("TRAIN:", trainingIndex, "TEST:", testingIndex)
    # print("TRAIN DATA :", trainingData, "TEST DATA:", testingData)
    # print("TRAIN LABELS :", trainingLabels, "TEST LABELS:", testingLabels)
    # for labels 0 is negative 1 is positive

    # train embeddings model
    uSIF_model.train(IndexedList(trainingData))

    train_embed = uSIF_model.infer(IndexedList(trainingData))  # Computed sentence vectors for training
    test_embed = uSIF_model.infer(IndexedList(testingData
                                         ))  # Computed sentence vectors for testing

    print("Train embed:", train_embed)
    print("trainingLabels:", trainingLabels)
    # train naive bayes model
    gnb = GaussianNB()
    gnb.fit(train_embed, trainingLabels)

    labelPrediction = gnb.predict_proba(test_embed)[:, 1]
    print("label pred", labelPrediction)

    precision, recall, thresholds = precision_recall_curve(testingLabels, labelPrediction)
    average_precision = average_precision_score(testingLabels, labelPrediction)

    disp = plot_precision_recall_curve(gnb, test_embed, testingLabels)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))



Train embed: [[-0.1727971   0.00765863  0.05834061 ...  0.06339264 -0.23722184
  -0.25862435]
 [-0.16835839 -0.1045339   0.02215827 ... -0.06613758  0.08829117
   0.01438841]
 [-0.26234218  0.07907388  0.00139689 ... -0.02093972  0.07506853
   0.05300527]
 ...
 [ 0.02842148  0.40536186  0.14771396 ...  0.03133157 -0.51821834
   0.34128636]
 [-0.24320625 -0.1545552   0.12896684 ... -0.28331715  0.35223293
   0.15178126]
 [ 0.2789461  -0.02265552  0.12501845 ... -0.21677262  0.29696578
  -0.10133533]]
trainingLabels: [0 0 0 ... 1 1 1]
label pred [5.44784706e-01 1.86273143e-04 1.98684556e-03 9.48845903e-01
 3.67932606e-04 7.45566550e-06 9.99978108e-01 6.62518137e-01
 2.31974547e-02 4.56233183e-01 6.61356702e-03 5.90240146e-01
 1.51373183e-02 3.49097698e-03 5.42560054e-04 9.85555204e-01
 8.81700093e-03 2.22595529e-02 2.84071783e-02 2.86804293e-05
 1.12683453e-03 9.99999982e-01 6.56754172e-02 2.22289882e-05
 2.33133746e-03 9.97272964e-01 9.98104090e-01 1.89545482e-05
 4.28026235e-01 7.91431