In [25]:
import string
from os import listdir
from os.path import isfile, join

import nltk
from nltk import word_tokenize
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [26]:
nltk.download('opinion_lexicon')
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dylanedwards/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
punctuation = string.punctuation
positive_dict = set(opinion_lexicon.positive())
negative_dict = set(opinion_lexicon.negative())
positive_dict_stemmed = [PorterStemmer().stem(word) for word in positive_dict]
negative_dict_stemmed = [PorterStemmer().stem(word) for word in negative_dict]

def get_files_from_dir(directory):
    return [f for f in listdir(directory) if isfile(join(directory, f))]

def process_string_sentence(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        stemmed = [PorterStemmer().stem(word) for word in removed]
        stemmed_sentence = " ".join(stemmed)
        return stemmed_sentence

def process_string(text):
        englishStopwords = stopwords.words("english")  # non-neccesary words
        text = text.lower()  # case folding
        # remove punctuation
        text = "".join([char for char in text if char not in punctuation])
        words = word_tokenize(text)
        removed = [word for word in words if word not in englishStopwords]
        return " ".join(removed)


def tokenize_files(files, dir):
        cleaned_positive_files = []
        for file in files:
            file_path = str.format("{}/{}", dir, file)
            with open(file_path) as f:
                raw_text = f.read()
                cleaned_positive_files.append(process_string(raw_text))
        return cleaned_positive_files

def is_word_positive(word):
        if word in positive_dict or word in positive_dict_stemmed:
            return True
        return False

def is_word_negative(word):
    if word in negative_dict or word in negative_dict_stemmed:
        return True
    return False



def get_word_occurrences(tokenized_files):
        word_occurrences = {}
        word_occurrences["positive"] = 0
        word_occurrences["negative"] = 0
        total_num_words = 0
        for file in tokenized_files:
            # calc number exclams
            # calc number pos/neg/words
            for word in file:
                if is_word_positive(word):
                    word_occurrences["positive"] += 1
                if is_word_negative(word):
                    word_occurrences["negative"] += 1
                if word not in word_occurrences:
                    word_occurrences[word] = 0
                word_occurrences[word] += 1
                total_num_words += 1
        return word_occurrences, total_num_words

def get_raw_text_from_files(files: list, dir: str) -> list:
    raw_text = []
    for file in files:
        file_path = str.format("{}/{}", dir, file)
        with open(file_path) as f:
            file_text_in_lines = f.read()
            raw_text.append(file_text_in_lines)
    return raw_text

In [75]:
import numpy as np
neg_data = np.array(tokenize_files(get_files_from_dir("./data/neg"), "data/neg"))
pos_data = np.array(tokenize_files(get_files_from_dir("./data/pos"), "data/pos"))
allData = np.concatenate((neg_data, pos_data))
print(allData)
# making labels for the data, the first
neg_labels = np.fromiter([0 for i in range(len(neg_data))], int)  # create negative labels
pos_labels = np.fromiter([1 for i in range(len(pos_data))], int)  # create positive labels
allLabels = np.concatenate((neg_labels, pos_labels))

['ordered hp4705 bluetooth keyboard sent two nonbluetooth item send back hope get credit'
 'hoping buy small handheld vacuum office computer hardware keyboads air inlets however datavac little power even full charge next useless cant recommend'
 'bought 3 vtech phones house since begining battery problems really annoying make phone call phone doesnt work battery point time least one phones constantly reports low charge battery message means cant use phone course hear ring phone never room run pick phone since one location battery problem reviewers noted spite fact make minimal use phones leave charger time beginning used take battery phone put back doesnt quite work anymore tried get support cant get live person vtech number website suggest replacing batteries rechargeable many times really supposed buy new rechargeable baterries already replaced know wont buy vtech products'
 ...
 'bought folks christmas took time hook making dvds simple everything recording tv recording movies shows 

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
print(allData[0:2])
bow_converter = CountVectorizer(tokenizer=lambda doc: doc)
y = bow_converter.fit_transform(allData[0:2])
# bigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[2,2])
# tfidf_transform = text.TfidfTransformer(norm=None)
# X_tfidf = tfidf_transform.fit_transform(X_bow)
print(bow_converter.get_stop_words())
words = bow_converter.get_feature_names()
print(len(words))
print(words)
print(len(bow_converter.vocabulary_))

['ordered hp4705 bluetooth keyboard sent two nonbluetooth item send back hope get credit'
 'hoping buy small handheld vacuum office computer hardware keyboads air inlets however datavac little power even full charge next useless cant recommend']
None
28
[' ', '0', '4', '5', '7', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
28




In [None]:
from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(trainingData, trainingLabels, testingData, testingLabels, description, _C=1.0):
    model = LogisticRegression(C=_C).fit(trainingData, trainingLabels)
    score = model.score(testingData, testingLabels)
    print('Test Score with', description, 'features', score)
    return model

In [None]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.linear_model import LogisticRegression

kf = KFold(n_splits=5, shuffle=True)
uSIF_model = uSIF(glove, workers=2, lang_freq="en")
#Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings model.


for trainingIndex, testingIndex in kf.split(allData):
    trainingData, testingData = allData[trainingIndex], allData[testingIndex]
    trainingLabels, testingLabels = allLabels[trainingIndex], allLabels[testingIndex]

    print("trainingLabels:", trainingLabels)
    # train naive bayes model
    # gnb = GaussianNB()
    # gnb.fit(train_embed, trainingLabels)
    model = LogisticRegression(C=_C).fit(trainingData, trainingLabels)
    score = model.score(testingData, testingLabels)

    labelPrediction = gnb.predict_proba(test_embed)[:, 1]
    print("label pred", labelPrediction)

    precision, recall, thresholds = precision_recall_curve(testingLabels, labelPrediction)
    average_precision = average_precision_score(testingLabels, labelPrediction)

    disp = plot_precision_recall_curve(gnb, test_embed, testingLabels)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
