In [3]:
import pandas as pd
from operator import itemgetter
import string
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score,classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import bigrams
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from tpot import TPOTClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler



# Shared VADER analyzer instance, created once at module load and reused by
# VADERizer() for every text it scores.
vader = SentimentIntensityAnalyzer()



# coding=UTF-8

from nltk.corpus import brown

# This is a fast and simple noun phrase extractor (based on NLTK)
# Feel free to use it, just keep a link back to this post
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Created by Shlomi Babluki
# May, 2013

import nltk
# This is our fast Part of Speech tagger
#############################################################################
# Training data for the statistical taggers: the POS-tagged sentences of the
# 'news' category of the Brown corpus.
brown_train = brown.tagged_sents(categories='news')

# Last-resort tagger that guesses a tag from the surface form of a token.
# The patterns are tried in order and the first match wins, so the
# catch-all r'.*' -> 'NN' has to stay at the end of the list.
regexp_tagger = nltk.RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # numbers; the decimal point is escaped
                                         # so that e.g. '3x14' is not a number
     (r'(-|:|;)$', ':'),
     (r'\'*$', 'MD'),
     (r'(The|the|A|a|An|an)$', 'AT'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ness$', 'NN'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.*', 'NN')
])

# Backoff chain: bigram tagger -> unigram tagger -> regexp tagger.
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
#############################################################################


# This is our semi-CFG; Extend it according to your own needs
#############################################################################
# Tag-pair rewrite rules keyed as "<tag>+<tag>".
# NOTE(review): presumably the value is the tag given to the merged pair; the
# code that consumes this table is not part of this file -- confirm.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
#############################################################################





def identity(arg):
    """
    Return ``arg`` unchanged.

    Passed as the ``tokenizer`` of a vectorizer whose input documents have
    already been split into tokens, so that no further tokenization happens.
    """
    return arg


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms raw documents into lists of normalized, lemmatized tokens by
    using NLTK sentence/word tokenization, part of speech tagging and WordNet
    lemmatization, dropping stopwords and punctuation on the way.

    The constructor arguments are stored exactly as given (scikit-learn's
    estimator convention), so the estimator can be cloned by
    ``cross_val_score`` and friends. The stopword and punctuation sets are
    derived from them whenever documents are processed.

    Parameters
    ----------
    stopwords : collection of str or None
        Tokens to drop. ``None`` (or an empty collection) selects NLTK's
        English stopword list. Must be re-iterable (not a one-shot generator).
    punct : collection of str or None
        Characters regarded as punctuation; a token made up solely of these
        characters is dropped. ``None`` (or empty) selects
        ``string.punctuation``. Must be re-iterable.
    lower : bool
        Lowercase every token before filtering and lemmatizing.
    strip : bool
        Strip surrounding whitespace, then ``_``, then ``*`` from every token.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Stores the parameters untouched and instantiates the lemmatizer.
        """
        self.lower      = lower
        self.strip      = strip
        # Kept verbatim: rewriting a constructor argument here (e.g. wrapping
        # it in set()) makes sklearn.base.clone reject the estimator.
        self.stopwords  = stopwords
        self.punct      = punct
        self.lemmatizer = WordNetLemmatizer()

    def _resolve_filters(self):
        """
        Returns the (stopword set, punctuation set) for the current
        parameters, falling back to the defaults for None/empty values.
        """
        stop = set(self.stopwords) if self.stopwords else set(sw.words('english'))
        punct = set(self.punct) if self.punct else set(string.punctuation)
        return stop, punct

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation; the input is returned as is.
        """
        return X

    def transform(self, X):
        """
        Runs the preprocessing on each document and returns one token list
        per document.
        """
        # Resolve the filter sets once per call rather than once per document.
        stop, punct = self._resolve_filters()
        return [
            list(self._tokenize(doc, stop, punct)) for doc in X
        ]

    def tokenize(self, document):
        """
        Returns an iterator over the normalized, lemmatized tokens of a
        document: the document is split into sentences, each sentence into
        word/punctuation tokens which are part of speech tagged, normalized
        according to ``lower``/``strip``, filtered against the stopword and
        punctuation sets, and finally lemmatized with WordNet.
        """
        stop, punct = self._resolve_filters()
        return self._tokenize(document, stop, punct)

    def _tokenize(self, document, stop, punct):
        """
        Generator behind tokenize()/transform() working on already resolved
        stopword and punctuation sets.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # Sequential on purpose: whitespace first, then '_',
                    # then '*' (not the same as stripping all at once).
                    token = token.strip().strip('_').strip('*')

                # Stopwords and punctuation-only tokens are skipped; a token
                # stripped down to '' also lands here since all() of an
                # empty string is True.
                if token in stop or all(char in punct for char in token):
                    continue

                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag (by its first
        letter, defaulting to noun), then uses that tag for the WordNet
        lemmatization of the token.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)




def show_most_informative_features(model, text=None, n=20):
    """
    Builds a two-column report of the n highest and n lowest weighted
    features of a fitted Pipeline.

    Parameters:
        model: fitted Pipeline with a step named 'vectorizer' (providing the
            feature names) and a step named 'classifier' (providing coef_).
        text: optional document. When given, the features are ranked by the
            weights of this document after it went through every step that
            precedes the classifier, and the predicted class is prepended
            to the report. Otherwise the first row of the classifier's
            coef_ is ranked.
        n: number of features shown per column.

    Returns:
        The report as a single newline-joined string; the left column holds
        the highest values, the right column the lowest.

    Raises:
        TypeError: if the classifier has no coef_ attribute (only linear
            models are supported).
    """
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Push the text through the steps in front of the classifier.
        # Pipeline.transform cannot be used here: it needs the final step to
        # implement transform(), which a classifier normally does not.
        tvec = [text]
        for _, step in model.steps:
            if step is classifier:
                break
            if step is None or step == 'passthrough':
                continue
            tvec = step.transform(tvec)
        # Vectorizers return sparse matrices; densify so rows can be indexed.
        if hasattr(tvec, 'toarray'):
            tvec = tvec.toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # get_feature_names() is gone from newer scikit-learn releases; prefer
    # its replacement and fall back for older ones.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names

    # Zip the feature names with the coefs and sort, highest first
    coefs = sorted(
        zip(tvec[0], names_getter()),
        key=itemgetter(0), reverse=True
    )

    # Pair the i-th highest with the i-th lowest entry
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(model.predict([text])))
        output.append("")

    # Two columns: most positive features (left), most negative (right).
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(cp, fnp, cn, fnn)
        )

    return "\n".join(output)




def clean_text(text):
    """
    Return ``text`` as a str with every HTML line break ``<br />`` replaced
    by a single space.

    Accepts either str or UTF-8 encoded bytes. Bytes are decoded first, so
    the replacement always operates on text (on Python 3 a str has no
    ``decode`` and bytes cannot be ``replace``d with str arguments).
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    return text.replace("<br />", " ")




def edgemap(input):
    """
    Binarise a valence score.

    ``input`` is converted with float(); values strictly greater than 0.5
    map to 1, everything else (including exactly 0.5) maps to 0. The label
    is returned as a one-element Series indexed by 'sentiment'.
    """
    is_positive = float(input) > 0.5
    return pd.Series({"sentiment": 1 if is_positive else 0})


def VADERizer(input):
    """
    Label a text as positive (1) or not (0) with the shared VADER analyzer.

    ``input`` is converted with str() and scored by ``vader``. The text
    counts as positive only when its 'pos' score exceeds its 'neg' score by
    more than 0.15. The label is returned as a one-element Series indexed
    by 'vader'.
    """
    score = vader.polarity_scores(str(input))

    # An earlier variant thresholded score['compound'] at 0.1 instead; the
    # pos/neg margin below is the rule in use.
    is_positive = score['pos'] - 0.15 > score['neg']

    return pd.Series(dict(vader=1 if is_positive else 0))



# Load the annotated comments; the file is decoded as Latin-1.
df = pd.read_csv('ST.csv', encoding='latin1')

# Drop the placeholder rows whose text is just "-".
df = df.loc[df['Text'] != "-"]

# Rescale (Positive - Negative) from the range -1..1 to 0..1.
df["valence"] = (df["Positive"] - df["Negative"] + 1) / 2.0





# print (df.shape) # (25000, 3)
# print (df["Text"][0])         # Check out the review
# print (df["valence"][0])          # Check out the sentiment (0/1)
#





from sklearn.metrics import confusion_matrix

# Gold label from the annotated valence, baseline label from VADER.
df['sentiment'] = df['valence'].apply(edgemap)
df['vadersenti'] = df['Text'].apply(VADERizer)

print(df)

# How well does the VADER baseline agree with the gold labels?
print("accuracyScore:", accuracy_score(df['sentiment'], df['vadersenti']))
print(classification_report(df['sentiment'], df['vadersenti']))
print(df[['sentiment', 'vadersenti']].corr(method='kendall'))
print(confusion_matrix(df['sentiment'], df['vadersenti']))





from sklearn.feature_extraction.text import TfidfTransformer

# Bag of unigrams and bigrams (vocabulary capped at 10000 entries) over
# negation-marked word tokens, classified by an SGDClassifier with its
# default settings.
experimenting_clf = Pipeline([


    ('vectorizer', CountVectorizer(analyzer="word",
                                   ngram_range=(1, 2),
                                   # Alternative without negation marking: swap the two tokenizer lines below.
                                   #tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   tokenizer=lambda text: mark_negation(word_tokenize(text)),
                                   # Disabled: would replace HTML line breaks before tokenizing.
                                   #preprocessor=lambda text: text.replace("<br />", " "),
                                   max_features=10000)),

    ('classifier', SGDClassifier())

])




# NLTK-preprocessed unigrams -> TF-IDF -> Extra-Trees classifier.
unigram_NLTK_clf = Pipeline([


    # Turns every document into a list of lemmatized tokens.
    ('preprocessor', NLTKPreprocessor()),

    # The documents arrive already tokenized, hence the pass-through
    # tokenizer and no preprocessing/lowercasing in the vectorizer itself.
    ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),


    # NOTE(review): TfidfVectorizer already emits TF-IDF weights, so this
    # step applies the IDF weighting a second time -- confirm it is intended.
    ('tfidf', TfidfTransformer()),



    ('classifier', ExtraTreesClassifier(bootstrap=False,criterion='gini',max_features=0.75,min_samples_split=13))

])


# Same unigram+bigram, negation-marked bag of words as experimenting_clf,
# classified by a small multi-layer perceptron.
NeuralNetwork_clf = Pipeline([


    ('vectorizer', CountVectorizer(analyzer="word",
                                   ngram_range=(1, 2),
                                   # Alternative without negation marking: swap the two tokenizer lines below.
                                   #tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   tokenizer=lambda text: mark_negation(word_tokenize(text)),
                                   # Disabled: would replace HTML line breaks before tokenizing.
                                   #preprocessor=lambda text: text.replace("<br />", " "),
                                   max_features=10000)),

    # tanh units, at most 100 iterations; early stopping monitors a
    # validation split of 5% of the training data. verbose makes the
    # classifier print its loss and validation score per iteration.
    ('classifier', MLPClassifier(learning_rate_init=0.01,
                    hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,
                    early_stopping=True, validation_fraction=0.05, alpha=1e-10)

     )

])


from sklearn.model_selection import train_test_split

# Features: the raw texts cast to unicode strings; target: the binary
# sentiment label derived from the valence.
X=df['Text'].values.astype('U')
y=df["sentiment"].values

# Hold out 10% of the data for testing; the fixed seed keeps the split stable.
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.1, random_state=0)
# Fit the Extra-Trees pipeline (prints the fitted pipeline), then print its
# accuracy on the held-out part.
print(unigram_NLTK_clf.fit(train_X, train_y))
print(unigram_NLTK_clf.score(test_X, test_y))



# ----------------NLTK unigram Extra-Trees ----------------------




from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the Extra-Trees pipeline on the full data set.
scores = cross_val_score(unigram_NLTK_clf, X, y, cv=10)

print("-----------------NLTK unigram Extra-Trees -------------------------")
# Per-fold scores followed by their mean.
print(scores)
print(np.mean(scores))






# Fit the SGD pipeline on the training split (prints the fitted pipeline)
# and print its accuracy on the held-out split.
print(experimenting_clf.fit(train_X, train_y))
print(experimenting_clf.score(test_X, test_y))






print("------------unigram-bigram-SGD  Classifier ----------------------------")
# 10-fold cross-validation on the full data set: per-fold scores, then mean.
scores = cross_val_score(experimenting_clf, X, y, cv=10)
print (scores)
print (np.mean(scores))






#------------------- NeuralNetwork Classifier ----------------------#

# Fit the neural-network pipeline on the training split (prints the fitted
# pipeline) and print its accuracy on the held-out split.
print(NeuralNetwork_clf.fit(train_X, train_y))
print(NeuralNetwork_clf.score(test_X, test_y))






print("---------- NeuralNetwork Classifier ---------- ")
# 10-fold cross-validation on the full data set: per-fold scores, then mean.
scores = cross_val_score(NeuralNetwork_clf, X, y, cv=10)
print (scores)
print (np.mean(scores))




import eli5

# Show the 10 highest-weighted features of each fitted pipeline.
# NOTE(review): the return value of the first call is not used; in a
# notebook presumably only the last expression of the cell is rendered, so
# the neural-network table would never appear -- confirm and display it
# explicitly if it is wanted.
eli5.show_weights(NeuralNetwork_clf, top=10)
eli5.show_weights(experimenting_clf, top=10)

     Positive  Negative                                               Text  \
0         0.0       0.7  That our role on the x will become defunct and...   
1         0.7       0.1  that the x unit will still have the positive a...   
4         0.0       0.6  Lack of respect for staff from the x. It is th...   
5         0.7       0.1  best x team I've seen in 30 years of working a...   
6         0.0       0.8  The xs are to disconnected from many aspects o...   
7         0.0       0.7  One of the speakers "picking" on a colleague (...   
9         0.0       0.3  That progressing the business overtakes being ...   
10        0.0       0.3  learning a not being particularly effective wh...   
12        0.0       0.6  I am concerned that putting the survey online ...   
14        0.6       0.6  That we will make experts in legacy systems re...   
15        0.5       0.5  My biggest concern in X is the escalation proc...   
17        0.0       0.5  That there may not be the promised foll



-----------------NLTK unigram Extra-Trees -------------------------
[ 0.75925926  0.85185185  0.90384615  0.86538462  0.82692308  0.82692308
  0.5         0.71153846  0.63461538  0.65384615]
0.753418803419
Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])
0.792452830189
------------unigram-bigram-SGD  Classifier ----------------------------
[ 0.72222222  0.75925926  0.86538462  0.90384615  0.96153846  0.84615385
  0.65384615  0.86538462  0.84615385  0.78846154]
0.821225071225
Iteration 1, loss = 0.61012122
Validation score: 0.541667
Iteration 2, loss = 0.27456970
Validation score: 0.541667
Iteration 3, loss = 0.15869743
Validati

ModuleNotFoundError: No module named 'eli5'

In [4]:
import eli5

# Re-run of the feature-weight display once eli5 was importable.
# NOTE(review): the captured output holds a single weights table, presumably
# that of the last call (experimenting_clf); the result of the first call is
# discarded -- confirm and display it explicitly if it is wanted.
eli5.show_weights(NeuralNetwork_clf, top=10)
eli5.show_weights(experimenting_clf, top=10)

Weight?,Feature
+29.815,their
+26.834,very
+26.834,great
+26.834,information
+23.852,about
… 2201 more positive …,… 2201 more positive …
… 2618 more negative …,… 2618 more negative …
-20.871,over
-23.852,like
-23.852,staff
