In [70]:
import pandas as pd
from operator import itemgetter
import string
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score,classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import bigrams
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler


In [71]:
import re
def preprocess(text):
    # convert to ASCII

    # if the input is HTML, force-add full stops after these tags
    fullStopTags = ['li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd']
    for tag in fullStopTags:
        text = re.sub(r'</'+tag+'>', '.', text)
    text = re.sub(r'<[^>]+>', '', text)                  # strip out HTML
    text = re.sub(r'[,:;()\-]', ' ', text)               # replace commas, hyphens etc (count as spaces)
    text = re.sub(r'[\.!?]', '.', text)                  # unify terminators
    text = re.sub(r'^\s+', '', text)                     # strip leading whitespace
    text = re.sub(r'[ ]*(\n|\r\n|\r)[ ]*', ' ', text)    # replace new lines with spaces
    text = re.sub(r'([\.])[\. ]+', '.', text)            # check for duplicated terminators
    text = re.sub(r'[ ]*([\.])', '. ', text)             # pad sentence terminators
    text = re.sub(r'\s+', ' ', text)                     # remove multiple spaces
    text = re.sub(r'\s+$', '', text);                    # strip trailing whitespace
    return text



In [97]:
def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transforms input data by using NLTK tokenization, lemmatization, and
    other normalization and filtering techniques.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which make load corpora, models, or do
        other time-intenstive NLTK data loading.
        """
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(stopwords) if stopwords else set(sw.words('english'))
        self.punct      = set(punct) if punct else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Fit simply returns self, no other information is needed.
        """
        return self

    def inverse_transform(self, X):
        """
        No inverse transformation
        """
        return X

    def transform(self, X):
        """
        Actually runs the preprocessing on each document.
        """
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)



In [98]:



def clean_text(text):
    text = text.replace("<br />", " ")
    text = text.decode("utf-8")

    return text




def edgemap(input):

    if float(input) > 0.5:
        sentiment = 1
    else:
        sentiment = 0

    return pd.Series(dict(sentiment=sentiment))


def VADERizer(input):
    score = vader.polarity_scores(str(input))
    compscore = score['compound']


    #return pd.Series(dict(vader=1)) if compscore > 0.1 else pd.Series(dict(vader=0))
    return pd.Series(dict(vader=1)) if (score['pos'] - 0.15 > score['neg']) else pd.Series(dict(vader=0))



In [74]:
NeuralNetwork_clf = Pipeline([


    ('vectorizer', CountVectorizer(analyzer="word",
                                   ngram_range=(1, 3),
                                   tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   #tokenizer=lambda text: mark_negation(word_tokenize(text)),
                                   #preprocessor=lambda text: text.replace("<br />", " "),
                                   max_features=25000)),

    ('classifier', MLPClassifier(learning_rate_init=0.01,
                    hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,
                    early_stopping=True, validation_fraction=0.05, alpha=1e-10)

     )

])

In [75]:
df1 = pd.read_csv('ST.csv',encoding = 'latin1')
df1 = df1.dropna()
df2 = pd.read_csv('Apr17stafftalks.csv',encoding = 'latin1')
df2 = df2[df2["Text"] != '']
df2 = df2.dropna()
df3 = pd.read_csv('July17stafftalks.csv',encoding = 'latin1')
df = pd.concat([df1, df2,df3], ignore_index=True)
print(df.shape)
#delete empty "-" lines
df = df[df['Text'] != "-"]
df = df.dropna()
# map from -1 to 1  -> 0 to 1
df["valence"] = ((df["Positive"] - df["Negative"])+1)/2.0

df["sentiment"] = df["valence"].apply(lambda x:edgemap(x))

df=df.drop(['Positive','Negative','valence'], axis=1)
df['flag'] = 0
df.tail(12)

(1754, 3)


Unnamed: 0,Text,sentiment,flag
1723,Thank you for being open and for taking the ti...,1,0
1726,Really useful to hear everyone's questions. Ho...,1,0
1729,Lots of transformations all over the place. I ...,0,0
1730,Exciting times ahead,1,0
1733,I mentioned to X that it would be if we could ...,0,0
1736,Thanks to all for an interesting session,1,0
1737,Keep these staff talks going,1,0
1741,Presentations and timings all good.,1,0
1745,An inspiring talk from all speakers.,1,0
1749,All four were great speakers - it was great to...,1,0


In [80]:
ldf1 = pd.read_csv('labeledTrainData.tsv',encoding = 'latin1',sep="\t")

In [81]:
ldf1=ldf1.drop('id', axis=1)
list(ldf1)
ldf1.columns=['sentiment','Text']
ldf1['flag']=1
ldf1.tail(12)

Unnamed: 0,sentiment,Text,flag
24988,1,While originally reluctant to jump on the band...,1
24989,1,I heard about this movie when watching VH1's \...,1
24990,1,I've never been huge on IMAX films. They're co...,1
24991,0,Steve McQueen has certainly a lot of loyal fan...,1
24992,0,Sometimes you wonder how some people get fundi...,1
24993,0,"I am a student of film, and have been for seve...",1
24994,0,"Unimaginably stupid, redundant and humiliating...",1
24995,0,It seems like more consideration has gone into...,1
24996,0,I don't believe they made this film. Completel...,1
24997,0,"Guy is a loser. Can't get girls, needs to buil...",1


In [83]:
fdf = pd.concat([df, ldf1], ignore_index=True)

In [84]:
fdf.tail()

Unnamed: 0,Text,flag,sentiment
26256,It seems like more consideration has gone into...,1,0
26257,I don't believe they made this film. Completel...,1,0
26258,"Guy is a loser. Can't get girls, needs to buil...",1,0
26259,This 30 minute documentary BuÃ±uel made in the...,1,0
26260,I saw this movie as a child and it broke my he...,1,1


In [93]:
from sklearn.model_selection import train_test_split


#X=fdf['Text'].values.astype('U')
#y=fdf["sentiment"].values

#train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.1, random_state=0)


train_X = fdf['Text'].values[500:]
test_X = fdf['Text'].values[:500]


train_y = fdf['sentiment'].values[500:]
test_y = fdf['sentiment'].values[:500]


from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [92]:
print("---------- NeuralNetwork Classifier ---------- ")
print(NeuralNetwork_clf.fit(train_X, train_y))
print(NeuralNetwork_clf.score(test_X, test_y))

scores = cross_val_score(NeuralNetwork_clf, X, y, cv=10)
print (scores)
print (np.mean(scores))


---------- NeuralNetwork Classifier ---------- 
Iteration 1, loss = 0.41797730
Validation score: 0.885758
Iteration 2, loss = 0.19172023
Validation score: 0.890327
Iteration 3, loss = 0.13063656
Validation score: 0.889566
Iteration 4, loss = 0.10091339
Validation score: 0.881950
Iteration 5, loss = 0.08508719
Validation score: 0.898705
Iteration 6, loss = 0.06193375
Validation score: 0.889566
Iteration 7, loss = 0.05354168
Validation score: 0.885758
Iteration 8, loss = 0.04259026
Validation score: 0.890327
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=25000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
      ...e=True, solver='adam', tol=0.0001, validation_fraction=0.05,
   

In [43]:
# 0.84 NOT BAD! LETS PICKLE THIS CLASSIFIER
from sklearn.externals import joblib
joblib.dump(NeuralNetwork_clf, 'StaffTalksCLF.pkl') 

['StaffTalksCLF.pkl']

In [109]:



vader = SentimentIntensityAnalyzer()

hmd=pd.read_csv('sdemo.csv',encoding = 'latin1')



Y_pred = NeuralNetwork_clf.predict(hmd.Text.values)

Y_pred_prob = NeuralNetwork_clf.predict_proba(hmd.Text.values)[:,1]
Pred_output_dataframe = pd.DataFrame({'Text':hmd.Text.values,'Y_pred':Y_pred,'Y_pred_prob':Y_pred_prob})

Pred_output_dataframe['Lexicon'] = Pred_output_dataframe.Text.apply(lambda x:VADERizer(x))

In [110]:
Pred_output_dataframe

Unnamed: 0,Text,Y_pred,Y_pred_prob,Lexicon
0,i dont think i like this stuff,0,0.476629,0
1,more change is needed,0,0.04546,0
2,under this law you are terminated,0,0.135778,0
3,This is great,1,0.988882,1
4,I like the fact that I learn a lot,1,0.78266,1
5,The ONS is the best,1,0.99522,1
