# Steps for sentiments

### Training
1. Messages are manually labelled as good or bad
2. Train the model to predict whether a message is good or bad


### Prod
1. Use naive Bayes to classify each message as good or bad
2. Use K-means to cluster the words
    1. Use the spiderweb method to determine the number of clusters
    2. Create a word cloud of the words provided

## Importing libraries

In [None]:
import pandas as pd
import numpy as np

Getting and
separating the messages for training

In [None]:
# Read the tab-separated ratings file into two columns ("rating", "message"),
# then shuffle every row with a fixed seed so the later positional split is
# reproducible, and renumber the index from zero.
messages = (
    pd.read_csv('ratingMessages', sep='\t', names=["rating", "message"])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)


Split the messages and ratings into training, validation and testing sets

In [None]:
# Pull the two columns out as plain Python lists (same row order in both).
msgs = list(messages["message"])
lbls = list(messages["rating"])

# Positional split of the shuffled rows:
#   first 2500 -> training, next 1000 -> validation, remainder -> testing.
trainingMsgs, valMsgs, testingMsgs = msgs[:2500], msgs[2500:3500], msgs[3500:]
trainingLbls, valLbls, testingLbls = lbls[:2500], lbls[2500:3500], lbls[3500:]

Preparing for training

In [None]:
class NaiveBayesForGoodRating:
    """Two-class naive Bayes classifier that labels a message 'bad' or 'good'.

    In every probability array, index 0 refers to the 'bad' class and
    index 1 to the 'good' class.

    Attributes set by ``train``:
        words: set of lower-cased, whitespace-separated tokens seen in training.
        priors: array of shape (2,) with the class prior probabilities.
        likelihoods: array with one row per class and one column per word,
            in the iteration order of ``words``.

    NOTE: a word counts as "present" when it occurs as a *substring* of the
    lower-cased message (``w in message``), not as a whole token.
    """

    def train(self, badMessages, goodMessages):
        """Estimate class priors and per-word likelihoods.

        Args:
            badMessages: list of message strings labelled 'bad'.
            goodMessages: list of message strings labelled 'good'.

        Raises:
            ZeroDivisionError: if both lists are empty, or if one list is
                empty while the vocabulary is not.
        """
        # Lower-case the training text so the vocabulary can match the
        # lower-cased messages that predict() compares against.
        bad = [m.lower() for m in badMessages]
        good = [m.lower() for m in goodMessages]

        # Column i of self.likelihoods belongs to the i-th word yielded when
        # iterating this set, so the set must not be mutated after training.
        self.words = set(' '.join(bad + good).split())

        self.priors = np.zeros(2)
        self.priors[0] = float(len(bad)) / (len(bad) + len(good))
        self.priors[1] = 1.0 - self.priors[0]

        likelihoods = []
        for w in self.words:
            # Add-one smoothed fraction of messages containing the word.
            prob_bad = (1.0 + sum(w in m for m in bad)) / len(bad)
            prob_good = (1.0 + sum(w in m for m in good)) / len(good)
            # Cap at 0.95 so (1 - likelihood) can never reach zero.
            likelihoods.append([min(prob_bad, 0.95), min(prob_good, 0.95)])
        self.likelihoods = np.array(likelihoods).T

    def _posteriors(self, message):
        """Return the class posteriors [P(bad), P(good)] for one message."""
        text = message.lower()  # hoisted: lower-case once, not once per word
        posteriors = np.copy(self.priors)
        for i, w in enumerate(self.words):
            if w in text:
                posteriors *= self.likelihoods[:, i]
            else:
                posteriors *= 1.0 - self.likelihoods[:, i]
            # Rescale so the two entries sum to one.  This keeps the running
            # product from underflowing and makes the 0.5 threshold used by
            # predict() meaningful (an L2 norm would not).
            posteriors /= posteriors.sum()
        return posteriors

    def predict(self, message):
        """Classify one message.

        Args:
            message: the message string to classify.

        Returns:
            A two-element list ``[label, probability]`` where ``label`` is
            'bad' or 'good' and ``probability`` is that class's posterior.
        """
        posteriors = self._posteriors(message)
        if posteriors[0] > 0.5:
            return ['bad', posteriors[0]]
        return ['good', posteriors[1]]

    def score(self, messages, labels):
        """Evaluate the classifier on labelled messages.

        Args:
            messages: iterable of message strings.
            labels: iterable of matching labels; entries other than 'bad'
                or 'good' are not counted.

        Returns:
            A tuple ``(accuracy, confusion)``.  ``confusion`` is a 2x2 array
            indexed ``[predicted, actual]`` with 0 = 'bad' and 1 = 'good'.
            ``accuracy`` is 0.0 when no message was counted.
        """
        index = {'bad': 0, 'good': 1}
        confusion = np.zeros((2, 2))
        for m, l in zip(messages, labels):
            predicted = index[self.predict(m)[0]]  # predict once per message
            actual = index.get(l)
            if actual is None:
                continue
            confusion[predicted, actual] += 1
        total = confusion.sum()
        if total == 0:
            return 0.0, confusion
        return (confusion[0, 0] + confusion[1, 1]) / float(total), confusion

    def predict_prod(self, messages):
        """Return the predicted label ('bad' or 'good') for each message.

        Args:
            messages: iterable of message strings.

        Returns:
            A list of labels in the same order as ``messages``.
        """
        return [self.predict(message)[0] for message in messages]

In [None]:
# Partition the training messages by label: a message goes into a list when
# that list's label text occurs inside the message's label.
badmsg = [
    text
    for text, label in zip(trainingMsgs, trainingLbls)
    if 'bad' in label
]
goodmsg = [
    text
    for text, label in zip(trainingMsgs, trainingLbls)
    if 'good' in label
]

In [None]:
# Build the classifier and fit it on the two labelled training groups.
clf = NaiveBayesForGoodRating()
clf.train(badMessages=badmsg, goodMessages=goodmsg)

Testing of model

In [None]:
# Evaluate on the validation split: accuracy plus the 2x2 confusion matrix.
score, confusion = clf.score(messages=valMsgs, labels=valLbls)

In [None]:
# Report the validation results computed by clf.score: the accuracy, then the
# confusion matrix (rows = predicted class, columns = actual class; index 0 is
# 'bad', index 1 is 'good').
print("The overall performance is:", score)
print("The confusion matrix is:\n", confusion)

# Prod

### Group messages using model