In [None]:
import time, itertools, re, math, operator, os
from collections import Counter

class part5:
    """Sentiment tagger for tokenised tweets: tag lexicon plus naive Bayes.

    Training (``__init__``) builds three tables from the training tweets:

    * ``word_tag_count``   -- processed word / entity phrase / bigram ->
      ``{"negative", "neutral", "positive", "O"}`` counts,
    * ``word_class_count`` -- raw word -> counts per tweet-level sentiment,
    * ``class_count``      -- number of words seen per tweet-level sentiment.

    ``naive_bayes`` then predicts one tag per word of each test tweet.

    The methods that normalise words or look at negations rely on three
    module-level names defined outside this class: ``stop_list``,
    ``negate_list`` and ``is_number``.
    """

    tags = ["negative","neutral","positive","O"]

    # Tweet-level classes, in the order used to break ties when counting.
    _SENTIMENTS = ("negative", "neutral", "positive")
    # Every placeholder emitted by get_processed_tweets starts with this
    # prefix; placeholders are always tagged "O".
    _PLACEHOLDER_PREFIX = "*!__"
    # NOTE: unlike the other placeholders this one has no trailing "*". It is
    # only ever matched by prefix or via this constant, so the original
    # spelling is kept unchanged.
    _NEGATE_TOKEN = "*!__NEGATIVE__!"
    # Three or more repetitions of the same character.
    _REPEAT_RE = re.compile(r'(.)\1{2,}')

    def __init__(self,train_words,train_tags):
        """Build all count tables from the training data.

        train_words: list of tweets, each a list of word strings.
        train_tags:  list of tag lists parallel to ``train_words``; each tag
                     is "O" or a two-character prefix ("B-"/"I-") followed by
                     a sentiment name.
        """
        self.long_train_words = [word for tweet in train_words for word in tweet]
        # Set copy of the training vocabulary so word_stem lookups are O(1).
        self._vocab = set(self.long_train_words)
        new_tags = self.convert_tag(train_tags)
        processed_train = self.get_processed_tweets(train_words)
        self.word_tag_count = self.get_word_tag_count(processed_train,new_tags)
        self.generate_entities(train_words,train_tags)
        tweet_sentiment = self.get_tweet_sentiment(new_tags)
        self.generate_bigrams(processed_train,tweet_sentiment)
        self.generate_word_class_count(train_words,tweet_sentiment)
        self.reduce_dict()

    @staticmethod
    def _empty_counts():
        """Return a fresh zeroed count dict (key order matters for ties)."""
        return {"negative":0, "neutral":0, "positive":0, "O":0}

    def _is_placeholder(self,word):
        """Return True if ``word`` is one of the ``*!__...`` placeholders."""
        return word.startswith(self._PLACEHOLDER_PREFIX)

    def _vocabulary(self):
        """Return the training vocabulary as a set, building it on demand."""
        vocab = getattr(self, "_vocab", None)
        if vocab is None:
            vocab = self._vocab = set(self.long_train_words)
        return vocab

    def convert_tag(self,train_tags):
        """Strip the "B-"/"I-" prefix from every tag, leaving "O" untouched.

        Returns a new list of lists with the same shape as ``train_tags``.
        """
        return [
            [tag if tag == "O" else tag[2:] for tag in tweet]
            for tweet in train_tags
        ]

    def remove_repeat(self,word):
        """Collapse runs of 3+ identical characters to 2 (words longer than 3)."""
        if len(word) > 3:
            return self._REPEAT_RE.sub(r'\1\1', word)
        return word

    def word_stem(self,word):
        """Strip a trailing "ing", "ed" or "s" when the stem is a training word.

        Only words longer than four characters are considered; anything else
        is returned unchanged.
        """
        if len(word) <= 4:
            return word
        vocab = self._vocabulary()
        for suffix in ("ing", "ed", "s"):
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if stem in vocab:
                    return stem
        return word

    def _normalize_word(self,word):
        """Map one raw word to its processed form or to a placeholder token."""
        if word in stop_list:
            return "*!__STOP__!*"
        if word in negate_list:
            return self._NEGATE_TOKEN
        if word == "rt":
            return "*!__RT__!*"
        # startswith() is safe on an empty token, unlike word[0].
        if word.startswith(("#", "@")) and len(word) > 1:
            return self.word_stem(self.remove_repeat(word[1:]))
        if word.startswith(("http://", "https://")):
            return "*!__URL__!*"
        if not word.isalnum():
            return "*!__SPECIAL__!*"
        if is_number(word):
            return "*!__NUM__!*"
        return self.word_stem(self.remove_repeat(word))

    def get_processed_tweets(self,train_words):
        """Normalise every word of every tweet.

        Stop words, negations, "rt", URLs, non-alphanumeric tokens and numbers
        become placeholder tokens; hashtags/mentions lose their leading
        character; remaining words are de-repeated and stemmed.
        Returns a list of lists with the same shape as ``train_words``.
        """
        return [
            [self._normalize_word(word) for word in tweet]
            for tweet in train_words
        ]

    def _record_entity(self,phrase_list,phrase_tag):
        """Count a finished multi-word entity phrase in ``word_tag_count``."""
        if len(phrase_list) > 1:
            phrase = " ".join(phrase_list)
            if phrase not in self.word_tag_count:
                self.word_tag_count[phrase] = self._empty_counts()
            self.word_tag_count[phrase][phrase_tag] += 1

    def generate_entities(self,train_words,train_tags):
        """Add every multi-word tagged entity to ``word_tag_count``.

        An entity starts at a "B-" tag and extends over the following "I-"
        tags. It is closed by an "O" tag, by the next "B-" tag, or by the end
        of the tweet. Single-word entities are not recorded here.
        """
        for words, tags in zip(train_words, train_tags):
            phrase_list = []
            phrase_tag = ""
            for word, tag in zip(words, tags):
                if tag.startswith("B"):
                    # A new entity begins: close whatever was open first so
                    # a previous single-word entity cannot leak into it.
                    self._record_entity(phrase_list, phrase_tag)
                    phrase_list = [word]
                    phrase_tag = tag[2:]
                elif tag.startswith("I"):
                    if not phrase_list:
                        # "I-" without a preceding "B-": take the sentiment
                        # from this tag instead of failing on "".
                        phrase_tag = tag[2:]
                    phrase_list.append(word)
                else:
                    self._record_entity(phrase_list, phrase_tag)
                    phrase_list = []
                    phrase_tag = ""
            # An entity may run up to the last token of the tweet.
            self._record_entity(phrase_list, phrase_tag)

    def get_tweet_sentiment(self,new_tags):
        """Return the dominant sentiment of each tweet.

        ``new_tags`` holds prefix-free tags (see ``convert_tag``). "O" tags
        are ignored; ties go to the first of negative, neutral, positive.
        A tweet without any sentiment tag is labelled "O".
        """
        tweet_sentiment = []
        for tweet in new_tags:
            count_tag = self._empty_counts()
            for tag in tweet:
                if tag != "O":
                    count_tag[tag] += 1
            # max() returns the first maximal item, which fixes the tie order.
            best = max(self._SENTIMENTS, key=count_tag.get)
            tweet_sentiment.append(best if count_tag[best] > 0 else "O")
        return tweet_sentiment

    def generate_bigrams(self,processed_words,tweet_sentiment):
        """Learn from adjacent word pairs of the processed training tweets.

        * A word directly following a negation gets one count for the
          opposite of the tweet's sentiment (positive <-> negative).
        * Bigrams of two non-placeholder words are counted per tweet
          sentiment; those seen more than once are added to
          ``word_tag_count`` unless the phrase is already present.
        """
        check_worthy = {}
        for words, sentiment in zip(processed_words, tweet_sentiment):
            for current_word, next_word in zip(words, words[1:]):
                # Negations have normally been replaced by the placeholder
                # during processing, so test for it as well as the raw list.
                if current_word == self._NEGATE_TOKEN or current_word in negate_list:
                    if sentiment == "positive":
                        self.word_tag_count.setdefault(next_word, self._empty_counts())["negative"] += 1
                    elif sentiment == "negative":
                        self.word_tag_count.setdefault(next_word, self._empty_counts())["positive"] += 1

                if not self._is_placeholder(current_word) and not self._is_placeholder(next_word):
                    phrase = current_word+" "+next_word
                    if phrase not in check_worthy:
                        check_worthy[phrase] = self._empty_counts()
                    check_worthy[phrase][sentiment] += 1

        for phrase, counts in check_worthy.items():
            if sum(counts.values()) > 1 and phrase not in self.word_tag_count:
                self.word_tag_count[phrase] = counts

    def get_word_tag_count(self,processed_words,new_tags):
        """Count, for every processed word, how often it carries each tag.

        Returns ``{word: {"negative": n, "neutral": n, "positive": n, "O": n}}``.
        """
        word_tag_count = {}
        for words, tags in zip(processed_words, new_tags):
            for word, tag in zip(words, tags):
                if word not in word_tag_count:
                    word_tag_count[word] = self._empty_counts()
                word_tag_count[word][tag] += 1
        return word_tag_count

    def generate_word_class_count(self,train_words,tweet_sentiment):
        """Fill ``class_count`` and ``word_class_count`` from raw tweets.

        Every word occurrence is attributed to the sentiment of the tweet it
        appears in.
        """
        self.class_count = self._empty_counts()
        self.word_class_count = {}
        for tweet, sentiment in zip(train_words, tweet_sentiment):
            occurrences = Counter(tweet)
            self.class_count[sentiment] += sum(occurrences.values())
            for word, times in occurrences.items():
                if word not in self.word_class_count:
                    self.word_class_count[word] = self._empty_counts()
                self.word_class_count[word][sentiment] += times

    def reduce_dict(self, min_count=3):
        """Drop words seen fewer than ``min_count`` times from ``word_class_count``."""
        rare = [
            word for word, counts in self.word_class_count.items()
            if sum(counts.values()) < min_count
        ]
        for word in rare:
            del self.word_class_count[word]

    def score(self,sentiment,tweet):
        """Return the naive Bayes log score of ``tweet`` for ``sentiment``.

        score = log P(sentiment) + sum over words of
                log(count(word, sentiment) / count(sentiment)),
        where a zero or unknown word count is treated as 1. Returns ``-inf``
        when no training word belongs to ``sentiment``.
        """
        class_total = self.class_count[sentiment]
        if class_total <= 0:
            # log(0) is undefined; an unseen class can never be the best one.
            return float("-inf")
        sentiment_count = math.log(class_total)
        prob_sentiment = sentiment_count - math.log(sum(self.class_count.values()))
        product = 0
        for word in tweet:
            counts = self.word_class_count.get(word)
            word_occurence = counts[sentiment] if counts is not None else 0
            # Clamp to 1 so unseen combinations contribute log(1) = 0.
            product += math.log(max(word_occurence, 1)) - sentiment_count
        return prob_sentiment+product

    def _predict_sentiment(self,tweet):
        """Return the sentiment with the highest log score for a raw tweet."""
        # Scores are log-probabilities (<= 0), so they must be compared
        # directly; normalising by their sum would invert the ranking.
        return max((self.score(sentiment, tweet), sentiment) for sentiment in self._SENTIMENTS)[1]

    def _predict_tag(self,word,fallback_sentiment):
        """Return the tag for one processed word.

        Placeholders are "O"; known words get their most frequent training
        label; unknown words inherit the tweet-level sentiment.
        """
        if self._is_placeholder(word):
            return "O"
        if word in self.word_tag_count:
            maxim_tag = max(self.word_tag_count[word].items(), key=operator.itemgetter(1))[0]
            # "O" is a complete tag on its own; only sentiments take "B-".
            return "O" if maxim_tag == "O" else "B-"+maxim_tag
        return "B-"+fallback_sentiment

    def naive_bayes(self,test_data):
        """Predict one tag per word for every tweet in ``test_data``.

        test_data: list of tweets, each a list of raw word strings.
        Returns a list of tag lists parallel to ``test_data``; each tag is
        "O" or "B-" followed by a sentiment name.
        """
        total_sentiments = [self._predict_sentiment(tweet) for tweet in test_data]
        processed_test = self.get_processed_tweets(test_data)
        p_tags = []
        for tweet, tweet_sent in zip(processed_test, total_sentiments):
            p_tags.append([self._predict_tag(word, tweet_sent) for word in tweet])
        return p_tags
                    
                
            
        
            
        
        
    
    