In [16]:
from typing import Dict, List, Tuple
import pandas as pd
import operator

def __load_training_data(train_data_file: str) -> Tuple[List[str], List[List[str]]]:
    """Load labelled sentences from a CSV file and group them by label.

    The file is read without a header row; its two columns are named
    ``sentence`` and ``label``, in that order.

    Args:
        train_data_file: Path of the CSV file to read.

    Returns:
        A ``(labels, sentence_sets)`` pair of equal length, where
        ``sentence_sets[i]`` is the list of sentences whose label is
        ``labels[i]``. Labels follow the group order produced by
        ``DataFrame.groupby`` (sorted by default).
    """
    # Read the file the caller asked for; a hard-coded path used to be read
    # here, which silently ignored the ``train_data_file`` argument.
    train_df = pd.read_csv(train_data_file, header=None)
    train_df.columns = ['sentence', 'label']

    # One row per label, carrying the list of all sentences with that label.
    grouped = train_df.groupby('label')['sentence'].apply(list).reset_index(name='sentences')
    return grouped['label'].to_list(), grouped['sentences'].to_list()

def __word_count(sentence: str) -> Dict[str, int]:
    """Count how many times each whitespace-separated word occurs.

    No case folding or punctuation stripping is done here; words are compared
    exactly as ``str.split()`` yields them.

    Args:
        sentence: Text to tokenise on whitespace.

    Returns:
        A dict mapping each distinct word to its number of occurrences, in
        order of first appearance. Empty or whitespace-only input gives ``{}``.
    """
    counts: Dict[str, int] = {}
    for word in sentence.split():
        counts[word] = counts.get(word, 0) + 1
    return counts

def build_simple_words_model(train_data_file: str) -> List[Dict[str,Dict[str, int]]]:
    """Build the per-label word-frequency model from a training file.

    All sentences sharing a label are joined with spaces, lower-cased and
    run through the word counter.

    Args:
        train_data_file: Path handed to the training-data loader.

    Returns:
        A list with one single-entry dict per label, each of the form
        ``{label: {word: count}}``.
    """
    labels, sentence_sets = __load_training_data(train_data_file = train_data_file)

    model = []
    for label, sentences in zip(labels, sentence_sets):
        # Treat every sentence of this label as one lower-cased document.
        document = ' '.join(sentences).lower()
        model.append({label: __word_count(document)})
    return model
    
def __generate_sentence_weights(sentence: str, simple_word_model: List[Dict[str,Dict[str, int]]]) -> Dict[str, Dict[str, int]]:
    """Look up, for every word of a sentence, its count under each label.

    The sentence is lower-cased and split on whitespace. Because the result
    is keyed by word, a word repeated in the sentence appears only once.

    Args:
        sentence: Text to score.
        simple_word_model: List of single-entry dicts ``{label: {word: count}}``.
            Only the first key/value pair of each entry is used.

    Returns:
        ``{word: {label: count}}`` where ``count`` is 0 when the word does not
        occur in that label's counts. An empty sentence gives ``{}``.
    """
    # Unpack each model entry once instead of once per word.
    label_counts = [list(entry.items())[0] for entry in simple_word_model]

    weights = dict()
    for word in sentence.lower().split():
        # Direct dict lookup with a 0 default; avoids building a key list and
        # scanning it linearly for every word/label pair.
        weights[word] = {label: counts.get(word, 0) for label, counts in label_counts}
    return weights


def predict_label_new_sentence(sentence: str, simple_word_model: List[Dict[str,Dict[str, int]]]) -> Tuple[str, float]:
    """Predict the most likely label for a sentence and report the confidence.

    Each word contributes, to every label, the fraction of that word's total
    training count that falls under the label. The per-label sums are then
    normalised into percentages. Intermediate results are printed.

    Args:
        sentence: Text to classify.
        simple_word_model: List of single-entry dicts ``{label: {word: count}}``.

    Returns:
        ``(predicted_label, prediction_confidence)`` where the confidence is a
        percentage rounded to two decimals. If no word of the sentence has a
        non-zero count under any label (including an empty sentence), every
        confidence is ``0.0`` and the first label of the model is returned.

    Raises:
        ValueError: If ``simple_word_model`` contains no labels.
    """
    labels = [list(label_weights.keys())[0] for label_weights in simple_word_model]
    if not labels:
        raise ValueError("simple_word_model must contain at least one label")

    sentence_weights = __generate_sentence_weights(sentence, simple_word_model)

    print("Weights distribution for each word:" , sentence_weights)

    label_scores = dict()
    for label in labels:
        label_weight = 0.0
        for word_weights in sentence_weights.values():
            word_total = sum(word_weights.values())
            # A word with zero total count carries no evidence; skipping it
            # also avoids dividing by zero.
            if word_total:
                label_weight += word_weights[label] / word_total
        label_scores[label] = label_weight

    print("Label scores: ", label_scores)

    score_total = sum(label_scores.values())
    if score_total:
        label_confidences = {
            label: round(label_scores[label] * 100 / score_total, 2)
            for label in labels
        }
    else:
        # No evidence for any label: report zero confidence across the board
        # instead of raising ZeroDivisionError.
        label_confidences = {label: 0.0 for label in labels}

    print("Label confidences (%): ", label_confidences)

    # On ties max() keeps the first label in model order.
    predicted_label = max(label_confidences, key=label_confidences.get)

    prediction_confidence = label_confidences[predicted_label]

    print("For the [sentence: ", sentence, "] the predicted label = [", predicted_label, "] with a prediction confidence = [" , prediction_confidence, "%]")

    return predicted_label, prediction_confidence

In [17]:
# Training: build the per-label word-count model from the labelled sentences CSV.
simple_word_model = build_simple_words_model(train_data_file = 'training_data/sentences.csv')

# Testing: classify one sample sentence. The call also prints the per-word
# weights, the label scores and the label confidences shown in the cell output.
sentence = 'I am good'
predicted_label, prediction_confidence = predict_label_new_sentence(sentence, simple_word_model)


Weights distribution for each word: {'i': {'negative': 3, 'positive': 6}, 'am': {'negative': 0, 'positive': 2}, 'good': {'negative': 0, 'positive': 3}}
Label scores:  {'negative': 0.3333333333333333, 'positive': 2.6666666666666665}
Label confidences (%):  {'negative': 11.11, 'positive': 88.89}
For the [sentence:  I am good ] the predicted label = [ positive ] with a prediction confidence = [ 88.89 %]
