# Polarity Ranking Algorithm

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm 

import re
import nltk
import string
from nltk.corpus import stopwords

import gensim.downloader as api

from snorkel.labeling import labeling_function

## Load Dataset

The Pandas DataFrame will consist of the following fields: 
* **Source**
* **Title** 
* **Content**
* **Bias**

## Labeling Functions

In [2]:
# Define the label mappings for convenience
ABSTAIN = -1
LEFT = 0
CENTER = 1
RIGHT = 2

In [3]:
@labeling_function()
def lf_source_eval(x): 
    if x.bias == 'hyper_left':
        return LEFT 
    
@labeling_function()
def lf_contains_left_keywords(x): 
    # todo: if the article contains a substantial amount
    # of these phrases, then classify as left, else, center
    return ABSTAIN

@labeling_function()
def lf_contains_right_keywords(x): 
    # todo: if the article contains a substantial amount
    # of these phrases, then classify as left
    return ABSTAIN

## Machine Learning Model

This deep learning model accepts as input a tokenized corpus of text (news article) and learns to output a polarity ranking on a scale from 0 to 1.

The steps for the deep learning model are outlined below: 
* Have pandas df with text, and label
* Preprocess raw article text and tokenize. 
* Embedding it using Word2Vec method 
* Use gensim word encoder and average all the vectors from the article.
* Train a logistic regression model with all features

Initial Task: Binary Classifcation (Left vs. Right)

### Data Preprocessing
* May want to experiment with: https://github.com/UKPLab/sentence-transformers for sentence embedding instead of word by word embedding
* Also could try Big Bird Transformer if we have time. 

In [4]:
data_path = ''

In [5]:
def get_data(): 
    data = pd.read_csv(data_path)
    
    content = data['content']
    labels = data['bias']
    return x, y

In [18]:
## Output text and labels 
def load_wv():
    # this line of code needs to run asynchronously first
    wv = api.load('word2vec-google-news-300')
    return wv

def clean_text(x): 
    '''
    This function takes in raw text as a string, and cleans it
    by removing extraneous text. 
    '''
#     nltk.download()
#     stop_words = stopwords.words("english")
    x = x.lower()
#     x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

def preprocess_text(text): 
    X = []
    found_words = []

    text = clean_text(text)
    words = text.split()
    for word in words: 
        try:
            # creates 300 vectors per word
            found_words.append(wv[word])
#             print('here')
        except: 
            continue
      
    embedding = np.asarray(found_words)
    mean = np.mean(embedding, axis=0)
    mean = mean.tolist()

    if type(mean) == list: 
        X.append(mean)

    X = np.array(X)
    return X

def preprocess_labels(labels): 
    '''
    This function converts text labels to numerical labels 
    '''
    return None

In [14]:
## todo: for gautham, load this on init
# wv = load_wv()

In [15]:
text = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."

In [19]:
xx = preprocess_text(text)

In [21]:
print(xx.shape)

(1, 300)


In [10]:
def create_dataset(X, y):
    X_dset = []
    y_dset = []
    # for each transcript and label
    for text, lbl in tqdm(zip(X, y)):
        X_processed = preprocess_text(text)
        X_processed = X_processed.tolist()
        
        if type(mean) == list: 
            X_dset.append(X_processed)
            y_dset.append(lbl)
    
    X_dset = np.asarray(X_dset)
    y_dset = np.array(y_dset)
    return X_dset, y_dset

In [11]:
X_dset, y_dset = create_dataset(x, y)

NameError: name 'x' is not defined

In [None]:
print('X_dset shape: ', X_dset.shape)
print('y_dset shape: ', y_dset.shape)

### Train Model 
* Train model in Keras 

In [35]:
def split_dataset(X_dset, y_dset, val_size=0.15, test_size=0.15): 
    X_train, X_test, y_train, y_test = train_test_split(
        X_dset, y_dset, test_size=test_size, random_state=42
    )
    
    print('X_train shape: ', X_train.shape)
    print('y_train shape: ', y_train.shape)
    
    print('X_test shape: ', X_test.shape)
    print('y_test shape: ', y_test.shape)
    return X_train, X_test, y_train, y_test

def create_dsets(X_train, X_test, y_train, y_test, batch_size=16): 
    train_dset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
    test_dset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
    return train_dset, test_dset

In [None]:
X_train, X_test, y_train, y_test = split_dataset(X_dset, y_dset)
train_dset, test_dset = create_dsets(X_train, X_test, y_train, y_test)

In [34]:
HIDDEN_UNITS = 128
EPOCHS = 25

In [36]:
def create_model(hidden_units): 
    model = tf.keras.Sequential()
    input_layer = tf.keras.layers.Input(shape=(300,))
    hidden_layer_1 = tf.keras.layers.Dense(hidden_units, activation='relu')
    hidden_layer_2 = tf.keras.layers.Dense(hidden_units, activation='relu')
    # hidden_layer_3 = tf.keras.layers.Dense(hidden_units/2, activation='relu')
    output_layer = tf.keras.layers.Dense(3, activation='softmax')
    model.add(input_layer)
    model.add(hidden_layer_1)
    model.add(hidden_layer_2)
    # model.add(hidden_layer_3)
    model.add(output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', 
                  metrics=['binary_accuracy'])
    model.summary()
    return model

model = create_model(HIDDEN_UNITS)

In [33]:
def plot(p1, p2=None, title='Plot', x_label='', y_label='', p1_legend=None, p2_legend=None): 
    plt.figure()
    plt.plot(range(len(p1)), p1, label=p1_legend)
    if p2 is not None: 
        plt.plot(range(len(p2)), p2, label=p2_legend)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.show()   

In [None]:
hist = model.fit(train_dset, validation_data=test_dset, epochs=EPOCHS)

In [None]:
plot(hist.history['loss'], p2=hist.history['val_loss'], 
             title='Training and Validation Loss', x_label='Epoch', y_label='Loss Value', 
            p1_legend='Training loss', p2_legend='Validation loss')

### Model Save

In [None]:
model_filepath = './polarity_model'
model.save(model_filepath)

### Model Evaluation

In [None]:
y_pred = model.predict(test_dset)

### Model Implementation

In [None]:
async def ai_impression(transcript, wv):
    embedding = preprocess_text()

    model_filepath = './static/models/ted_analysis_model'
    model = tf.keras.models.load_model(model_filepath)
    
    pred = model.predict(embedding)[0]
    cols = ['Beautiful', 'Confusing', 'Courageous', 'Funny', 'Informative', 'Ingenious', 'Inspiring', 'Longwinded', 'Unconvincing', 'Fascinating', 'Jaw-dropping', 'Persuasive', 'OK', 'Obnoxious']

    ted_dict = {}
    for val, col in zip(pred, cols):
        ted_dict[col] = val
    
    word_result = max(ted_dict, key=ted_dict.get)

    return word_result

In [None]:
def get_polarity(text, model_filepath): 
    '''
    This function takes in the raw text of an article, preprocesses it, and computes
    a polarity score as a double from the deep learning model. 
    '''
    
    embedding = preprocess_text(text)
    
    model = tf.keras.models.load_model(model_filepath)
    pred = model.predict(embedding)[0]
    
    def compute_polarity(pred): 
        '''
        This function computes a weighted sum of labels to get a quantitative
        polarity score.
        '''
        left = pred[0]
        center = pred[1]
        right = pred[2]
        
        polarity = left*0 + center*(0.5) + right*(1)
        return polarity
    
    polarity = compute_polarity(pred)
    
    return polarity

In [None]:
model = keras.load_model(model_filepath)

# polarity = get_polarity(text, model)

In [None]:
def init_alg():
    wv = api.load('word2vec-google-news-300')