In [None]:
import json
import pprint

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation

In [None]:
# Seed NumPy's global random number generator so that NumPy-based randomness
# is repeatable from run to run.
# NOTE(review): only NumPy is seeded here. The Keras model trained later
# presumably draws its initial weights and dropout masks from the backend's
# own RNG — confirm, and seed that as well if identical training runs are needed.
np.random.seed(42)

def load_corpus(corpus_path):
    """Read the JSON file at ``corpus_path`` and return the parsed object.

    The file is opened as UTF-8 text. No preprocessing happens here; the
    decoded JSON value is handed back unchanged.
    """
    with open(corpus_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
def convert_annotations(corpus):
    """Normalise every sentence annotation in ``corpus`` in place.

    Each item's ``'transcript'`` is walked sentence by sentence and the
    sentence's ``'annotation'`` value is replaced by its canonical label.
    Any annotation that is not a key of the lookup table becomes ``'none'``.
    Nothing is returned; the corpus is mutated.
    """
    canonical_label = {
        'none': 'none',
        'applause': 'applause',
        'laughter': 'laughter',
        'laughing': 'laughter',
        'laughs': 'laughter',
        'laughter applause': 'laughter applause',
        'laughter) (applause': 'laughter applause',
        'audience gasps': 'gasp',
        'audio': 'audio',
        'gasping': 'gasp',
        'mock sob': 'gasp',
    }
    for talk in corpus:
        for line in talk['transcript']:
            # Unrecognised annotations fall back to 'none'.
            line['annotation'] = canonical_label.get(line['annotation'], 'none')

Corpus size:  59
{   'FKRE_rating': 'Fairly easy',
    'FKRE_score': 70.0,
    'Length': '00:15:16',
    'NAWL': 12,
    'NGSL': 458,
    'URL': 'https://www.ted.com/talks/aaron_huey',
    'WPM': 146.0,
    'like_count': 59000,
    'raw_transcript': "I\\'m here today to show my photographs of the Lakota. "
                      'Many of you may have heard of the Lakota, or at least '
                      'the larger group of tribes, called the Sioux. The '
                      'Lakota are one of many tribes that were moved off their '
                      'land to prisoner-of-war camps, now called reservations. '
                      "The Pine Ridge Reservation, the subject of today\\'s "
                      'slide show, is located about 75 miles southeast of the '
                      'Black Hills in South Dakota. It is sometimes referred '
                      'to as Prisoner of War Camp Number 334, and it is where '
                      'the Lakota now live. Now, if any of 

In [None]:
def extract_features_labels(corpus):
    """Build the scaled feature tensor and target tensor for ``corpus``.

    For every item the feature row is
    ``[FKRE_score, NAWL, NGSL, WPM, score(item)]`` and the target is
    ``like_count / view_count``. Features and targets are each min-max
    scaled to the range (0, 1), with the scaling fitted on the whole corpus
    passed in, and then given a trailing axis of length 1.

    Returns:
        (X, y): ``X`` has shape ``(n_items, 5, 1)`` and ``y`` has shape
        ``(n_items, 1, 1)``. Both shapes are also printed.
    """
    feature_rows, like_ratios = [], []
    for talk in corpus:
        feature_rows.append([
            talk["FKRE_score"],
            talk["NAWL"],
            talk["NGSL"],
            talk["WPM"],
            score(talk),
        ])
        like_ratios.append(talk["like_count"] / talk["view_count"])

    # Features are scaled column-wise; the targets are scaled as one column.
    X = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(feature_rows))
    y = MinMaxScaler(feature_range=(0, 1)).fit_transform(
        np.array(like_ratios).reshape(-1, 1)
    )

    # Append a unit axis: (n, 5) -> (n, 5, 1) and (n, 1) -> (n, 1, 1).
    X = X[:, :, np.newaxis]
    y = y[:, :, np.newaxis]

    print("X shape:", X.shape)
    print("y shape:", y.shape)

    return X, y

Train corpus size:  47
Test corpus size:  12


In [None]:
def score(item, reaction_weights=None):
    """Return the weighted count of audience reactions in one corpus item.

    Every sentence in ``item['transcript']`` carries an ``'annotation'``
    string made of one or more whitespace-separated reaction tokens
    (for example ``'laughter applause'``). Each token adds its weight to
    the total.

    Args:
        item: Mapping with a ``'transcript'`` list of mappings that each
            have an ``'annotation'`` string.
        reaction_weights: Optional mapping from reaction token to numeric
            weight. Defaults to applause=3, laughter=2, gasp=1, none=0,
            audio=0.

    Returns:
        The sum of the weights of all reaction tokens in the transcript.
        Tokens missing from the weight table contribute 0 instead of
        raising ``KeyError``.
    """
    if reaction_weights is None:
        reaction_weights = {'applause': 3, 'laughter': 2, 'none': 0, 'gasp': 1, 'audio': 0}

    total_score = 0
    for sentence in item['transcript']:
        # split() without an argument ignores repeated or surrounding
        # whitespace, so no empty-string tokens are produced.
        for annot in sentence['annotation'].split():
            total_score += reaction_weights.get(annot, 0)

    return total_score

X shape:  (59, 5, 1)
y shape:  (59, 1, 1)
[[0.90178075]
 [0.58097821]]
[0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0
 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 1 1]
Good examples:  (41, 5, 1) (41, 1, 1)
Bad examples:  (18, 5, 1) (18, 1, 1)
Train corpus size:  46
Test corpus size:  13




In [None]:
def train_model(X_train, y_train, epochs=30, batch_size=32, units=50, dropout_rate=0.2):
    """Build, compile and fit a four-layer stacked LSTM regressor.

    The network is four LSTM layers of ``units`` cells, each followed by
    dropout, and a single-unit dense output. It is compiled with the Adam
    optimizer and mean-squared-error loss, then fitted on the given data.

    Args:
        X_train: Array of shape ``(n_samples, timesteps, n_features)``.
        y_train: Training targets passed straight to ``fit``.
        epochs: Number of training epochs (default 30).
        batch_size: Mini-batch size (default 32).
        units: Number of cells in every LSTM layer (default 50).
        dropout_rate: Dropout fraction after every LSTM layer (default 0.2).

    Returns:
        The fitted ``Sequential`` model.

    Raises:
        ValueError: If ``X_train`` is not three-dimensional.
    """
    if len(X_train.shape) != 3:
        raise ValueError(
            "X_train must have shape (n_samples, timesteps, n_features); "
            "got shape %r" % (tuple(X_train.shape),)
        )

    # Take both sequence length and feature width from the data instead of
    # assuming one feature per timestep.
    timesteps, n_features = X_train.shape[1], X_train.shape[2]

    regressor = Sequential()
    regressor.add(LSTM(units=units, return_sequences=True, input_shape=(timesteps, n_features)))
    regressor.add(Dropout(dropout_rate))
    # Two intermediate layers keep emitting full sequences for the next LSTM.
    for _ in range(2):
        regressor.add(LSTM(units=units, return_sequences=True))
        regressor.add(Dropout(dropout_rate))
    # The last LSTM returns only its final state, which feeds the dense head.
    regressor.add(LSTM(units=units))
    regressor.add(Dropout(dropout_rate))
    regressor.add(Dense(units=1))

    regressor.compile(optimizer='adam', loss='mean_squared_error')
    regressor.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    return regressor

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x3f9f5a910>

In [None]:
def predict_and_evaluate(regressor, X_test, y_test):
    """Predict on ``X_test``, print each prediction/target pair and the MSE.

    Predictions and targets are flattened to one dimension before being
    compared. ``reshape(-1)`` is used rather than ``np.squeeze`` so that a
    test set holding a single sample still yields a length-1 array instead
    of a 0-d array that cannot be iterated.

    Args:
        regressor: Object exposing ``predict(X_test)``.
        X_test: Inputs handed to ``regressor.predict``.
        y_test: Targets holding one value per test sample, in any shape
            that flattens to ``(n_samples,)``.

    Returns:
        The mean squared error between targets and predictions.

    Raises:
        ValueError: If the flattened predictions and targets differ in length.
    """
    y_pred_normal = np.asarray(regressor.predict(X_test)).reshape(-1)
    y_test_normal = np.asarray(y_test).reshape(-1)

    if y_pred_normal.shape != y_test_normal.shape:
        raise ValueError(
            "Prediction/target length mismatch: %d predictions vs %d targets"
            % (y_pred_normal.shape[0], y_test_normal.shape[0])
        )

    for y_pred_val, y_true_val in zip(y_pred_normal, y_test_normal):
        print("Predicted:", y_pred_val, "Actual:", y_true_val)

    mse = mean_squared_error(y_test_normal, y_pred_normal)
    print("MSE:", mse)

    return mse

# Load and preprocess data
corpus_path = "data.json"
corpus = load_corpus(corpus_path)
convert_annotations(corpus)
X, y = extract_features_labels(corpus)

# Perform KMeans clustering with 2 clusters on the scaled target values.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering the same across versions.
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10)
kmeans.fit(np.reshape(y, (y.shape[0], y.shape[1])))

# Get the good and bad examples.
# KMeans numbers its clusters arbitrarily, so label 0 is not guaranteed to be
# the "good" one. Pick the cluster whose centre has the larger target value.
# NOTE(review): this assumes "good" means a higher like/view ratio — confirm.
good_label = int(np.argmax(kmeans.cluster_centers_[:, 0]))
is_good = kmeans.labels_ == good_label

good_examples_X = X[is_good]
bad_examples_X = X[~is_good]
good_examples_y = y[is_good]
bad_examples_y = y[~is_good]

print("Good examples:", good_examples_X.shape, good_examples_y.shape)
print("Bad examples:", bad_examples_X.shape, bad_examples_y.shape)

# Split each cluster separately so both appear in train and test sets
X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(good_examples_X, good_examples_y, test_size=0.2, shuffle=False)
X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(bad_examples_X, bad_examples_y, test_size=0.2, shuffle=False)

# Concatenate the good and bad examples
X_train = np.concatenate((X_train_good, X_train_bad))
X_test = np.concatenate((X_test_good, X_test_bad))
y_train = np.concatenate((y_train_good, y_train_bad))
y_test = np.concatenate((y_test_good, y_test_bad))

# Every example must land in exactly one of the two sets.
assert len(X_train) + len(X_test) == len(X)

print("Train corpus size:", len(X_train))
print("Test corpus size:", len(X_test))

# Reshape the target data from (n, 1, 1) to (n, 1) for training
y_train = np.squeeze(y_train, axis=2)

# Build and train the LSTM model
regressor = train_model(X_train, y_train)

# Predict and evaluate the model
predict_and_evaluate(regressor, X_test, y_test)

Predicted:  0.8053619 Actual:  0.9658741514367364
Predicted:  0.8053388 Actual:  0.7886952575053634
Predicted:  0.8309978 Actual:  0.7682944431620982
Predicted:  0.767225 Actual:  0.8298864282842473
Predicted:  0.83407086 Actual:  0.8612149216217624
Predicted:  0.8452372 Actual:  0.9797058780043031
Predicted:  0.7858926 Actual:  0.8718011616375634
Predicted:  0.6905354 Actual:  0.8475568529405422
Predicted:  0.83548427 Actual:  0.8589936067097064
Predicted:  0.78655136 Actual:  0.6270822193110455
Predicted:  0.8672672 Actual:  0.18890255913671794
Predicted:  0.77725005 Actual:  0.6888604845972139
Predicted:  0.73731863 Actual:  0.6115924630986136
MSE:  0.045733495155511804
