In [1]:
# import sys
# !{sys.executable} -m spacy download en
# nltk.download('vader_lexicon')

from functions import Rating
import nltk
import time
import numpy as np
import spacy
from spacy.util import minibatch
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from joblib import Parallel, delayed
import multiprocessing, threading

# pd.set_option('display.max_rows', 200)

[nltk_data] Downloading package vader_lexicon to C:\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Load the extracted quotes into a DataFrame.
df = pd.read_json('extracted_comments.json')
# Expand each 'tag' value through pd.Series and store the result back in the
# column. NOTE(review): presumably every record carries a single-element tag
# list -- confirm against the JSON file.
df['tag'] = df['tag'].apply(pd.Series)
# These two columns are not used anywhere below.
df = df.drop(columns=['_id', 'rating'])

In [3]:
# with open('extracted_comments.json', 'r') as file:
#     f = json.load(file)
#     print(f[0]['quote'])

#### Quote counts per author and per tag, with averages

In [4]:
%%time

# Number of quotes per author, largest first, as a one-column ('count') frame.
author_cnt = (
    df.groupby('author')['quote']
    .count()
    .sort_values(ascending=False)
    .rename('count')
    .to_frame()
)

# Number of quotes per tag, largest first, as a one-column ('count') frame
# whose index is labelled 'tag'.
tag_cnt = (
    df['tag']
    .value_counts()
    .rename('count')
    .sort_values(ascending=False)
    .to_frame()
    .rename_axis('tag')
)

def barplot_n_bars(df1, df2, n_bars):
    """Draw two stacked bar charts of the first ``n_bars`` rows of each frame.

    Each chart plots the frame's index on the x-axis against its ``'count'``
    column, rotates the tick labels by 90 degrees, and overlays a horizontal
    line and a text label at the mean ``'count'`` of the rows that are shown.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with a ``'count'`` column. ``df1`` is drawn on the upper axes,
        ``df2`` on the lower axes. Rows are taken in their existing order.
    n_bars : int
        Maximum number of leading rows to plot from each frame.
    """
    top1 = df1[:n_bars]
    top2 = df2[:n_bars]

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(24, 18))
    sns.barplot(x=top1.index, y='count', data=top1, ax=ax1)
    sns.barplot(x=top2.index, y='count', data=top2, ax=ax2)
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=90)

    # Average the plotted column by name. The previous `frame.mean()[0]` relied
    # on positional integer access into a label-indexed Series (deprecated in
    # pandas 2.x) and would pick up a different column if 'count' were not first.
    mean1 = top1['count'].mean()
    mean2 = top2['count'].mean()

    ax1.axhline(mean1, ls='-', color='black')
    ax2.axhline(mean2, ls='-', color='black')

    # Anchor the label five bars from the right edge of what is actually drawn,
    # so it stays inside the axes when a frame has fewer than n_bars rows.
    # The vertical offsets (1.5 and 15) are fixed amounts in data units.
    ax1.text(max(len(top1) - 5, 0), mean1 + 1.5, f"Average: {mean1}", fontsize=16)
    ax2.text(max(len(top2) - 5, 0), mean2 + 15, f"Average: {mean2}", fontsize=16)

    fig.tight_layout(pad=3.0)
    
# Show the first 50 rows of the author counts (top) and tag counts (bottom).
barplot_n_bars(df1=author_cnt, df2=tag_cnt, n_bars=50)

count    4.547224
dtype: float64

### <b>Rating performance test</b>

In [7]:
def sentiment_entity(group, cls):
    """Rate every text in ``group`` with the given rater object.

    Parameters
    ----------
    group : iterable
        Texts to rate; each item is passed unchanged to both rating methods.
    cls : object
        Object exposing ``sentiment_rating(text)`` and ``entity_rating(text)``.

    Returns
    -------
    list of tuple
        One ``(sentiment_rating, entity_rating)`` pair per text, in input order.
    """
    return [
        (cls.sentiment_rating(doc), cls.entity_rating(doc))
        for doc in group
    ]

def compute_ratings(n_items, n_jobs, n_batch_size):
    """Rate the first ``n_items`` quotes of the notebook-level ``df`` in parallel.

    The quotes are passed through ``Rating.get_sentences``, split into batches
    of ``n_batch_size`` with spaCy's ``minibatch``, and each batch is handed to
    ``sentiment_entity`` as one joblib task.

    Parameters
    ----------
    n_items : int
        Number of leading rows of ``df['quote']`` to process.
    n_jobs : int
        Passed to ``joblib.Parallel`` as ``n_jobs``.
    n_batch_size : int
        Number of items per task.

    Returns
    -------
    list
        One entry per batch; each entry is the list of
        ``(sentiment, entity)`` tuples returned by ``sentiment_entity``.
    """
    rater = Rating()
    sentences = rater.get_sentences(df['quote'][:n_items])
    # Bind the rater once so every task only needs its chunk of text.
    rate_chunk = delayed(partial(sentiment_entity, cls=rater))
    chunks = minibatch(sentences, size=n_batch_size)
    ratings = Parallel(n_jobs=n_jobs)(rate_chunk(chunk) for chunk in chunks)
    del rater
    return ratings

In [8]:
def calculate_run_time(n_items, n_jobs, n_batches, set_batch_size=True):
    """Time ``compute_ratings`` over a grid of parameter combinations.

    Parameters
    ----------
    n_items : int or sequence of int
        Document counts to test.
    n_jobs : int or sequence of int
        Parallel job counts to test.
    n_batches : int or sequence of int
        If ``set_batch_size`` is true, the explicit batch size(s) to test.
        Otherwise the *number* of batch sizes to generate, evenly spaced
        between 1 and half of the largest document count (rounded up).
    set_batch_size : bool, default True
        Selects how ``n_batches`` is interpreted (see above).

    Returns
    -------
    list of list
        One ``[item, job, bsize, elapsed_seconds]`` row per combination. Job
        count varies fastest, then document count, then batch size.
    """
    rng_items = np.array(n_items)
    rng_jobs = np.array(n_jobs)
    if set_batch_size:
        rng_batch_size = np.array(n_batches)
    else:
        rng_batch_size = np.ceil(np.linspace(1, rng_items.max() / 2, n_batches))
    params = np.array(np.meshgrid(rng_items, rng_jobs, rng_batch_size)).T.reshape(-1, 3)

    runtime_results = []
    for item, job, bsize in params:
        # The grid may be float-typed (np.ceil returns floats); record exactly
        # the integer values that the run is performed with.
        item, job, bsize = int(item), int(job), int(bsize)
        # perf_counter is monotonic and high-resolution; time.time() follows the
        # wall clock and can jump (NTP sync, DST), corrupting long measurements.
        start = time.perf_counter()
        compute_ratings(item, job, bsize)
        elapsed = time.perf_counter() - start
        runtime_results.append([item, job, bsize, elapsed])
    return runtime_results

def plot_run_times(results, run, figsize=(25,10)):
    """Plot timing results as one line chart per job count and save it as a JPG.

    Parameters
    ----------
    results : list of list
        Rows of ``[document count, job count, batch size, elapsed seconds]``,
        as returned by ``calculate_run_time``.
    run : int or str
        Identifier inserted into the output filename.
    figsize : tuple of (float, float), default (25, 10)
        Width and height of the whole figure in inches.

    The figure is written to ``performance_<run>_<n>.jpg`` in the working
    directory, where ``<n>`` is the document count of the last row of
    ``results``.
    """
    columns = ['Document Count', '# CPU Processes', 'Batch Size', 'Processing Time (s)']
    # Local name chosen so the notebook-level `df` is not shadowed.
    timings = pd.DataFrame(results, columns=columns)

    g = sns.FacetGrid(timings, col='# CPU Processes', hue='Batch Size')
    g.map_dataframe(sns.lineplot, x='Document Count', y='Processing Time (s)')
    g.set_axis_labels('Number documents processed', 'Total elapsed processing time')
    g.add_legend()
    # Honour the caller's figsize; it was previously ignored in favour of a
    # hard-coded (25, 10).
    g.fig.set_size_inches(*figsize)

    # Take the filename suffix from the data instead of the notebook global
    # `n_items`, which is undefined on a fresh kernel. With the row order that
    # calculate_run_time produces, the last row holds the last document count
    # of the sweep, i.e. the value n_items[-1] supplied before.
    last_doc_count = int(timings['Document Count'].iloc[-1])
    g.savefig('performance_{}_{}.jpg'.format(run, last_doc_count))

#### General performance testing

In [9]:
# %%time

# n_items = [1, 50, 100, 500, 1000]
# n_jobs = [4, 6, 8, 10]
# n_batches = 5

# results = calculate_run_time(n_items, n_jobs, n_batches)
# plot_run_times(results, 1)

# print('Done')

In [10]:
# performance = pd.DataFrame(results, columns=['Document Count', '# CPU Processes', 'Batch Size', 'Processing Time (s)'])
# performance.to_csv('performance_{}.csv'.format(str(n_items[len(n_items)-1])), index=False)

#### Performance testing with 8 processes

In [11]:
# %%time

# n_items = [1000, 5000, 10000, 20000]
# n_jobs = 8
# n_batches = [125, 250]

# results2 = calculate_run_time(n_items, n_jobs, n_batches, True)
# plot_run_times(results2, 2)

# print('Done')

In [12]:
# performance2 = pd.DataFrame(results2, columns=['Document Count', '# CPU Processes', 'Batch Size', 'Processing Time (s)'])
# performance2.to_csv('performance2_{}.csv'.format(str(n_items[len(n_items)-1])), index=False)

### <b>Run full results</b>

In [21]:
%%time

# Rate every quote in the dataset using 8 parallel jobs and batches of 250.
# The recorded run of this cell took roughly 52 minutes of wall time.
full_results = compute_ratings(n_items=len(df), n_jobs=8, n_batch_size=250)

Wall time: 52min 10s


In [65]:
# compute_ratings returns one list of (sentiment, entity) tuples per batch;
# flatten those batches into a single list of tuples, preserving order.
full_results_unravel = [
    rating_pair
    for batch_ratings in full_results
    for rating_pair in batch_ratings
]
df_results = pd.DataFrame(
    data=full_results_unravel,
    columns=['sentiment', 'entity'],
)

In [75]:
# Attach the rating columns to the quotes side by side; rows are aligned on
# the index. NOTE(review): this assumes df_results has exactly one row per
# row of df, in the same order -- confirm that Rating.get_sentences does not
# change the number of items.
merged = pd.concat(objs=[df, df_results], axis='columns')
# Persist the combined frame as a gzip-compressed pickle.
merged.to_pickle(path='calculated_ratings.pkl', compression='gzip')