### This notebook contains html which can only be viewed with nbviewer or by downloading

In [2]:
import re
import bs4
import requests
import json
import multiprocessing as mp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import time
import pickle
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Goals
- Aggregate data from each individual forum to get the most \
important actions to take
- Use opinion networks to see if consensus can be reached
- Model the spread of information, possibly by pulling from wowhead as well
- Investigate friendship communities in posts
- Can we see who liked a given post?

In [3]:
try:
    with open('json10k.p','rb') as f:
        jsonL = pickle.load(f)
except OSError as e:
    print("Failed to load file, see scraping file if you want to produce the file yourself.")

In [4]:
# Total number of pages in the forum
len([x for x in jsonL if x['topic_list']['topics'] != []])

537

In [5]:
topics = sum([js['topic_list']['topics'] for js in jsonL],[])

In [6]:
# Number of topics
len(topics)

16085

### Our goal is to summarize the number of titles calling for game balance changes
We'll do this by using a bar chart race, so we need to:
- Sort posts by created date (they appear by last activity date)
- Turn the relevant titles into a standard format

In [7]:
title_and_date = [(topic['created_at'].split('T')[0],topic['like_count'],topic['title'].lower()) for topic in topics]

In [8]:
topicdf = pd.DataFrame(title_and_date,columns = ['date','likes','title'])
topicdf['date'] = pd.to_datetime(topicdf['date'])

In [9]:
topic_df = topicdf.sort_values(by='date').set_index('date')

### Here we demonstrate the problems with using off-the-shelf sentiment analysis

In [10]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
sid = SentimentIntensityAnalyzer()

In [13]:
for title in topic_df['title'][:10]:
    print(title)
    ss = sid.polarity_scores(title)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

what does "piloted" mean?
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
focus macros?
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
final solution.
compound: 0.3182, neg: 0.0, neu: 0.303, pos: 0.697, 
an open letter to blizzard
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
civelle's pvp community thread
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
greater pyroblast
compound: 0.3612, neg: 0.0, neu: 0.286, pos: 0.714, 
the reason why pvp blows !@#$
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
the legendary ferfykins is
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
7.3 changes to 2200 illusions
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
pvp gearing is bad and not fun (help)
compound: -0.7353, neg: 0.508, neu: 0.492, pos: 0.0, 


### Pretrained models don't work well on video game language

- Words like "fire" and "death" automatically have negative connotations in the model \
even when they're just names of classes in the game
- However, the vast majority of forum posts will be either neutral or negative,\
so sentiment analysis might not be the proper way to evaluate


### Here we begin the process of standardizing titles related to class balance
I'm using some keywords associated to each class's arena toolkit, and trying to include common misspellings. There may be a better way to do this.

In [28]:
def transform_to_class(title):
    title = title.replace(',',' ').replace('.',' ').replace('/','')
    transform_dict = {}
    class_words = {
        'hunter': {'hunter','hunters','hunts','hunt','bm','sv','mm','spirit mend'},
         'warlock': {'warlock','warlocks','locks','destros','lock','destro','aff','demo','chaos bolt'},
    'paladin': {'paladin','pallies','palas','paladins','pala','pal','pally','hpal','ret'},
    'warrior': {'war','wars','warriors','arms','warrior','fury'},
    'mage': {'mage','mages','fire','frost','arcane','gpy','greater pyroblast','pyro'},
    'druid': {'druid','druids','bears','rdruids','rdru','rdruid','feral','bal','balance','guardian',
              'boomkin','lively spirit','bear'},
    'priest': {'priest','priests','hpriest','spriest','shadow','disc','discipline','greater heal'},
    'monk': {'monk','monks','mws','wws','mw','ww','cocoon','cacoon','caccoon','coccoon'},
    'dk': {'dk','dks','fdk','unhdk','unholy','blood','death knight','death strike'},
    'shaman': {'shaman','shamans','rsham','ele','enh','sham','shams',',rshams','earth shock'},
    'rogue': {'rogue','rogues','rouges','assa','asa','assas','sub','rouge','kidney','subtlety','shuriken','poison',
              'poisoned'},
        'dh': {'dh','dhs','demon hunter','blade dance','mana burn'}
    }
    nerf_or_buff = {
    'nerf': {'nerf','op','overpowered','busted','broken','ridiculous','fix','too strong','too high','to strong','to high',
            'dumb','delete'},
    'buff': {'buff','underpowered'}
    }
    flipped_class_words = {val:key for key,valset in class_words.items()
                          for val in valset}
    flipped_nerf_buff_words = {val:key for key,valset in nerf_or_buff.items()
                              for val in valset}
    
    nerfbuff = ''.join(set([flipped_nerf_buff_words.get(word,'')
                       for word in title.split()]))
    classwords = ''.join(set([flipped_class_words.get(word,'') 
                for word in title.split()]))
    if not (nerfbuff and classwords):
        return ''
    else:
        return nerfbuff + ' ' + classwords

In [29]:
topic_df['title'] = topic_df['title'].map(transform_to_class)
topic_df['likes'] += 1

In [30]:
topic_df[topic_df['title'] != '']

Unnamed: 0_level_0,likes,title
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-18,1,nerf rogue
2018-10-28,1,nerf shaman
2018-10-28,1,buff mage
2018-10-29,1,nerf roguemage
2018-10-29,2,buff roguemage
...,...,...
2020-09-03,21,buff mage
2020-09-19,32,nerf dh
2020-09-20,2,buff rogue
2020-09-23,54,nerf hunter


### It appear the forums were overhauled in November 2018, so only posts from mid october of that year onward remain.

In [32]:
data_start = pd.to_datetime('2018-10-15')

In [33]:
nerf_categories = ['nerf hunter',
                  'nerf warlock',
                 'nerf mage',
                  'nerf warrior',
                  'nerf monk',
                  'nerf druid',
                  'nerf rogue',
                  'nerf shaman',
                  'nerf paladin',
                  'nerf dk',
                  'nerf priest',
                   'nerf dh'
                  ]

### Aggregate the data to prepare for bar chart race

In [35]:
def find_class_nerfs(std_title,cls):
    pat = re.compile(r'nerf\s.*'+cls)
    return pat.match(std_title)

def aggregate_monthly(start_date):
    end_date = topic_df.index.max()
    ret_df = pd.DataFrame(index=pd.date_range(start_date,end_date,freq='D'),columns=nerf_categories)
    bin_size = pd.Timedelta(30,unit='days')
    one_day = pd.Timedelta(1,unit='days')
    date = start_date
    while date <= end_date:
        temp_df = topic_df.loc[date:date+bin_size,:]
        counts = np.array([temp_df[temp_df['title'] == cls]['likes'].sum() for cls in nerf_categories])
        ret_df.loc[date,:] = counts
        date += one_day
    return ret_df

def aggregate_total(start_date):
    end_date = topic_df.index.max()
    ret_df = pd.DataFrame(index=pd.date_range(start_date,end_date,freq='D'),columns=nerf_categories)
    one_day = pd.Timedelta(1,unit='days')
    date = start_date
    while date <= end_date:
        temp_df = topic_df.loc[start_date:date,:]
        counts = np.array([temp_df[temp_df['title'] == cls]['likes'].sum() for cls in nerf_categories])
        ret_df.loc[date,:] = counts
        date += one_day
    return ret_df

In [37]:
agg_df = aggregate_total(data_start)

In [38]:
agg_df.index.name = 'date'

In [39]:
agg_df.to_csv('aggdf.csv')

In [40]:
#agg_df = pd.read_csv('aggdf.csv')

In [41]:
#agg_df = agg_df.set_index('date')

### Create rows for transition frames. This is taken from 
https://medium.com/dunder-data/create-a-bar-chart-race-animation-in-python-with-matplotlib-477ed1590096

In [42]:
def prepare_data(df, steps=10):
    df = df.reset_index()
    df.index = df.index * steps
    last_idx = df.index[-1] + 1
    df_expanded = df.reindex(range(last_idx))
    df_expanded['date'] = df_expanded['date'].fillna(method='ffill')
    df_expanded = df_expanded.set_index('date')
    df_rank_expanded = df_expanded.rank(axis=1, method='first',na_option='keep')
    df_expanded = df_expanded.interpolate()
    df_rank_expanded = df_rank_expanded.interpolate()
    return df_expanded, df_rank_expanded

In [44]:
df_expanded, df_rank_expanded = prepare_data(agg_df.astype(np.float))
df_expanded.tail()

Unnamed: 0_level_0,nerf hunter,nerf warlock,nerf mage,nerf warrior,nerf monk,nerf druid,nerf rogue,nerf shaman,nerf paladin,nerf dk,nerf priest,nerf dh
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-10-04,481.0,2114.0,633.0,11.0,668.0,446.0,507.0,194.0,798.0,644.0,264.0,1520.0
2020-10-04,481.0,2114.0,633.0,11.0,668.0,446.0,507.0,194.0,798.0,644.0,264.0,1520.0
2020-10-04,481.0,2114.0,633.0,11.0,668.0,446.0,507.0,194.0,798.0,644.0,264.0,1520.0
2020-10-04,481.0,2114.0,633.0,11.0,668.0,446.0,507.0,194.0,798.0,644.0,264.0,1520.0
2020-10-05,481.0,2114.0,633.0,11.0,668.0,446.0,507.0,194.0,798.0,644.0,264.0,1520.0


In [45]:
from matplotlib.animation import FuncAnimation
colors = plt.cm.tab20(range(len(df_expanded.columns)))
labels = [x.split()[1] for x in df_expanded.columns]
def nice_axes(ax):
    ax.set_facecolor('.8')
    ax.tick_params(labelsize=8, length=0)
    ax.grid(True, axis='x', color='white')
    ax.set_axisbelow(True)
    [spine.set_visible(False) for spine in ax.spines.values()]

def init():
    ax.clear()
    nice_axes(ax)
    ax.set_ylim(.2, len(df_expanded.columns)+0.8)
    ax.set_xlabel('Total post likes')

def update(i):
    for bar in ax.containers:
        bar.remove()
    y = df_rank_expanded.iloc[i]
    width = df_expanded.iloc[i]
    ax.barh(y=y, width=width, color=colors, tick_label=labels)
    date_str = df_expanded.index[i]
    ax.set_title(f'Calls for nerfs by class - {date_str}', fontsize='smaller')
    
fig = plt.Figure(figsize=(6, 4), dpi=300)
ax = fig.add_subplot()
anim = FuncAnimation(fig=fig, func=update, init_func=init, frames=len(df_expanded), 
                     interval=15, repeat=False)

In [47]:
from IPython.display import display,HTML

In [48]:
html = anim.to_html5_video()

In [69]:
from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [67]:
smaller_html = html.replace('height="1200"','height="600"').replace('width="1800"','width="900"')

In [68]:
HTML(smaller_html)

In [51]:
with open('classnerfs.html','w+') as f:
    print(html,file=f)

In [52]:
#anim.save('classnerfs.mp4',dpi=300)