In [180]:
# Absolute, user-specific directory under which the `data/` tree used by the
# `cd` cell below is located.
base = '/u/ebanner/Classes/nlp/Project/irony-classifier'

In [152]:
# NOTE(review): when the cells run top to bottom this assignment replaces the
# `base` value set in the previous cell -- keep only the path that applies to
# the machine the notebook is executed on.
base = '/u/npockrus/NLP/finalProject/venv/src/irony-classifier'

# Load Subreddits and Labels

In [181]:
cd /{base}/data/conservative/features/text+sentiment+subreddit+label+progressiviness/

/v/filer4b/v20q001/ebanner/Classes/nlp/Project/irony-classifier/data/conservative/features/text+sentiment+subreddit+label+progressiviness


In [182]:
import pickle
import scipy
import numpy as np

# Load the pickled feature dictionary (keyed by sentence) from the current
# working directory. Pickle data is binary, so the file is opened in 'rb' mode;
# text mode only happens to work on Python 2 / POSIX.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('features.p', 'rb') as f:
    data = pickle.load(f)

# Fix one sentence ordering so that `subreddits` and `ys` stay index-aligned.
sentences = sorted(data)

subreddits = np.array([data[sentence]['subreddits'] for sentence in sentences])
ys = np.array([data[sentence]['label'] for sentence in sentences])

# Frequency Counts

In [183]:
from collections import defaultdict

# Tally how often each subreddit occurs across every per-sentence list.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because the cells below read it.
all = defaultdict(int)
for names in subreddits:
    for name in names:
        all[name] = all[name] + 1

# Get Breakdown for Every Subreddit

In [184]:
from collections import defaultdict

# Per-subreddit occurrence count restricted to sentences whose label is 1.
ironics = defaultdict(int)
for names, label in zip(subreddits, ys):
    if label != 1:
        continue  # only label-1 sentences contribute to this tally
    for name in names:
        ironics[name] += 1

In [185]:
# Whatever is not counted in `ironics` is counted as genuine. Looking up a
# subreddit that is missing from the `ironics` defaultdict yields 0 (and
# inserts that key), exactly as in a plain loop.
genuines = dict(
    (name, all[name] - ironics[name])
    for name in all
)

In [186]:
# Every subreddit's genuine and ironic counts must add up to its total count.
for name, total in all.items():
    assert genuines[name] + ironics[name] == total

# Generate Parallel List for All, Ironics, and Genuines

In [187]:
import operator  # itemgetter is used below; no earlier cell shown imports it

# Rank subreddits by total count, most frequent first. `sorted` is stable, so
# subreddits with equal counts keep the order in which `all.items()` yields them.
ranked = sorted(all.items(), key=operator.itemgetter(1), reverse=True)

# Parallel lists: position i in each list describes the same subreddit.
subreddit_list = [name for name, total in ranked]
all_list = [total for name, total in ranked]
ironic_list = [ironics[name] for name in subreddit_list]
genuine_list = [genuines[name] for name in subreddit_list]

# Sanity Check
for subreddit, irony, genuine, a in zip(subreddit_list, ironic_list, genuine_list, all_list):
    assert irony + genuine == a
    assert ironics[subreddit] + genuines[subreddit] == all[subreddit]

# Normalize

In [188]:
# Add-one smoothed ratios: (count + 1) / (denominator + 1).
# NOTE(review): `num_ironics` and `num_genuines` are not assigned in any cell
# visible here -- confirm where they are defined before relying on a fresh
# Restart & Run All.
ironic_denominator = float(num_ironics) + 1
genuine_denominator = float(num_genuines) + 1

normalized_ironics = [(count + 1) / ironic_denominator for count in ironic_list]
normalized_genuines = [(count + 1) / genuine_denominator for count in genuine_list]

In [240]:
# Display each subreddit next to its smoothed ironic and genuine ratios
# (same ordering as `subreddit_list`).
zip(subreddit_list, normalized_ironics, normalized_genuines)

[(u'politics', 1.0, 1.0),
 (u'AskReddit', 0.7724137931034483, 0.8142034548944338),
 (u'funny', 0.8344827586206897, 0.7976967370441459),
 (u'worldnews', 0.8, 0.7946257197696737),
 (u'pics', 0.7655172413793103, 0.74510556621881),
 (u'videos', 0.6482758620689655, 0.6261036468330134),
 (u'WTF', 0.6827586206896552, 0.6030710172744722),
 (u'todayilearned', 0.5793103448275863, 0.5911708253358925),
 (u'AdviceAnimals', 0.6275862068965518, 0.5285988483685221),
 (u'gaming', 0.4827586206896552, 0.48023032629558543),
 (u'news', 0.496551724137931, 0.46449136276391556),
 (u'technology', 0.4827586206896552, 0.45911708253358924),
 (u'IAmA', 0.3103448275862069, 0.43723608445297507),
 (u'atheism', 0.42758620689655175, 0.42610364683301344),
 (u'science', 0.2689655172413793, 0.3666026871401152),
 (u'movies', 0.35172413793103446, 0.33205374280230326),
 (u'aww', 0.32413793103448274, 0.28598848368522073),
 (u'conspiracy', 0.2, 0.15930902111324377),
 (u'Frugal', 0.1793103448275862, 0.1397312859884837),
 (u'Jus

# Compute Similarity + Smoothing

In [227]:
# "Divergence" of a subreddit = normalized ironic ratio / normalized genuine
# ratio; a value of 1 means both ratios are equal.
divergences = []
for ironic_ratio, genuine_ratio in zip(normalized_ironics, normalized_genuines):
    divergences.append(float(ironic_ratio) / genuine_ratio)

# Most Divergent Subreddits (Low)

In [250]:
# Number of subreddits kept by the selection and bar-chart cells that follow
N = 10

In [251]:
# Sort the (divergence, subreddit) pairs once, ascending, and keep the N
# smallest. Pairs with equal divergence are ordered by subreddit name because
# tuples compare element by element.
lowest_pairs = sorted(zip(divergences, subreddit_list))[:N]

top_subreddits = [subreddit for divergence, subreddit in lowest_pairs]
top_divergences = [divergence for divergence, subreddit in lowest_pairs]

In [254]:
import matplotlib.pyplot as plt

# Bar chart of the N subreddits selected into `top_subreddits`, one bar per
# subreddit, each annotated with that subreddit's total count from `all`.
fig, axes = plt.subplots()

indicies = np.arange(N)  # Tick positions, one per bar
width = .8  # Width of the bars

shown_subreddits = top_subreddits[:N]
shown_divergences = top_divergences[:N]
rectangles = axes.bar(range(len(shown_subreddits)), shown_divergences, width, align='center')

axes.set_xticks(indicies)
axes.set_xticklabels(shown_subreddits)

axes.set_xlabel('Most Popular Subreddits')
axes.set_ylabel('Divergence')
axes.grid(True)

def autolabel(rects):
    """Write each plotted subreddit's total count just above its bar."""
    counts = [str(all[name]) for name in top_subreddits[:N]]

    for rect, count in zip(rects, counts):
        x_center = rect.get_x() + rect.get_width()/2.
        plt.text(x_center, 1.02*rect.get_height(), count, ha='center', va='bottom')

autolabel(rectangles)

plt.show()

# Most Divergent Subreddits (High)

In [247]:
# Number of subreddits kept by the selection and bar-chart cells that follow
N = 10

In [248]:
# Sort the (divergence, subreddit) pairs once, descending, and keep the N
# largest. Pairs with equal divergence are ordered by subreddit name (reversed)
# because tuples compare element by element.
highest_pairs = sorted(zip(divergences, subreddit_list), reverse=True)[:N]

bottom_subreddits = [subreddit for divergence, subreddit in highest_pairs]
bottom_divergences = [divergence for divergence, subreddit in highest_pairs]

In [253]:
import matplotlib.pyplot as plt

# Bar chart of the N subreddits selected into `bottom_subreddits`, one bar per
# subreddit, each annotated with that subreddit's total count from `all`.
fig = plt.figure()
axes = fig.add_subplot(111)

indicies = np.arange(N)  # Offsets for the bars
width = .8  # Width of the bars

rectangles = axes.bar(range(len(bottom_subreddits[:N])), bottom_divergences[:N], width, align='center')

axes.set_xticks(indicies)
# Tick labels must name the subreddits actually drawn in this chart
# (`bottom_subreddits`), not the ones from the low-divergence chart.
axes.set_xticklabels(bottom_subreddits[:N])

# NOTE(review): the x label says 'Most Popular' although the bars are chosen by
# divergence -- confirm the intended wording.
plt.xlabel('Most Popular Subreddits')
plt.ylabel('Divergence')
plt.grid(True)

def autolabel(rects):
    """Write each plotted subreddit's total count just above its bar."""
    strings = [ str(all[subreddit]) for subreddit in bottom_subreddits[:N] ]

    for rect, string in zip(rects, strings):
        height = rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2., 1.02*height, string, ha='center', va='bottom')

autolabel(rectangles)

plt.show()

# Divergences for All Subreddits -- Most Common First

In [256]:
# One bar per subreddit counted in `all` for the chart below
N = len(all)

In [257]:
import matplotlib.pyplot as plt

# One bar per subreddit, in the order of `divergences`.
fig, axes = plt.subplots()

indicies = np.arange(N)  # Offsets for the bars

# Shift by 1 so that a ratio of exactly 1 (equal normalized ratios) sits on
# the zero line.
offsets_from_one = [ratio - 1 for ratio in divergences]
rectangles = axes.bar(range(N), offsets_from_one, align='center')

axes.set_xlabel('Most Popular Subreddits')
axes.set_ylabel('Divergence')
axes.grid(True)

plt.show()

# Horizontal Bar Chart

In [15]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

# Stand-alone horizontal bar chart demo on random data. Explicit `np.` / `plt.`
# calls replace `from pylab import *`, which pulls a large set of names into
# the notebook namespace and can silently replace existing variables.
# NOTE(review): the data is random and unseeded, so the figure differs per run.
val = 3 - 6*np.random.rand(5)    # the bar lengths        # changed your data slightly
pos = np.arange(5) + .5    # the bar centers on the y axis
print(pos)  # function-call form works on both Python 2 and Python 3
plt.figure(1)
plt.barh(pos, val, align='center', height=0.1)    # notice the 'height' argument
plt.yticks(pos, ('Tom', 'Dick', 'Harry', 'Slim', 'Jim'))

plt.gca().axvline(0, color='k', lw=3)   # poor man's zero level

plt.xlabel('Performance')
plt.title('horizontal bar chart using matplotlib')
plt.grid(True)
plt.show()

[ 0.5  1.5  2.5  3.5  4.5]
