In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
import glob

## Valid Comment Counts

In [2]:
# Load the per-subreddit comment counts, split by the `valid` flag.
# `valid` is forced to bool so it is not parsed as a string/object column.
comment_counts_valid = pd.read_csv(
    '../../data/eda-data/comment_counts_valid_new.csv',
    dtype=dict(valid=bool),
)
comment_counts_valid.head()

Unnamed: 0,subreddit,valid,count
0,AmItheAsshole,True,23966604
1,AmItheAsshole,False,1356606
2,AskMen,True,6128483
3,AskMen,False,604704
4,AskWomen,True,1983040


## Valid Submission Counts

In [3]:
# Load the per-subreddit submission counts, split by the `valid` flag.
# `valid` is forced to bool so it is not parsed as a string/object column.
submission_counts_valid = pd.read_csv(
    '../../data/eda-data/submission_counts_valid_new.csv',
    dtype=dict(valid=bool),
)
submission_counts_valid.head()

Unnamed: 0,subreddit,valid,count
0,AmItheAsshole,True,115659
1,AmItheAsshole,False,405851
2,AskMen,True,18240
3,AskMen,False,230847
4,AskWomen,True,2717


## Total Valid Post Counts

In [5]:
# Aggregate the comment and submission counts per (subreddit, valid) pair.
#
# The two frames are stacked and summed rather than inner-merged: an inner
# merge silently drops any (subreddit, valid) pair that is present in only one
# of the two files, which would remove its count from the totals entirely.
# `sort=False` keeps the pairs in order of first appearance, so the result has
# the same row order as the comment counts frame when both files are complete.
total_counts_valid = (
    pd.concat([comment_counts_valid, submission_counts_valid], ignore_index=True)
    .groupby(['subreddit', 'valid'], as_index=False, sort=False)['count']
    .sum()
)

# Rename the flag values and the columns to create cleaner plot labels.
total_counts_valid['valid'] = total_counts_valid['valid'].map({True: 'Valid',
                                                               False: 'Invalid'})
total_counts_valid = total_counts_valid.rename(columns={'subreddit': 'Subreddit',
                                                        'valid': 'Valid'})

total_counts_valid.head(5)

Unnamed: 0,Subreddit,Valid,count
0,AmItheAsshole,Valid,24082263
1,AmItheAsshole,Invalid,1762457
2,AskMen,Valid,6146723
3,AskMen,Invalid,835551
4,AskWomen,Valid,1985757


In [7]:
# Fixed category order and matching colors for the Valid / Invalid bars.
domain_ = ['Valid', 'Invalid']
range_ = ['#ff4500', '#7a9299'] # orca: 1b2426

# Clicking a bar selects its `Valid` category; bars outside the selection
# fall back to light gray.
selection = alt.selection_single(name = 'Random', fields = ['Valid'])
color = alt.condition(
    selection,
    alt.Color('Valid:N',
              scale = alt.Scale(domain = domain_, range = range_),
              legend = None),
    alt.value('lightgray'),
)

# One small bar chart per subreddit (faceted by column), Valid next to Invalid.
# NOTE(review): the title and y-axis label say "Comments", but
# total_counts_valid is the sum of comment and submission counts -- confirm
# whether the labels should read "Posts"/"Engagements" instead.
bar = (
    alt.Chart(total_counts_valid)
    .mark_bar()
    .encode(
        x = alt.X('Valid:N', sort = domain_, title = ''),
        y = alt.Y('count:Q', title = 'Number of Comments'),
        color = color,
        column = alt.Column(
            'Subreddit:N',
            header = alt.Header(labelAngle = -45, labelOrient = 'bottom'),
        ),
    )
    .add_selection(selection)
    .properties(title = "Number of Valid and Invalid Comments per Subreddit")
)

# Apply the shared presentation settings: no view border, consistent font sizes.
validity_counts = (
    bar
    .configure_view(strokeOpacity = 0)
    .configure_axis(labelFontSize = 12, titleFontSize = 12)
    .configure_title(fontSize = 16)
)

validity_counts

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
# Export the interactive chart as standalone HTML so the website's
# visualizations page can use it.
validity_counts.save(
    '../../website-source/img/eda-plots/validity-counts-new.html'
)

## Comment Counts by Month

In [22]:
# Load the comment counts per subreddit and month (`month_dt`).
comment_month_counts = pd.read_csv(
    '../../data/eda-data/comment_month_counts_new.csv'
)
comment_month_counts.head()

Unnamed: 0,subreddit,month_dt,count
0,AmItheAsshole,4,2231839
1,AmItheAsshole,1,2123706
2,AmItheAsshole,3,2257500
3,AmItheAsshole,11,1943515
4,AmItheAsshole,5,2121000


## Submission Counts by Month

In [21]:
# Load the submission counts per subreddit and month (`month_dt`).
submission_month_counts = pd.read_csv(
    '../../data/eda-data/submission_month_counts_new.csv'
)
submission_month_counts.head()

Unnamed: 0,subreddit,month_dt,count
0,AmItheAsshole,4,43613
1,AmItheAsshole,1,35682
2,AmItheAsshole,3,40763
3,AmItheAsshole,11,38754
4,AmItheAsshole,5,46383


## Total Post Counts by Month

In [23]:
# Aggregate the comment and submission counts per (subreddit, month_dt) pair.
#
# The two frames are stacked and summed rather than inner-merged: an inner
# merge silently drops any (subreddit, month) pair that is present in only one
# of the two files (e.g. a month with comments but no submissions), which
# would remove that month's count from the totals entirely.
# `sort=False` keeps the pairs in order of first appearance, so the result has
# the same row order as the comment counts frame when both files are complete.
total_month_counts = (
    pd.concat([comment_month_counts, submission_month_counts], ignore_index=True)
    .groupby(['subreddit', 'month_dt'], as_index=False, sort=False)['count']
    .sum()
)
total_month_counts.head(5)

Unnamed: 0,subreddit,month_dt,count
0,AmItheAsshole,4,2275452
1,AmItheAsshole,1,2159388
2,AmItheAsshole,3,2298263
3,AmItheAsshole,11,1982269
4,AmItheAsshole,5,2167383


In [25]:
# Create the selection and color scheme for interaction: clicking a mark
# selects its subreddit; marks outside the selection turn light gray.
selection = alt.selection_single(fields = ['subreddit'], name = 'Random')
color = alt.condition(selection,
                      alt.Color('subreddit:N'),
                      alt.value('lightgray'))

# Bar chart: total engagements per subreddit, largest first.
# This is built from total_month_counts rather than total_counts_valid:
# total_counts_valid had its columns renamed to 'Subreddit'/'Valid' for the
# validity plot, so the lowercase 'subreddit' field used by the encoding and by
# the selection does not exist in that frame. total_month_counts has the
# 'subreddit' column and is the same data the line chart below is drawn from,
# so the shared selection works across both views.
bar = (alt.Chart(total_month_counts, width = 700, height = 200)
 .mark_bar()
 .encode(y = 'sum(count):Q',  # sum the monthly rows into one bar per subreddit
         x = alt.X('subreddit:N', axis = alt.Axis(labelAngle = -45),
         sort = alt.EncodingSortField(field = 'count', op = 'sum', order = 'descending')),
         color = color
        )
).add_selection(selection)

bar.title = "Number of Engagements per Subreddit"
bar.encoding.x.title = 'Subreddit Name'
bar.encoding.y.title = 'Number of Engagements'

# create color scheme for selection
color2 = alt.condition(selection,
                      alt.Color('subreddit:N'), # selected color
                      alt.value('lightgray')) # remaining colors when one is selected

# Line chart: engagements per month, one line per subreddit.
line1 = (alt.Chart(total_month_counts, width = 700, height = 200)
 .mark_line()
 .encode(x = alt.X('month_dt:O', axis = alt.Axis(labelAngle = 0)),
         y = 'sum(count):Q',
         color = color2,
         strokeWidth = alt.value(2)
        )
).add_selection(selection)

# Same lines again, but filtered down to the selected subreddit so the
# highlighted line is drawn on top of the grayed-out ones when layered.
line_highlight = (alt.Chart(total_month_counts, width = 700, height = 200)
 .mark_line()
 .encode(x = alt.X('month_dt:O', axis = alt.Axis(labelAngle = 0)),
         y = 'sum(count):Q',
         color = color2,
         strokeWidth = alt.value(2)
        )
).transform_filter(
    selection
)

line1.title = "Number of Engagements per Subreddit per Month"
line1.encoding.x.title = 'Month (2022)'
line1.encoding.y.title = 'Number of Engagements'
line1.encoding.color.condition.title = 'Subreddit Name'
bar & (line1 + line_highlight)

In [26]:
# Stack the bar chart above the layered line charts, then apply the shared
# presentation settings: no view border, consistent font sizes.
counts_over_time = (
    alt.vconcat(bar, alt.layer(line1, line_highlight))
    .configure_view(strokeOpacity = 0)
    .configure_axis(labelFontSize = 12, titleFontSize = 12)
    .configure_title(fontSize = 16)
)

In [27]:
# Export the interactive chart as standalone HTML so the website's
# visualizations page can use it.
counts_over_time.save(
    '../../website-source/img/eda-plots/counts_over_time_new.html'
)