This is an exploration of headlines from rAskReddit.

In [1]:
from IPython.display import Markdown, display
import sympy
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
import random
from os import listdir
from typing import *

class Headline:
    touch_ups = dict( {
        "what?s" : "what's",
        "who?s" : "who's",
        "you?ve" : "you've",
        "don?t" : "don't",
        "redditor" : "reddit",
        "reddit!" : "reddit",
        "serie" : "series"
        })
    
    def __init__(self,raw_line,sanitize_fn: Callable[[str],str]) -> None:
        self.words = [sanitize_fn(word) for word in raw_line.split(" ")]
        
    def __len__(self):
        return len(self.words)
    def __str__(self):
        return str(self.words)
  
    @staticmethod
    def sanitize(s: str) -> str:
        junk = "!?:,\"\'()\n.0123456789*"
        s = s.lower().rstrip(junk).lstrip(junk)
        #s = s.rstrip(junk)
        ##s = s.lstrip(junk)
    
        
        for k,v in Headline.touch_ups.items():
            if s == k:
                s = v
                
        return s
    
#helper functions
def as_pct(data: float) -> str:
    return f"{data*100}%"
def printmd(string: str) -> str:
    display(Markdown(string))
    
def ffile_type(path: str,ext:str ) -> str:
    for entry in listdir(path):
        if entry[-3:] == ext:
            yield entry
def read_chunk(path: str) -> List[str]:
    for entry in ffile_type(path,"csv"):
        with open(entry,'r') as file:
            data = file.readlines()
        yield data
def sanitize(s: str) -> str:
    junk = "!?:,\"\'()\n.0123456789*"
    s = s.lower().rstrip(junk).lstrip(junk)
    for k,v in Headline.touch_ups.items():
        if s == k:
            s = v
                
    return s
    


# Read Data

In [6]:
headlines: List[str]
with open("./all.csv",'r') as file:
    headlines = [Headline(line,sanitize) for line in file]

print(f"Read {len(headlines)} headlines.")
print(f"{len(set(headlines))} unique headlines.")

Read 410813 headlines.
410813 unique headlines.


n_total = len(sentences)
n_unique = len(unique_sentences)
pct_unique = as_pct(n_unique / n_total)

n_samples = len(listdir("./askreddit/"))
printmd(f"**{n_samples}** samples")
printmd(f"You have collected **{len(sentences)}** headlines, **{len(unique_sentences)}** (**{pct_unique}**) of which are unique")

# Calculating the Relative Frequency of a Word

In [None]:
all_words = [h_line.words for h_line in headlines]
word = 'you'
words = pd.Series(all_words)
rel_freqs = words.value_counts(normalize=True)
#rel_freqs[word] 
printmd(f"The word **{word}** occurs about **{as_pct(rel_freqs[word])}** of the time.")
printmd(f"The most common word in an r/AskReddit headline is **{rel_freqs.idxmax()}**, which occurs about **{as_pct(rel_freqs.max())}** of the time.")
printmd(f"**Top ten most frequent words**")
print(f"{rel_freqs[39:49]*100}")

## Average Length of Headlines (Population)

In [None]:
#TODO: FIX
unique_sentence_lens = [len(s) for s in unique_sentences]
mean = round(np.mean(unique_sentence_lens))
median = round(np.median(unique_sentence_lens))
printmd(f"The average length of the headline on r/AskReddit is **{mean}** words" )
printmd(f"The median length of the headline on r/AskReddit is **{median}** words" )


## Standardize the Data

In [None]:

'''
sentence_lens = sorted(sentence_lens)
mu = np.mean(sentence_lens)
std = np.std(sentence_lens)
sentence_lens = [ (s - mu) / std for s in sentence_lens]
'''


## Distribution of Length of Headlines (Population)

In [None]:
unique_sentence_lens = pd.Series(unique_sentence_lens)
unique_sentence_lens.hist()
plt.title(f"Length of Headlines (words) on r/AskReddit (N={len(unique_sentences)})")
plt.show()
printmd(f"$\sigma$ = {unique_sentence_lens.std()}")
printmd(f"$\mu$ = {unique_sentence_lens.mean()}")


# Generating the Sample Distribution of the Sample Mean

In [None]:
sentence_dict = [(item.words,len(item)) for item in sentences]

In [None]:
sentence_dict[0]

In [None]:
sample_size = 30
n_trials = int(1e3)
sample_means = []
over_time = []
error = []



sentence_dict = {}
for i in range(0,n_trials):
    sample_keys = random.sample(list(sentence_dict),sample_size)
    sample = [sentence_dict[key] for key in sample_keys]
    sample_sentence_lens = [len(s) for s in sample]
    sample_sentence_len_mean = np.mean(sample_sentence_lens)
    sample_means.append(sample_sentence_len_mean)
    over_time.append(np.mean(sample_means))
    error.append( abs(np.mean(sample_means) - unique_sentence_lens.mean()))
plt.title(f"Mean of Sampling Distribution of the Sample Mean vs. Number of Trials(Normalized; n={sample_size}; N={n_trials})")
sample_means = pd.Series(sample_means)
plt.xlabel("Number of Trials")
plt.ylabel("Sample Mean (Cumulative)")
plt.plot(over_time)
printmd(f"Sample mean converges to {over_time[-1]}")


In [None]:
plt.title(f"Sampling Distribution of the Sample Mean(Normalized; n={sample_size}; N={n_trials})")
sample_means.hist()
printmd("$\mu_{\sigma}$ = " + str(sample_means.std()))
printmd("$\mu_{\overline{x}}$ = " + str(sample_means.mean()))

In [None]:
plt.title("Abs. Error vs. N Trials")
plt.xlabel("N Trials")
plt.ylabel("Abs Error")
plt.plot(error)
printmd(f"Abs. Error converges to {as_pct(error[-1])}")

In [None]:
'''

mu = np.mean(sample_means)
sigma = np.std(sample_means)
plt.title(f"Sample Distribution of Sample Mean (Normalized; n={k}; N={N})")
sample_means = pd.Series([(s - mu) / sigma for s in sample_means])
sample_means.hist()
'''

In [None]:
import wordcloud
from wordcloud import WordCloud

# Read the whole text.
text = " ".join({w for w in words})


# Generate a word cloud image
wordcloud = WordCloud(background_color="white",max_font_size=40,scale=3).generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
ax = plt.figure(figsize=(20,20))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


# The pil way (if you don't have matplotlib)
# image = wordcloud.to_image()
# image.show()