In [1]:
#Make output bigger for jupyter notebook
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 60em; }</style>"))

In [2]:
import time
import sys
import pycurl
import certifi
import re
from io import BytesIO 
import nltk, re, string, collections
from nltk.util import ngrams


#Options
dataURL = 'https://www.reddit.com/r/space/.json?count=250'
killCase =            1      #Make everything lowercase
parse =               1      #Basic parsing using parseRegex variable
replacebadRegex =     0      #Replace pattern in badRegex
replacebadChars =     1      #Replace characters in badChars
onlybasicChars =      0      #Only alphanumeric and spaces
splitbychar =         0      #Split by character (default is space)
ngramMin =            2      #Smallest ngram to look for
ngramMax =            20     #Largest ngram to look for
minngramCount =       2      #Minimum number of times an ngram must be repeated
ngramRedundant =      1      #Show non-unqiue ngrams inside of other larger ones
mostcommonMax =     100      #How many ngrams max to return from most_common()
responsesizeMax = 50000      #Largest response size to show
ignoreSSLwarn =       0      #Don't check SSL certificates for curl
showresponses =       1      #Show curl response and data


#Regex matches
badChars = [';', ':', '!', "*", "<", ">", "-", "\"", "="] 
badRegex = re.compile(r"<[^>]*>")
parseRegex = re.compile(r'"title": "(.*?)"', flags=re.I|re.M) #reddit titles
#badRegex = re.compile(r'wiki')
#badRegex = re.compile(r".*?<body>(.*?)</body>")
alphanumspaceRegex = re.compile(r"[^a-zA-Z0-9\s]")

#Curl page for data
b_obj = BytesIO() 
crl = pycurl.Curl() 
crl.setopt(crl.URL, dataURL)
if ignoreSSLwarn == 1:
    crl.setopt(pycurl.SSL_VERIFYPEER, 0) #trust invalid SSL
    crl.setopt(pycurl.SSL_VERIFYHOST, 0) #trust invalid SSL
crl.setopt(crl.WRITEDATA, b_obj)
crl.setopt(pycurl.USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0")
crl.setopt(pycurl.CAINFO, certifi.where())
crl.perform() 
crl.close()
get_body = b_obj.getvalue()
curlResponse = str(get_body.decode('utf8'))

#Print responses if enabled
if showresponses == 1:
    curlResponseSize = sys.getsizeof(curlResponse)
    #Don't print huge amounts of data
    if (curlResponseSize < responsesizeMax):
        print(curlResponse)
    else:
        print("Curl response too big for display, " + str(curlResponseSize) + " bytes, max is " + str(responsesizeMax))

Curl response too big for display, 117333 bytes, max is 50000


In [3]:
start = time.time()
data = curlResponse
combinedngram = ''

getdataSizeKB = lambda inputdata : str(str(round(sys.getsizeof(inputdata) / 1024, 2)) + "\tKB")

if killCase == 1:
    data = data.lower()

if parse == 1:
    #print(data)
    print("Matching regex: " + str(parseRegex.pattern))
    print("Before size:\t" + getdataSizeKB(data))
    parsedfindall = sorted(set(re.findall(parseRegex, data))) #sort/uniq to remove repeated found patterns of same text
    parsedData = ''
    for item in parsedfindall:
        parsedData = parsedData + str(item) + "\n"
    data = parsedData
    print("After size:\t" + getdataSizeKB(data))

if replacebadRegex == 1:
    print("\nRemoving data matching regex: " + str(badRegex.pattern))
    print("Before size:\t" + getdataSizeKB(data))
    data = re.sub(badRegex, ' ', data) 
    print("After size:\t" + getdataSizeKB(data))

if replacebadChars == 1:
    print("\nRemoving the following chars: " + str(badChars))
    print("Before size:\t" + getdataSizeKB(data))
    for i in badChars : 
        data = data.replace(i, '') 
    print("After size:\t" + getdataSizeKB(data))

if onlybasicChars == 1:
    print("\nRemoving non-alphanumeric except spaces: " + str(alphanumspaceRegex.pattern))
    print("Before size:\t" + getdataSizeKB(data))
    "".join(i for i in data if ord(i)<128)
    data = re.sub(alphanumspaceRegex, '', data)
    print("After size:\t" + getdataSizeKB(data))

#Single character splitting vs default of space
if splitbychar == 1:
    print("Splitting by character")
    def split(word): 
        return [char for char in word]  
    tokenized = split(data)
else: 
    tokenized = data.split()
    #print(tokenized)
    
ngramstart = time.time()
#iterate ngram length from ngramMin to ngramMax
for i in range(ngramMax,(ngramMin - 1),-1):
    #print('ngram length:' + str(i) + '\n')
    ngramResults = ngrams(tokenized, i)
    ngramFreq = collections.Counter(ngramResults)
    ngramCommon = ngramFreq.most_common(mostcommonMax)
    #print(ngramCommon)
    #Split results for text comparison
    for list in ngramCommon:
        ngramStr = ' '.join(str(character) for character in list[0])
        ngramCount = list[1]
        if ngramRedundant == 1:
            if ngramCount > minngramCount:
                combinedngram = combinedngram + str(ngramCount) + "\t" + ngramStr + "\n"
        else:
            #Only add if not found in larger match
            if combinedngram.find(ngramStr) == -1 and ngramCount > minngramCount:
                combinedngram = combinedngram + str(ngramCount) + "\t" + ngramStr + "\n"
            

print("\n" + ("-" * 75))
print("\nCommon ngrams, longest to shortest: " + "\n\n" + combinedngram)
print("-" * 75)
print("ngram search time:\t", time.time() - ngramstart, "seconds")
print("Total processing time:\t", time.time() - start, "seconds")
print("\nSource:\t" + dataURL)
print("-" * 75)

#Show response if enabled
if showresponses == 1:
    print("\n\nshowresponses = 1, displaying data...")
    curlResponseSize = sys.getsizeof(curlResponse)
    dataSize = sys.getsizeof(data)
    combinedngramSize = sys.getsizeof(combinedngram)
    #print(combinedngramSize,dataSize,curlResponseSize)
    #Don't print huge amounts of data
    if (combinedngramSize < responsesizeMax):
        print("combinedngram:\n\n" + combinedngram)
    else:
        print("combinedngram larger than max response size in config - " + str(combinedngramSize) + " bytes, max is " + str(responsesizeMax))
    if (dataSize < responsesizeMax):
        print("Data:\n\n" + data)
    else:
        print("data larger than max response size in config - " + str(dataSize) + " bytes, max is " + str(responsesizeMax))


Matching regex: "title": "(.*?)"
Before size:	114.58	KB
After size:	2.76	KB

Removing the following chars: [';', ':', '!', '*', '<', '>', '-', '"', '=']
Before size:	2.76	KB
After size:	2.75	KB

---------------------------------------------------------------------------

Common ngrams, longest to shortest: 

3	of space

---------------------------------------------------------------------------
ngram search time:	 0.011010885238647461 seconds
Total processing time:	 0.014000177383422852 seconds

Source:	https://www.reddit.com/r/space/.json?count=250
---------------------------------------------------------------------------


showresponses = 1, displaying data...
combinedngram:

3	of space

Data:

20 years ago, the compton gamma ray observatory plummeted through earth's atmosphere and splashed into the pacific ocean, ending its nearly decadelong quest to explore the highenergy cosmos.
[xpost] we are the spacex software team, ask us anything
\
a complete guide to saturn and its moons, e