In [2]:
import zstandard
import os
import json
import sys
from datetime import datetime
import logging.handlers


# Notebook-wide logger used by the helper functions and the parsing cell.
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# logging.getLogger returns the same Logger object on every call, so re-running
# this cell would otherwise attach one more StreamHandler each time and every
# message would be printed once per attached handler.
if not log.handlers:
	log.addHandler(logging.StreamHandler())


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode cleanly, then return the text.

	Each attempt reads ``chunk_size`` bytes, appends them to any bytes carried
	over from earlier attempts, and calls ``bytes.decode()`` with its default
	encoding. A ``UnicodeDecodeError`` triggers another read and a retry on the
	combined bytes.

	Args:
		reader: Object exposing ``read(size)`` that returns ``bytes``.
		chunk_size: Number of bytes requested per read.
		max_window_size: Once the running byte counter exceeds this value, a
			failed decode is treated as fatal instead of retried.
		previous_chunk: Undecoded bytes to prepend to the first read, or ``None``.
		bytes_read: Starting value of the running byte counter.

	Returns:
		The decoded string (empty when the reader returns no bytes and nothing is carried over).

	Raises:
		UnicodeError: If decoding still fails after the counter passes ``max_window_size``.
	"""
	pending = previous_chunk
	requested_total = bytes_read
	while True:
		data = reader.read(chunk_size)
		# The counter advances by the requested size, not by len(data).
		requested_total += chunk_size
		if pending is not None:
			data = pending + data
		try:
			return data.decode()
		except UnicodeDecodeError:
			if requested_total > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {requested_total:,} bytes")
			log.info(f"Decoding error with {requested_total:,} bytes, reading another chunk")
			# Keep everything read so far and retry with more bytes appended.
			pending = data


def read_lines_zst(file_name):
	"""Stream the lines of a zstandard-compressed text file.

	This is a generator. Each item is a ``(line, position)`` tuple where
	``line`` is one newline-delimited line (without the ``"\\n"``) and
	``position`` is ``file_handle.tell()`` on the compressed file at the moment
	the line is yielded, which callers can use for progress reporting.

	Args:
		file_name: Path of the ``.zst`` file to read.

	Yields:
		``(line, position)`` tuples in file order. If the decompressed data does
		not end with a newline, the trailing partial line is yielded as well.

	Raises:
		UnicodeError: Propagated from ``read_and_decode`` when a chunk cannot be decoded.
	"""
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		# try/finally so the reader is closed even when the consumer stops
		# iterating early or an exception propagates out of the loop.
		try:
			while True:
				chunk = read_and_decode(reader, 2**27, (2**29) * 2)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				# The last element may be an incomplete line; hold it back until
				# the next chunk supplies the rest.
				for line in lines[:-1]:
					yield line, file_handle.tell()

				buffer = lines[-1]

			# Data that does not end in "\n" leaves its final line in the buffer;
			# emit it instead of silently dropping the record.
			if buffer:
				yield buffer, file_handle.tell()
		finally:
			reader.close()

In [3]:
# Parse every line of the compressed dump into `objs`, logging progress as we go.
file_path = "data/mbti_submissions.zst"
file_size = os.stat(file_path).st_size  # compressed size, used for the progress percentage
file_lines = 0
file_bytes_processed = 0
created = None  # timestamp of the most recent successfully parsed record
field = "subreddit"
value = "mbti"
bad_lines = 0
objs = []
for line, file_bytes_processed in read_lines_zst(file_path):
    try:
        obj = json.loads(line)
        # Appended before the field checks below, so a record that lacks one of
        # those fields is still kept in `objs` even though it counts as a bad line.
        objs.append(obj)
        created = datetime.fromtimestamp(int(obj['created_utc']))
        temp = obj[field] == value
    except (KeyError, TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass. TypeError/ValueError also
        # cover a non-numeric `created_utc` or a line that is not a JSON object,
        # so one malformed record no longer aborts the whole run.
        bad_lines += 1
    file_lines += 1
    if file_lines % 100000 == 0:
        # `created` is still None if no record has parsed successfully yet.
        created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "unknown"
        log.info(f"{created_text} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")

2021-04-23 02:00:41 : 100,000 : 0 : 30,016,175:61%
Complete : 192,593 : 0


In [13]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first import cell.
import pandas as pd
# Load the parsed records accumulated in `objs` into a single DataFrame.
df = pd.DataFrame(objs)

In [41]:
# List the column labels of the submissions frame.
df.columns

Index(['archived', 'author', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'brand_safe',
       'can_gild',
       ...
       'poll_data', 'upvote_ratio', 'is_created_from_ads_ui', 'retrieved_utc',
       'top_awarded_type', 'url_overridden_by_dest', 'gallery_data',
       'is_gallery', 'call_to_action', 'collections'],
      dtype='object', length=128)

In [26]:
# Titles of the submissions whose author matches, selected in one .loc step
# (boolean row mask plus column label).
df.loc[df['author'] == "Hydra-Sagaria", "title"]

127592                                Attitude Psyche Stuff
133653                                  Most immature mbti?
134100    Which MBTI is the most disconnected from world...
134271                    Which type gets bored less often?
134391    Which MBTI is the most physically active? (Int...
                                ...                        
191511    Which type when asked the question "Why?" they...
191797    Which type gets annoyed when having to assume ...
191932                    Guess which type I am most likely
192134    Which type when in a classroom they are most i...
192282    Who would get the most nervous when it is thei...
Name: title, Length: 866, dtype: object

In [30]:
# Frequency of each `send_replies` value; value_counts() leaves out missing values by default.
df['send_replies'].value_counts()

send_replies
True     167962
False      4803
Name: count, dtype: int64

In [23]:
# Number of submissions per author, ignoring rows whose author is "[deleted]".
df.loc[df['author'] != "[deleted]", 'author'].value_counts()

author
Hydra-Sagaria           866
Twili95                 560
virekkartgod            485
Wondering_Fairy         454
RTX2080Ti0              356
                       ... 
Equiin                    1
help-help-help-           1
just_a_normal_persom      1
albumparty                1
High__IQ__ENTJ            1
Name: count, Length: 42474, dtype: int64

In [22]:
# Frequency of each author flair text, ignoring rows whose author is "[deleted]".
df.loc[df['author'] != "[deleted]", 'author_flair_text'].value_counts()

author_flair_text
INTP                                  9536
INFP                                  8407
ENTP                                  8067
INTJ                                  6310
INFJ                                  6061
                                      ... 
INTP?                                    1
[INFP] 4w3                               1
ISTP (yes, I'm sure. Stop asking.)       1
[INFP 4w5]                               1
[I][s]uck[F]at[P]enis                    1
Name: count, Length: 287, dtype: int64

- "author": username of the account that submitted the post
- "author_flair_css_class": label
- "title": title of the post
- "selftext": body text of the post
- "is_self": indicates whether the post is a text post (i.e. not linking outside) or not — I used this to filter out any image or URL posts.

In [39]:
# Number of fields present in one sample record from `objs`.
len(objs[100100])

78

In [20]:
# `obj` is whatever record the parsing loop assigned last.
# NOTE(review): the execution count suggests this cell ran in a different session
# state than the load above — confirm that the loaded records carry a 'body' key.
obj['body']

"i have two friends who were in a relationship for a little while; one of them is ESTP 7w8 sp/sx, other is ENTP 7w8 sx/sp. I wasn't really good friends with them during the relationship so I don't know what they were like together but I imagine it was extremely chaotic at all times."

In [7]:
# Result of the most recent `obj[field] == value` comparison made by the parsing loop.
temp

True

In [8]:
# Show the logger's repr (name and level).
log

<Logger bot (DEBUG)>