#Reddit API

In [None]:
# Install the necessary packages (Run only once)
!pip install praw

In [3]:
import praw
import json
import time

In [4]:
my_client_id = 'my_client_id'
my_client_secret = 'my_client_secret'
my_user_agent = 'my_user_agent'
my_username = 'my_username'
my_password = 'my_password'

def getMyCredentials():
  return (my_client_id, my_client_secret, my_user_agent, my_username, my_password)

In [5]:
# getMyCredentials is a function that I use to fetch my information.
# You can hard code variable values if desired
my_client_id, my_client_secret, my_user_agent, my_user, my_password = getMyCredentials()

# Create the Reddit instance
reddit = praw.Reddit(
    client_id = my_client_id, 
    client_secret = my_client_secret, 
    password = my_password,
    user_agent = my_user_agent,
    username = my_user,
    check_for_async = False # This is to prevent warning messages when running on google collab
)
reddit.read_only = True

In [6]:
# A list of all of the subreddits that we want to get posts from
subredditList = [
 'istj',
 'istp',
 'isfj',
 'isfp',
 'infj',
 'infp',
 'intj',
 'intp',
 'estp',
 'estj',
 'esfp',
 'esfj',
 'enfp',
 'enfj',
 'entp',
 'entj'
]

In [7]:
# Function to get the post titles and post bodies, given a subreddit
def getPosts(subredditName, limit = 100000, metric = 'top'):
  # Create a subreddit instance (PRAW)
  subreddit = reddit.subreddit(subredditName)

  # Based on which metric we have requested, PRAW libary pulls  
  # limit number of posts from the subreddit based on the metric
  if metric == 'hot':
    posts = subreddit.hot(time_filter="all", limit = limit)
  elif metric == 'new':
    posts = subreddit.new(time_filter="all", limit = limit)
  else:
    posts = subreddit.top(time_filter="all", limit = limit)
  
  # For all of the posts we have, we check if we have a text body of more than
  # 50 characters, if not, we disregard them
  filteredPosts = [post for post in posts if len(post.selftext) > 50]

  # For those posts with 50 characters, we get their title and body text
  filteredEntries = [(post.title + post.selftext) for post in filteredPosts]

  # TODO: Also include the comments for the analysis

  # Filtered entries list is a list of posts (represented as strings) from the
  # given subreddit
  return filteredEntries

def getPostsFromSubredditList(subredditList, limit = 100000):
  dataset = []
  labels = []

  for subredditIdx in range(16):
    time.sleep(30)

    subreddit = subredditList[subredditIdx]
    print(f"Getting posts from r/{subreddit}")

    currentEntries = getPosts(subreddit, limit)
    dataset = dataset + currentEntries

    num_labels = len(currentEntries)
    currentLabels = [subredditIdx] * num_labels
    labels = labels + currentLabels

  return dataset, labels


If we don't have the dataset, we can run:

In [11]:
dataset, labels = getPostsFromSubredditList(subredditList, limit = 100000)

data = {
    'dataset' : dataset,
    'labels' : labels
}

with open('/content/reddit_data.json', 'w') as f:
    json.dump(data, f)

If we have our dataset already, we can run:

In [8]:
with open('/content/reddit_data.json', 'r') as f:
    data = json.load(f)

dataset, labels = data['dataset'], data['labels']

#Pre-Processing Text

In [9]:
import string
import json
from typing import List
from keras.preprocessing.text import Tokenizer

In [10]:
stopwordsList = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
personalityList = [
 'istj',
 'istp',
 'isfj',
 'isfp',
 'infj',
 'infp',
 'intj',
 'intp',
 'estp',
 'estj',
 'esfp',
 'esfj',
 'enfp',
 'enfj',
 'entp',
 'entj']

stopwords = set(stopwordsList)
personalities = set(personalityList)


In [11]:
def isLegalToken(token : str) -> bool:
  flagList = [personality in token.lower() for personality in personalities]
  condition = (
      token.isalpha()
      and token not in stopwords
      and (True not in flagList)
      and len(token) > 2
  )
  return condition

def clean_post(post : str) -> List[str]:
  # Split the text into tokens (by whitespace)
  post = post.replace(".", " ")
  tokens = post.split()
  # Filter out punctuations, numbers, and also make everything lowercase
  # (https://www.delftstack.com/howto/python/python-replace-multiple-characters/#use-str.replace-to-replace-multiple-characters-in-python)
  transTable = str.maketrans(string.ascii_uppercase, string.ascii_lowercase, string.punctuation + string.digits)
  tokens = [token.translate(transTable) for token in tokens]
  # Remove illegal tokens (check isLegalToken implementation)
  tokens = [token for token in tokens if isLegalToken(token)]
  return tokens

def tokens_to_text(tokens : List[str]) -> str:
  return ' '.join(tokens)

def preprocess_post(post : str) -> str:
    tokens = clean_post(post)
    cleanText = tokens_to_text(tokens)
    return cleanText

def preprocess_all_posts(posts : List[str]) -> List[str]:
  return [preprocess_post(post) for post in posts]



If we haven't cleared our posts already, we can run:

In [7]:
processedData = {
    'dataset' : preprocess_all_posts(dataset),
    'labels' : labels
}
with open('/content/processed_reddit_data.json', 'w') as f:
    json.dump(processedData, f)

If we have, we can run:

In [12]:
with open('/content/processed_reddit_data.json', 'r') as f:
    processedData = json.load(f)

Let's see how different the processed text is compared to our original. Let's use the second element on our training set as an example.

In [13]:
processedData['dataset'][2]

'much freaking tired stereotypes usually described boring robotic emotionless sure know deal dont get started memes sure arent even created thus portrayed plain people entire universe done relationship four years hope many years come dedicated downtoearth loving person ever encountered entire life usually read incompatibility pairing guess true people wouldnt trade world maybe dont know much mbti share functions right lean lean instance come different balanced relationship stormy sea lighthouse land going talking know specially hes one know extremely funny witty come punniest puns ever straight face whole room crack laughter almost like awfully bad dad jokes actually really funny emotionless robots say well somebody might think never close cant pass façade feel like everybody else closest people access emotional expression suffer love cry take time look throught serene presence get know name say loyal loyal logistician point get trapped might try take advantage might express support wo

In [14]:
dataset[2]

'ISTJ are so much more.I\'m so freaking tired of the ISTJ stereotypes. Usually ISTJ are described as boring, robotic, emotionless. You sure know the deal. And don\'t get me started with the memes. I\'m sure most of them aren\'t even created by ISTJs themselves and thus they are portrayed as the most plain people in the entire universe. I\'m done.\n\nI have been in a relationship with a ISTJ for four years now and I hope for many years to come. He is the most dedicated, down-to-earth and loving person I ever encountered in my entire life. I am myself an INFP. I usually read about the incompatibility of ISTJ and INFP pairing, and I guess it can be true to some people, but I wouldn\'t trade my ISTJ for the world. Maybe I don\'t know much about the MBTI but we share functions, right? I can lean on my Si, he can lean on his Fi, for instance. We can come as very different, but it\'s a balanced relationship. I\'m the stormy sea, he is the lighthouse on land.\n\nI am going to be talking about 

#Create the Tokenizer

In [15]:
import random
import numpy as np
from sklearn.utils import shuffle

In [16]:
ones = ['e', 'n', 'f', 'p']

personality_to_vector = {
    'istj' : np.array([0,0,0,0]),
    'istp' : np.array([0,0,0,1]),
    'isfj' : np.array([0,0,1,0]),
    'isfp' : np.array([0,0,1,1]),
    'infj' : np.array([0,1,1,0]),
    'infp' : np.array([0,1,1,1]),
    'intj' : np.array([0,1,0,0]),
    'intp' : np.array([0,1,0,1]),
    'estp' : np.array([1,0,0,1]),
    'estj' : np.array([1,0,0,0]),
    'esfp' : np.array([1,0,1,1]),
    'esfj' : np.array([1,0,1,0]),
    'enfp' : np.array([1,1,1,1]),
    'enfj' : np.array([1,1,1,0]),
    'entp' : np.array([1,1,0,1]),
    'entj' : np.array([1,1,0,0])
}

vector_code_to_personality = {
    '0000' : 'istj',
    '0001' : 'istp',
    '0010' : 'isfj',
    '0011' : 'isfp',
    '0110' : 'infj',
    '0111' : 'infp',
    '0100' : 'intj',
    '0101' : 'intp',
    '1001' : 'estp',
    '1000' : 'estj',
    '1011' : 'esfp',
    '1010' : 'esfj',
    '1111' : 'enfp',
    '1110' : 'enfj',
    '1101' : 'entp',
    '1100' : 'entj'
}

personality_ID_to_vector = {
    0   : np.array([0,0,0,0]),
    1   : np.array([0,0,0,1]),
    2   : np.array([0,0,1,0]),
    3   : np.array([0,0,1,1]),
    4   : np.array([0,1,1,0]),
    5   : np.array([0,1,1,1]),
    6   : np.array([0,1,0,0]),
    7   : np.array([0,1,0,1]),
    8   : np.array([1,0,0,1]),
    9   : np.array([1,0,0,0]),
    10  : np.array([1,0,1,1]),
    11  : np.array([1,0,1,0]),
    12  : np.array([1,1,1,1]),
    13  : np.array([1,1,1,0]),
    14  : np.array([1,1,0,1]),
    15  : np.array([1,1,0,0])
}

In [17]:
n_features = 600

tokenizer = Tokenizer(num_words = n_features)
tokenizer.fit_on_texts(processedData['dataset'])

X = tokenizer.texts_to_matrix(processedData['dataset'], mode='freq')
Y = np.array(processedData['labels'])

X, Y = shuffle(X, Y)

m = X.shape[0]
test_count = (m // 10)

X_train, Y_train = X[:test_count], Y[:test_count]
X_test, Y_test = X[test_count:], Y[test_count:]

Y_train_4v = np.array([personality_ID_to_vector[val] for val in Y_train])
Y_test_4v = np.array([personality_ID_to_vector[val] for val in Y_test])

In [18]:
X.shape

(6814, 600)

#Create the DNN model

In [19]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.losses import BinaryCrossentropy
from keras.losses import SparseCategoricalCrossentropy
from keras.utils import np_utils


In [None]:
model_4v = Sequential([
    Dense(400, input_shape = (n_features, ), activation = 'relu'),
    Dense(200, activation = 'relu'),
    Dense(100, activation = 'relu'),
    Dense(50, activation = 'relu'),
    Dense(4, activation = 'sigmoid')
])

model_4v.compile(
    loss = BinaryCrossentropy(),
    optimizer = keras.optimizers.Adamax(),
    metrics = "accuracy"
)

fit_history = model_4v.fit(
    X_train,
    Y_train_4v,
    epochs = 50,
    batch_size = 64,
    validation_split = 0.1
)

In [None]:
model_16 = Sequential([
    Dense(400, input_shape = (n_features, ), activation = 'relu'),
    Dense(200, activation = 'relu'),
    Dense(200, activation = 'relu'),
    Dense(100, activation = 'relu'),
    Dense(16, activation = 'softmax')
])

model_16.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = keras.optimizers.Adam(),
    metrics = "accuracy"
)

fit_history = model_16.fit(
    X_train,
    Y_train,
    epochs = 50,
    batch_size = 64,
    validation_split = 0.1
)

In [24]:
def catagory_wise_accuracy_4(model, X_test, Y_test):
  predictions = model.predict(X_test)
  outcomes = (predictions > 0.5).astype(int)

  correct = (outcomes == Y_test)
  correct = correct.astype(int)

  m = correct.shape[0]
  correctCnt = np.sum(correct, axis = 0)

  accuracy = correctCnt / m

  print(accuracy)
  

In [25]:
def catagory_wise_accuracy_16(model, X_test, Y_test):
  probabilities = model.predict(X_test)
  predictions = np.argmax(probabilities, axis = 1)
  outcomes = np.array([personality_ID_to_vector[p] for p in predictions])

  correct = (outcomes == Y_test)
  correct = correct.astype(int)

  m = correct.shape[0]
  correctCnt = np.sum(correct, axis = 0)

  accuracy = correctCnt / m

  print(accuracy)
  

In [29]:
catagory_wise_accuracy_4(model_4v, X_test, Y_test_4v)

[0.56497636 0.65970977 0.53937714 0.6339475 ]


In [30]:
catagory_wise_accuracy_16(model_16, X_test, Y_test_4v)

[0.56807435 0.64813305 0.51443013 0.59481494]


In [54]:
def getAccuracy_4v(X_test, Y_test):
  predictions = model_4v.predict(X_test)
  outcomes = (predictions > 0.5).astype(int)

  correct = (outcomes == Y_test)
  correct = correct.astype(int)

  m = correct.shape[0]

  correctCnt = np.sum(correct, axis = 1)
  fullMatch = (correctCnt == 4).astype(int)

  fullMatchCnt = np.sum(fullMatch, axis = 0)
  accuracy = fullMatchCnt / m

  return accuracy


In [55]:
getAccuracy_4v(X_test, Y_test_4v)

0.13109408120006522

In [56]:
model_16.evaluate(X_test, Y_test)



[8.862643241882324, 0.13044187426567078]

#Predict Writing

In [57]:
def predict_writing_from_text(post):
  data = preprocess_post(post)

  X_predict = tokenizer.texts_to_matrix([data], mode='freq')
  outcome = model_4v.predict(X_predict)

  outcomeLabel = outcome > 0.5
  outcomeLabel = outcomeLabel.astype(int)[0]
  outcomeList = [str(label) for label in outcomeLabel]
  print(outcomeList)
  outcomeStr = ''.join(outcomeList)

  personality = vector_code_to_personality[outcomeStr]

  print(f'You are {personality.upper()}')
  print(f'With the following probabilities')
  print(outcome[0])

In [58]:
# Trial writing taken from Stephen King's It.
trial = "Maybe there aren't any such things as good friends or bad friends - maybe there are just friends, people who stand by you when you're hurt and who help you feel not so lonely. Maybe they're always worth being scared for, and hoping for, and living for. Maybe worth dying for too, if that's what has to be. No good friends. No bad friends. Only people you want, need to be with; people who build their houses in your heart."
predict_writing_from_text(trial)

['1', '0', '1', '1']
You are ESFP
With the following probabilities
[9.3774706e-01 1.0464571e-04 5.5766761e-01 8.2144946e-01]
