In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.manifold import TSNE
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import json
import re
import time


def display_topics(model, feature_names, no_top_words):

    for topic_idx, topic in enumerate(model.components_):
        # print (len (topic))
        print ("Topic", (topic_idx))
        print (" ", ([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print (model.components_)
    dimenstion_reduction(model.components_)

def dimenstion_reduction(features):
    x_embedd = TSNE(n_components=2).fit_transform(features)
    print (x_embedd)


data = []
final_data = []
with open('data/reddit/data_reddit.json') as f:
    for line in f:
        data = json.loads(line)
        if data["subreddit"].lower() == "beatingwomen" \
                or data["subreddit"].lower() == "fatpeoplehate" \
                or data["subreddit"].lower() == "transfags" \
                or data["subreddit"].lower() == "hamplanethatred" \
                or data["subreddit"].lower() == "neofag" \
                or data["subreddit"].lower() == "shitniggerssay" \
                or data["subreddit"].lower() == "niggers" \
                or data["subreddit"].lower() == "coontown" \
                or data["subreddit"].lower() == "physical_removal" \
                or data["subreddit"].lower() == "incels":

            #lower case the text
            #remove links
            token_data = re.sub (r"http\S+", "",data["body"].lower())
            #tokenize
            token_data = word_tokenize(token_data)
            # print (token_data)
            for word in token_data:
                if len(word)< 3:
                    token_data.remove (word)

             #lemmetize the data
            lemmatizer = WordNetLemmatizer ()
            lem = [lemmatizer.lemmatize (word) for word in token_data]
            lem = [lemmatizer.lemmatize (word, pos="v") for word in lem]
            clean_data = " ".join(lem)
            # print (clean_data)

            final_data.append (clean_data)

# print (final_data)
documents = final_data

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.65, min_df=100, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.65, min_df=100, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

t1 = time.time()
# Run NMF
nmf = NMF(n_components=no_topics, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
print (time.time() - t1)
print (nmf)


# # Run LDA
t2 = time.time()
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.).fit(tf)
#
print (time.time() - t2)
no_top_words = 15
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
#
# def visualize_topics(topic_data):
#     n_sne = 1000
#

7.786864995956421
NMF(alpha=0.1, beta_loss='frobenius', init='nndsvd', l1_ratio=0.5,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)




72.73963809013367
Topic 0
  ['people', 'think', 'know', 'make', 'shit', 'want', 'good', 'thing', 'ha', 'time', 'gt', 'doe', 'work', 've', 'come']
Topic 1
  ['fuck', 'lover', 'shit', 'stupid', 'bitch', 'fat', 'idiot', 'cunt', 'retard', 'man', 'god', 'shut', 'piece', 'holy', 'dumb']
Topic 2
  ['faggot', 'lol', 'post', 'ban', 'sjw', 'gaf', 'gay', 'thread', 'beta', 'mod', 'love', 'sub', 'game', 'guy', 'little']
Topic 3
  ['wa', 'year', 'time', 'school', 'kid', 'tell', 'guy', 'saw', 'day', 'think', 'ago', 'friend', 'cop', 'shoot', 'old']
Topic 4
  ['like', 'look', 'act', 'sound', 'shit', 'guy', 'treat', 'girl', 'human', 'talk', 'feel', 'dog', 'animal', 'woman', 'smell']
Topic 5
  ['hate', 'fat', 'people', 'sub', 'god', 'bitch', 'jew', 'coontown', 'cunt', 'really', 'hat', 'racist', 'reason', 'gay', 'crime']
Topic 6
  ['white', 'trash', 'people', 'woman', 'man', 'race', 'asian', 'jew', 'men', 'kill', 'negro', 'girl', 'country', 'privilege', 'hispanic']
Topic 7
  ['black', 'people', 'crime', '