In [16]:
import pandas as pd
import numpy as np
import re
from pprint import pprint
import collections
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import models
from keras import layers
import json
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os

In [17]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['http', 'bully', 'bullying'])
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
df = pd.read_csv('../data/english_labels_with_tweets.csv')
print(df.Type.unique())
types=['self-disclosure', 'report', 'denial', 'accusation', 'cyberbullying']

df = df.loc[df['Type'].isin(types)]
print(df.Type.unique())
print(df)

['self-disclosure' nan 'report' 'denial' 'accusation' 'cyberbullying']
['self-disclosure' 'report' 'denial' 'accusation' 'cyberbullying']
                Tweet ID    User ID Bullying_Traces?             Type  \
0     105730486382497793  322329899                y  self-disclosure   
6     102533497637437441   70412906                y           report   
7     103441158314790912  221102655                y  self-disclosure   
15    103557373079199744  207378439                y  self-disclosure   
20    101495938232750080  331380807                y  self-disclosure   
...                  ...        ...              ...              ...   
2430  102730819231166465    6872532                y           report   
2434  104568731778285569  135040936                y           denial   
2435  105430902951251969  148111013                y  self-disclosure   
2436  107395299085008896  131482264                y  self-disclosure   
2440  103657243970502656  219840796                y       

In [19]:
# Convert to list
data = df.Text.values.tolist()

#remove links
data = [re.sub(r"http\S+", "", sent) for sent in data]

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['mr. walt, stop being a bully.']


In [5]:
from nltk.tag.stanford import StanfordNERTagger

In [6]:
#Set environmental variables programmatically.
#Set the classpath to the path where the jar file is located
os.environ['CLASSPATH'] = "C:/Users/erajkovic/Documents/APT/projekt/stanford-ner-2015-04-20/stanford-ner.jar"
#Set the Stanford models to the path where the models are stored
os.environ['STANFORD_MODELS'] = 'C:/Users/erajkovic/Documents/APT/projekt/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner'
#Set the java jdk path

java_path = "C:/Program Files/Java/jdk1.8.0_161/bin/java.exe"
os.environ['JAVAHOME'] = java_path


#Set the path to the model that you would like to use
stanford_classifier  =  'C:/Users/erajkovic/Documents/APT/projekt/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'

#Build NER tagger object
st = StanfordNERTagger(stanford_classifier, "C:/Users/erajkovic/Documents/APT/projekt/stanford-ner-2015-04-20/stanford-ner.jar")

#A sample text for NER tagging
text = 'srinivas ramanujan went to the united kingdom. There he studied at cambridge university.'

#Tag the sentence and print output
tagged = st.tag(str(text).split())
print(tagged)


[('srinivas', 'PERSON'), ('ramanujan', 'PERSON'), ('went', 'O'), ('to', 'O'), ('the', 'O'), ('united', 'LOCATION'), ('kingdom.', 'LOCATION'), ('There', 'O'), ('he', 'O'), ('studied', 'O'), ('at', 'O'), ('cambridge', 'LOCATION'), ('university.', 'O')]


In [20]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=False))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['mr', 'walt', 'stop', 'being', 'bully']]


In [9]:
def check_ner_tag(text, word):
    tagged = st.tag(text)
    for w, tag in tagged:
        if w == word:
            if tag == 'PERSON':
                return True
            else:
                return False
    return False

def remove_ner_person(texts):
    return [[word for word in doc if not check_ner_tag(doc, word)] for doc in texts]

data_words = remove_ner_person(data_words)

print(data_words[:1])

KeyboardInterrupt: 

In [21]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['mr', 'walt', 'stop', 'being', 'bully']


In [22]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def remove_no_dict_words(texts):
    return [[word for word in doc if word in glove_dict] for doc in texts]
    
GLOVE_DIM=100
glove_file = 'C:/Users/erajkovic/Downloads/glove.twitter.27B/glove.twitter.27B.100d.txt'
glove_dict = []
NB_WORDS = 4525   # duljina file-a vocab
glove = open(glove_file, encoding="utf8")
for line in glove:
    values = line.split()
    word = values[0]
    glove_dict.append(word)
glove.close()
print(glove_dict[:3])
    

['<user>', '.', ':']


In [23]:
# Remove Stop Words
data_words_nodict = remove_no_dict_words(data_words)
data_words_nostops = remove_stopwords(data_words_nodict)
print(data_words_nostops)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = remove_stopwords(data_lemmatized)


print(data_lemmatized[:1])


[['mr', 'walt', 'stop'], ['totally', 'one', 'hand', 'retaliatory', 'nature', 'pissy', 'great', 'see', 'stick', 'smackdown'], ['yes', 'thur', 'said', 'might', 'well', 'revs', 'dunno', 'friends', 'hahaha'], ['nigga', 'used', 'real', 'herbalist', 'idk', 'wham', 'ah', 'deal'], ['bobby', 'needs', 'threw', 'disgusting'], ['please', 'help', 'stop', 'workplace'], ['better', 'anti', 'ad', 'slogans', 'hey', 'bullies', 'since', 'us', 'well', 'send', 'flowers'], ['vc', 'sofres'], ['trans', 'strong', 'heart', 'donghae', 'bullied', 'awwww', 'teukie', 'hyung', 'poor', 'nano'], [], ['kids', 'adult', 'problem', 'learn', 'role', 'adults', 'play', 'epidemic', 'childs', 'play', 'coming', 'sept'], ['cl', 'jjongie', 'angry'], ['whatch', 'video', 'help', 'take', 'apart', 'ending'], ['whoa', 'deepest', 'truth', 'dont', 'truth', 'silence', 'toxic', 'thoughts', 'words'], ['wasnt', 'bullies', 'random', 'attack', 'lol', 'dont', 'even', 'pretend'], ['love', 'stomp', 'video', 'inspirational', 'think', 'enter'], ['d

[['mr', 'walt', 'stop']]


In [24]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
tfidf = TfidfModel(corpus)
corpus = tfidf[corpus]

# View
print(corpus[:1])
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus]

<gensim.interfaces.TransformedCorpus object at 0x00000184C51A4898>


[[('mr', 0.6335859932140154),
  ('stop', 0.30879506478162194),
  ('walt', 0.7093760618808077)],
 [('great', 0.25700816189994535),
  ('hand', 0.3228175067554949),
  ('nature', 0.34710577787863356),
  ('pissy', 0.38862685161104454),
  ('retaliatory', 0.38862685161104454),
  ('see', 0.1773899318685882),
  ('smackdown', 0.38862685161104454),
  ('stick', 0.3228175067554949),
  ('totally', 0.34710577787863356)],
 [('dunno', 0.4674744306628829),
  ('friend', 0.27731538174287),
  ('hahaha', 0.3675840590989703),
  ('revs', 0.4674744306628829),
  ('say', 0.23553617051989642),
  ('thur', 0.4674744306628829),
  ('well', 0.27731538174287)],
 [('deal', 0.4269922906682125),
  ('herbalist', 0.4269922906682125),
  ('idk', 0.4269922906682125),
  ('nigga', 0.35468621405318407),
  ('real', 0.26917282960977884),
  ('use', 0.26917282960977884),
  ('wham', 0.4269922906682125)],
 [('bobby', 0.6064054234924469),
  ('disgusting', 0.5037178622200957),
  ('need', 0.3532833040490017),
  ('throw', 0.503717862220095

In [25]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [26]:
# Print the Keyword in the n topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.010*"cyber" + 0.008*"mean" + 0.008*"never" + 0.007*"always" + '
  '0.006*"remember" + 0.006*"take" + 0.006*"ask" + 0.006*"read" + 0.005*"mom" '
  '+ 0.005*"day"'),
 (1,
  '0.009*"well" + 0.008*"want" + 0.008*"really" + 0.007*"video" + '
  '0.006*"advice" + 0.006*"perfect" + 0.005*"old" + 0.005*"earth" + '
  '0.005*"man" + 0.005*"like"'),
 (2,
  '0.022*"people" + 0.011*"kid" + 0.010*"time" + 0.009*"child" + 0.008*"love" '
  '+ 0.007*"abuse" + 0.006*"problem" + 0.005*"send" + 0.005*"book" + '
  '0.004*"new"'),
 (3,
  '0.025*"get" + 0.019*"see" + 0.012*"shit" + 0.010*"make" + 0.009*"hate" + '
  '0.009*"need" + 0.008*"call" + 0.008*"give" + 0.008*"friend" + '
  '0.007*"thing"'),
 (4,
  '0.015*"stop" + 0.014*"lol" + 0.013*"school" + 0.012*"know" + 0.008*"ass" + '
  '0.008*"feel" + 0.008*"go" + 0.007*"say" + 0.006*"think" + 0.006*"twitter"')]


In [27]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.250456795165178

Coherence Score:  0.6178474918448187


In [28]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [29]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
os.environ['MALLET_HOME'] = 'C:/Users/erajkovic/Downloads/mallet-2.0.8/mallet-2.0.8'
mallet_path = "C:/Users/erajkovic/Downloads/mallet-2.0.8/mallet-2.0.8/bin/mallet" # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [36]:
# Show Topics
pprint(ldamallet.show_topics(formatted=True))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  '0.167*"clap" + 0.167*"short" + 0.167*"admit" + 0.167*"fail" + '
  '0.167*"frescura" + 0.167*"pout" + 0.000*"dislike" + 0.000*"handbag" + '
  '0.000*"mag" + 0.000*"accept"'),
 (1,
  '0.143*"lol" + 0.071*"idiot" + 0.071*"prevention" + 0.071*"cute" + '
  '0.071*"leanne" + 0.071*"fool" + 0.071*"cyber" + 0.071*"ooooo" + '
  '0.071*"pulpit" + 0.071*"flush"'),
 (2,
  '0.250*"reese" + 0.250*"wooly" + 0.250*"hor" + 0.250*"oomf" + 0.000*"accept" '
  '+ 0.000*"racism" + 0.000*"handbag" + 0.000*"mag" + 0.000*"bec" + '
  '0.000*"define"'),
 (3,
  '0.400*"stop" + 0.200*"beatdown" + 0.200*"niggas" + 0.200*"jayden" + '
  '0.000*"accept" + 0.000*"racism" + 0.000*"handbag" + 0.000*"mag" + '
  '0.000*"homophobia" + 0.000*"perfect"'),
 (4,
  '0.125*"kid" + 0.125*"sofre" + 0.125*"lol" + 0.125*"praticando" + '
  '0.125*"sonrie" + 0.125*"happen" + 0.125*"lobak" + 0.125*"dfkm" + '
  '0.000*"accept" + 0.000*"define"')]

Coherence Score:  0.6882632065364931


In [37]:
#Function to Create Wordcloud
from wordcloud import WordCloud, STOPWORDS

def create_wordcloud(text):
    mask = np.array(open("../cloud.png"))
    wc = WordCloud(background_color="white",
        mask = None,
        max_words=3000,
        stopwords=stop_words,
        repeat=True)
    wc.generate(str(text))
    wc.to_file("wc.png")
    print('Word Cloud Saved Successfully')
    path="wc.png"
    display(open(path))

In [38]:
#Creating wordcloud for all tweets
create_wordcloud(df.Text.values)

Word Cloud Saved Successfully


<_io.TextIOWrapper name='wc.png' mode='r' encoding='cp1252'>

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=24, step=3)

In [None]:
# Show graph
limit=24; start=2; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)


In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
print(topic_counts)

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics