In [36]:
import pandas as pd
import gensim
from gensim import corpora


In [18]:
# !python -m spacy download en_core_web_lg
!pip3 install spacy



In [20]:
import spacy
import re
# import classla
import spacy.cli
spacy.cli.download("en_core_web_sm")
from nltk.stem import PorterStemmer,SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nlp_eng = spacy.load("en_core_web_sm", disable=['tagger', 'parser', 'ner'])
eng_stopwords = set(nlp_eng.Defaults.stop_words)
slo_stopwords = set(stopwords.words('slovene'))

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
def base_preprocessing(text):
    """
    Applies different steps of preprocessing to a text.
    Preprocessing includes:
    - remove all emoticons
    - remove non-standard lexical tokens (which are not numeric or alphabetical)
    - remove url and @name mentions
    - convert all letters to lower case

    Arguments
    ----------
    text:                   AnyStr
                            Text which should be converted by preprocessing
    Returns
    -------
    preprocessed_text:      String
                            Text which is converted by preprocessing.
    """
    EMOJI_PATTERN = re.compile(
        "(["
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "])"
    )
    text = re.sub(EMOJI_PATTERN,"",text)
    # remove (twitter) urls
    text = re.sub(r"http://t.co/[a-zA-Z0-9čČšŠžŽ]+", "", text)
    text = re.sub(r"https://t.co/[a-zA-Z0-9čČšŠžŽ]+", "", text)

    # remove all hashtags or @name Mentions (Usernames only allowed to includes characters A-Z, 0-9 and underscores)
    text = re.sub(r"[@#][a-zA-Z0-9_čČšŠžŽ]+", "", text)

    # remove non alphabetical characters
    text = re.sub(r"[^a-zA-Z0-9\sčČšŠžŽ]", "", text)

    # remove multiple white spaces
    text = re.sub(' +', ' ', text)

    # convert all letters to lower case
    text = text.lower()

    return text.strip()


In [6]:
def eng_preprocessing(text, remove_stopwords=True, do_lemmatization=True):
    """
    Applies different steps of preprocessing to a text.
    Preprocessing includes:
    - remove standard stopwords (english stopwords from spacy)
    - perform lemmatizing

    Arguments
    ----------
    text:                   AnyStr
                            Text which should be converted by preprocessing
    Returns
    -------
    preprocessed_text:      String
                            Text which is converted by preprocessing.
    """
    
    text = base_preprocessing(text)

    tokens = []

    # split text to single words
    words = word_tokenize(text)

    lemmer = WordNetLemmatizer()

    # remove stopwords and words with length 1
    for word in words:
        if not remove_stopwords or word not in eng_stopwords:
            if do_lemmatization:
                word = lemmer.lemmatize(word)
            tokens.append(word)

    # convert tokens back to text
    preprocessed_text = ' '.join([str(element) for element in tokens])
    return preprocessed_text

In [22]:
df = pd.read_csv("../data/final_data/eng/binary/data.csv")

In [45]:
df['preprocessed'] = df['Text'].apply(eng_preprocessing)

In [55]:
# preprocessed = df['preprocessed'].to_list()
df['corpus'] = df['preprocessed'].apply(lambda x : x.split(" "))
corpus  = df['corpus'].to_list()
dictionary = corpora.Dictionary(corpus)
df['corpus'] = df['corpus'].apply(lambda x: dictionary.doc2bow(x))

In [56]:
df['corpus'].head()

0                                             [(0, 1)]
1    [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1...
2    [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), ...
3    [(0, 1), (14, 1), (18, 1), (19, 1), (20, 1), (...
4    [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), ...
Name: corpus, dtype: object

In [42]:
df['corpus'].apply(lambda x : x.split(" "))

0                                                 [merkel]
1        [expect, woman, asking, men, longer, intereste...
2        [groping, people, public, wasnt, illegal, what...
3        [merkel, possible, person, charge, worse, obam...
4        [know, mean, need, pas, law, making, legal, ca...
                               ...                        
75699    [isnt, abt, mariah, carey, ok, indirecting, ur...
75700             [shes, mermaid, fat, fuckhead, gsasxagh]
75701    [u, try, u, dirty, nigga, unlike, im, educated...
75702    [fool, like, u, born, jap, momthrow, ur, bar, ...
75703    [nigger, think, hot, gmengwomen, dont, think, ...
Name: preprocessed, Length: 75704, dtype: object

In [28]:
tokens = [sentence.split(" ") for sentence in preprocessed]

In [31]:
# dictionary = corpora.Dictionary(tokens)
# corpus = [dictionary.doc2bow(text) for text in tokens]

In [57]:
corpus = df['corpus'].to_list()
corpus

[[(0, 1)],
 [(1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(0, 1),
  (14, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 3),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 2),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 2),
  (47, 1)],
 [(48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(0, 1), (35, 1), (57, 1), (58, 1), (59, 1)],
 [(14, 2),
  (22, 1),
  (35, 2),
  (43, 1),
  (44, 1),
  (51, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (

In [58]:
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.072*"white" + 0.027*"black" + 0.025*"jew" + 0.017*"racist" + 0.014*"race" + 0.014*"s" + 0.013*"youtube" + 0.011*"tweet" + 0.011*"old" + 0.010*"like"')
(1, '0.026*"paki" + 0.020*"school" + 0.015*"redneck" + 0.015*"child" + 0.014*"muslim" + 0.008*"christian" + 0.008*"crime" + 0.007*"rape" + 0.006*"kid" + 0.006*"ta"')
(2, '0.106*"nigger" + 0.029*"fat" + 0.023*"skank" + 0.020*"ugly" + 0.013*"shithead" + 0.013*"die" + 0.011*"aint" + 0.010*"yo" + 0.010*"n" + 0.010*"baby"')
(3, '0.060*"woman" + 0.028*"fucktard" + 0.027*"men" + 0.023*"hate" + 0.013*"man" + 0.013*"jesus" + 0.011*"teacher" + 0.007*"feminist" + 0.007*"gop" + 0.007*"horrible"')
(4, '0.022*"retarded" + 0.020*"people" + 0.018*"like" + 0.017*"dont" + 0.013*"think" + 0.012*"know" + 0.010*"want" + 0.007*"thing" + 0.007*"right" + 0.007*"thats"')
(5, '0.030*"rt" + 0.014*"trump" + 0.014*"country" + 0.009*"american" + 0.007*"america" + 0.006*"home" + 0.006*"vote" + 0.006*"state" + 0.006*"need" + 0.006*"year"')
(6, '0.107*"cunt" + 0.

In [60]:
ldamodel.get_document_topics(dictionary.doc2bow(["isnt", "abt", "mariah", "carey", "ok", "indirecting"]))

[(0, 0.03321499),
 (1, 0.033215072),
 (2, 0.033215445),
 (3, 0.033215094),
 (4, 0.03323092),
 (5, 0.033214983),
 (6, 0.7010447),
 (7, 0.03321875),
 (8, 0.033215065),
 (9, 0.03321504)]

In [67]:
df['topics'] = df['corpus'].apply(lambda x : max(ldamodel.get_document_topics(x),key=lambda x:x[1])[0])

In [68]:
df['topics'].head()

0    8
1    4
2    4
3    1
4    5
Name: topics, dtype: int64