In [1]:
#pip install nltk
#pip install spacy
#python -m spacy download da_core_news_sm

#pip install gensim
#pip install pyLDAvis

import json
import pandas as pd
import spacy
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
import codecs
import sys


import pathlib
from pathlib import Path
import os

import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary

import pyLDAvis.gensim_models
import pyLDAvis.sklearn
#pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
# Translation table from Danish/German special letters to ASCII digraphs.
# Upper-case letters map to the same lower-case digraph as their lower-case
# form (e.g. both 'ø' and 'Ø' become 'oe').
_LETTER_TABLE = str.maketrans({
    'ø': 'oe', 'Ø': 'oe',
    'æ': 'ae', 'Æ': 'ae',
    'å': 'aa', 'Å': 'aa',
    'ü': 'ue', 'Ü': 'ue',
    'ä': 'ae', 'Ä': 'ae',
    'ö': 'oe', 'Ö': 'oe',
})


def change_letter(dataset: pd.Series) -> list:
    """Transliterate Danish/German special letters to ASCII digraphs.

    Parameters
    ----------
    dataset
        Iterable of strings (the notebook passes a DataFrame column, i.e. a
        ``pd.Series``; a plain list works as well).

    Returns
    -------
    list
        One transliterated string per input string, in input order. All
        characters other than ø, æ, å, ü, ä, ö (either case) are untouched.
    """
    # A single translate() pass per text replaces the twelve sequential
    # substitutions; none of the replacement digraphs contains a character
    # that is itself replaced, so the result is the same.
    return [text.translate(_LETTER_TABLE) for text in dataset]



In [3]:
def get_clean_text(dataset: list, stops_da, stops_en) -> list:
    """Strip URLs, hashtags, mentions, non-letters and stop words from texts.

    For every text the following steps are applied in order:

    1. Remove every run of non-whitespace characters that starts with ``http``.
    2. Replace hashtags (``#letters``), mentions (``@letters``), any remaining
       ``scheme://...`` URL and every character that is not an ASCII letter,
       space or tab with a single space (this also covers newlines).
    3. Lower-case the text and split it on whitespace.
    4. Drop every word found in ``stops_da`` or ``stops_en``.
    5. Re-join the remaining words with single spaces.

    Parameters
    ----------
    dataset
        Iterable of strings to clean.
    stops_da, stops_en
        Iterables of stop words. Words are compared after lower-casing, so
        entries containing upper-case letters never match.

    Returns
    -------
    list
        One cleaned string per input text, in input order. A text is returned
        as ``""`` when nothing is left after cleaning.
    """
    # One combined set: O(1) membership tests instead of scanning two lists
    # for every single word.
    stops = set(stops_da) | set(stops_en)
    url_re = re.compile(r"http\S+")
    noise_re = re.compile(r"(#[A-Za-z]+)|(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

    cleaned_text = []
    for text in dataset:
        without_urls = url_re.sub("", text)
        # Newlines are not ASCII letters/space/tab, so they are replaced here
        # as well; no separate newline pass is needed.
        letters_only = noise_re.sub(' ', without_urls)
        words = [w for w in letters_only.lower().split() if w not in stops]
        cleaned_text.append(" ".join(words))
    return cleaned_text


In [4]:
def tokenize_text(data):
    """Tokenize every text in ``data`` with NLTK's ``word_tokenize``.

    Returns a list holding one token list per input text, in input order.
    """
    tokenized = []
    for document in data:
        tokenized.append(word_tokenize(document))
    return tokenized


In [5]:
def lemmatize_text(sent):
    """Return the lemma of every token the global ``nlp`` pipeline finds in ``sent``.

    ``nlp`` is looked up at call time, so it must have been assigned (in a
    later notebook cell) before this function is first called.
    """
    doc = nlp(sent)
    return [token.lemma_ for token in doc]

In [6]:
def lemmatize_posts(tokenized_data: list):
    """Lemmatize already-tokenized posts.

    Every token of every post is passed on its own to ``lemmatize_text``; the
    lemma lists returned for the tokens of one post are concatenated into a
    single flat list.

    Returns one flat lemma list per post, in input order.
    """
    return [
        [lemma for token in post for lemma in lemmatize_text(token)]
        for post in tokenized_data
    ]

In [7]:
def decode(path: str) -> list:
    """Read a text file, undo a Latin-1/UTF-8 double encoding, and split it into lines.

    The file is read as UTF-8 (undecodable bytes are silently dropped). The
    resulting text is then encoded as Latin-1 and decoded again as UTF-8,
    which repairs text whose UTF-8 bytes had previously been misread as
    Latin-1 (e.g. ``"Ã¥"`` becomes ``"å"``).

    Parameters
    ----------
    path
        Path of the file to read.

    Returns
    -------
    list
        The repaired text split with ``str.splitlines()``.

    Raises
    ------
    UnicodeEncodeError
        If the text contains a character that does not exist in Latin-1.
    UnicodeDecodeError
        If the Latin-1 bytes do not form valid UTF-8.
    """
    # The context manager guarantees the handle is closed (the previous
    # codecs.open() handle was never closed). newline='' disables newline
    # translation, matching codecs.open(), which reads in binary mode.
    with open(path, encoding='utf-8', errors='ignore', newline='') as f:
        data = f.read()
    data = data.encode('latin1').decode('utf8')
    return data.splitlines()

In [8]:
def remove_mentions(dataset: list):
    """Blank out every post that starts with ``@``.

    Such posts are replaced by an empty string rather than removed, so the
    returned list has the same length and order as ``dataset``. Posts that
    merely contain ``@`` somewhere else are returned unchanged.
    """
    return ["" if post.startswith('@') else post for post in dataset]

In [9]:
# Load the prepared text file; decode() returns one list entry per line.
decoded_data = decode('data/JSON_files/prepared_txt/all_data.txt')

# Sanity check: number of lines read.
print(len(decoded_data))

27483


In [10]:
# Blank out entries that start with '@'. They become "" instead of being
# dropped, so the length printed below equals the length of decoded_data.
new_data = remove_mentions(decoded_data)

print(len(new_data))

27483


In [11]:
# Drop the empty entries (posts blanked by remove_mentions and lines that
# were already empty in the file).
without_empty_strings = [post for post in new_data if post != ""]

# Number of posts that remain.
print(len(without_empty_strings))

27277


In [12]:
# Put the remaining posts into a single-column DataFrame ('row_posts').
df = pd.DataFrame(without_empty_strings, columns=['row_posts'])

In [13]:
# Danish spaCy pipeline; lemmatize_text() reads it through the global name `nlp`.
nlp = spacy.load("da_core_news_sm")
# NLTK's Danish stop-word list, extended with extra entries. Several of them
# ('paa', 'saa', 'vaere', 'ogsaa', 'faa', 'faar') are spelled with the ASCII
# digraphs that change_letter() produces. 'rt', 'mt', 'gt' are presumably
# social-media / markup residue -- TODO confirm.
stopwords_da = stopwords.words("danish")  
stopwords_da.extend(['paa', 'saa', 'vaere',  'rt', 'ogsaa', 'faa', 'faar', 'nok', 'mt', 'gt'])

In [14]:
# English spaCy pipeline. NOTE(review): `nlp1` is not referenced anywhere else
# in the visible notebook, and "en_core_web_sm" is not among the install
# commands listed in the first cell -- confirm whether this load is needed.
nlp1 = spacy.load("en_core_web_sm")
# NLTK's English stop-word list, passed to get_clean_text() below.
stopwords_en = stopwords.words("english")

In [15]:
# Transliterate ø/æ/å/ü/ä/ö to ASCII digraphs; the result is a plain list of strings.
fb_posts = change_letter(df['row_posts'])

In [16]:
# Strip URLs, hashtags, mentions, non-letters and Danish/English stop words.
# Posts with nothing left come back as "" and are filtered out in the next cell.
cleaned_posts = get_clean_text(fb_posts, stops_da = stopwords_da, stops_en = stopwords_en)

In [17]:
# Keep only the posts that still contain text after cleaning.
remove_empty_strings = [post for post in cleaned_posts if post != ""]

In [18]:
# One token list per cleaned post (same order as remove_empty_strings).
tokenized_posts = tokenize_text(remove_empty_strings)

In [19]:
# One lemma list per post. lemmatize_posts() calls the spaCy pipeline once per
# token, so this cell does one pipeline call for every token in the corpus.
lemmatized_posts = lemmatize_posts(tokenized_posts)

In [20]:
# Collect the three aligned views of each post in one DataFrame.
# tokenized_posts and lemmatized_posts were both derived entry-by-entry from
# remove_empty_strings, so the rows line up.
df1 = pd.DataFrame(remove_empty_strings , columns=['cleaned_posts'])

df1['tokenized_posts'] = tokenized_posts
df1['lemmatized_posts'] = lemmatized_posts


In [21]:
# Persist the preprocessed posts so they can be reloaded with pd.read_pickle.
df1.to_pickle('data/all_posts_df.pkl')

In [2]:
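# Uncomment to reload the preprocessed posts from the pickle written above
# instead of re-running the preprocessing cells:
#df1 = pd.read_pickle('data/all_posts_df.pkl')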

In [3]:
# Train a gensim Phrases model on the lemmatized posts, then run every post
# through it to obtain the bigram-transformed token lists.
bigram = gensim.models.Phrases(df1['lemmatized_posts'])
posts_bigrams = [bigram[post_lemmas] for post_lemmas in df1['lemmatized_posts']]


In [4]:
# Build the token <-> id mapping from the bigram-transformed posts.
dictionary = Dictionary(posts_bigrams)
# Bag-of-words representation of every post, as produced by doc2bow.
corpus = [dictionary.doc2bow(post_tokens) for post_tokens in posts_bigrams]


In [17]:
# Fit a 6-topic LDA model on the bag-of-words corpus.
# random_state pins the seed: LDA training is stochastic, and without a fixed
# seed every re-run of the notebook produces different topics.
ldamodel = LdaModel(corpus = corpus, 
                    num_topics =  6, 
                    id2word = dictionary,
                    update_every = 1,
                    passes = 10,
                    per_word_topics = True,
                    random_state = 42)

In [18]:
# Switch pyLDAvis to notebook output.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted LDA model, the corpus and the dictionary.
# Note: `model` holds the prepared pyLDAvis data, not the LDA model itself (that is `ldamodel`).
model =pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
# Last expression of the cell: shows the visualisation as the cell output.
pyLDAvis.display(model)


In [19]:
# Export the prepared visualisation to an HTML file.
pyLDAvis.save_html(model, 'data/JSON_files/models/all_posts.html')