## Set up Jupyter Notebook and install dependencies

```
python -m venv myenv
source myenv/bin/activate
pip install ipykernel
pip install jupyter
python -m ipykernel install --user --name=myenv
jupyter notebook --ip 0.0.0.0 --port 8888
```


In [None]:
# Install the Python packages used by this notebook (IPython shell commands).
# Lines starting with "#!" are optional installs that are currently disabled.
!pip install bertopic
!pip install "nbformat>=4.2.0"
!pip install tf-keras
!pip install openpyxl
#!pip install ipywidgets
# xlsxwriter is listed as an additional dependency by the Amphi-generated cleanup code.
!pip install xlsxwriter
#!pip install amphi-etl 
!pip install spacy
!pip install pandas
# Swedish spaCy models: only the large model is downloaded, because the NER
# section loads "sv_core_news_lg"; the small/medium variants stay disabled.
#!python -m spacy download sv_core_news_sm
#!python -m spacy download sv_core_news_md
!python -m spacy download sv_core_news_lg

In [None]:
import pandas as pd
import glob
from miniArkivet import miniArkivet
import multi_train

# Notebook display settings: no limit on the number of columns shown, and
# wide frames are not wrapped across several blocks of output.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

## Parser and store

In [None]:
# Parse the raw miniArkivet export and persist the result as CSV.
from pathlib import Path

# The output folder must exist before anything is written into it.
Path("results/").mkdir(parents=True, exist_ok=True)

# Alternative inputs for quick runs:
#   df = miniArkivet(r"dummy")  # 15000 rows
#   df = miniArkivet(r"tests")  # only few rows
# All txt files in the selected folder are imported.
input_dir = r"C:\Users\frauker\OneDrive - Chalmers\Frauke\Projects\Agnotology of medical AI\A-Media\code\2024-08-16_full-run\case-1177"
df = miniArkivet(input_dir)

# One combined string per article: title, a newline, then the body text.
df['alltext'] = ['\n'.join(pair) for pair in zip(df['title'], df['text'])]
#df.to_excel("results/parsed_results.xlsx", sheet_name='results')
df.to_csv("results/parsed_results.csv", index=False)

## Load and clean up

In [None]:
# Reload the parsed articles from disk, remove duplicates and unwanted rows,
# and store the cleaned table that all later sections work on.
# formatted_time = "..."
df = pd.read_csv("results/parsed_results.csv")
# import re
# Optional data cleanup here
#df.text = df.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
#df.text = df.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
#df.text = df.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)

# Source code generated by Amphi
# Date: 2024-07-13 14:28:53
# Additional dependencies: xlsxwriter
#import pandas as pd

# Input is the frame loaded above from results/parsed_results.csv.
csvFileInput1 = df

# Deduplicate rows: articles with the same title, newspaper and text count once.
deduplicateData1 = csvFileInput1.drop_duplicates(subset=["title", "newspaper", "text"])

# Drop every article whose combined title+text contains "Ai Wei"
# (presumably name matches rather than articles about AI -- confirm).
# Rows without alltext are kept (na=False).
filter1 = deduplicateData1[~deduplicateData1['alltext'].str.contains("Ai Wei", na=False)]
#filter2 = filter1[~filter1['date'].str.contains("1993", na=False)] 

#TODO: remove Eslövs AI, EAI
# Drop articles that have no text or no date.
filter3 = filter1.dropna(subset=['text', 'date'])

#filter2.to_excel("testresults.xlsx", engine='xlsxwriter', header=True)

# Take an explicit copy: later cells add columns to df, and assigning into a
# filtered slice triggers pandas' SettingWithCopyWarning.
df = filter3.copy()
#df.to_excel("results/cleaned_results.xlsx", sheet_name='results')
df.to_csv("results/cleaned_results.csv", index=False)
df

## Sentiment Analysis

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Hugging Face identifiers: the tokenizer comes from the base Swedish BERT
# checkpoint, the classification weights from the sentiment checkpoint.
SENTIMENT_TOKENIZER_NAME = "KBLab/megatron-bert-large-swedish-cased-165k"
SENTIMENT_MODEL_NAME = "KBLab/robust-swedish-sentiment-multiclass"

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_TOKENIZER_NAME)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

# `classifier` is the callable used by sentiment() below.
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
def sentiment(sentence):
    """Classify the sentiment of one piece of text.

    Args:
        sentence: The text to classify. Anything that is not a ``str``
            (for example the NaN pandas produces for an empty CSV field,
            or ``None``) is treated as "no text".

    Returns:
        A ``(label, score)`` tuple taken from the first result of the
        module-level ``classifier`` pipeline, or ``(None, None)`` when
        ``sentence`` is not a string.
    """
    # Missing values cannot be tokenized; report "no sentiment" instead of
    # aborting the whole DataFrame.apply run.
    if not isinstance(sentence, str):
        return (None, None)
    # truncation=True cuts over-long input down to the model's maximum length
    # instead of raising inside the model.
    top = classifier(sentence, truncation=True)[0]
    return (top["label"], top["score"])

#sentiment("En fin och glad mening.")

In [None]:
# Score every article title and store label and score next to the article.
sentiment_columns = ['title_sent_label', 'title_sent_score']
title_sentiments = [sentiment(title) for title in df['title']]
df[sentiment_columns] = pd.DataFrame(
    title_sentiments, index=df.index, columns=sentiment_columns
)
df.to_csv("results/sent.csv", index=False)
df

## NER

In [None]:
import spacy

# Large Swedish spaCy pipeline; `nlp` is the object NER() runs each text through.
SPACY_MODEL_NAME = "sv_core_news_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Corpus-wide tallies filled by NER(): entity text -> number of NER() calls
# (i.e. documents) in which that entity was found at least once.
dict_persons = dict()
dict_locations = dict()
dict_orgs = dict()


def NER(sentence):
    """Extract person, organisation and location entities from one text.

    Runs the module-level spaCy pipeline ``nlp`` over ``sentence`` and keeps
    entities labelled ``PRS``, ``ORG`` and ``LOC``. Each distinct entity text
    is counted once per call in ``dict_persons`` / ``dict_orgs`` /
    ``dict_locations``.

    Args:
        sentence: The text to analyse.

    Returns:
        A tuple ``(orgs, persons, locations)`` where each element is the
        ``str()`` of a list of distinct entity texts, in order of first
        appearance in the text.
    """
    doc = nlp(sentence)

    # Dicts are used as ordered sets: duplicates collapse, first-seen order is
    # kept. A plain set() would make the order (and therefore the strings
    # written to the CSV) vary from run to run.
    found = {"ORG": {}, "PRS": {}, "LOC": {}}
    for ent in doc.ents:
        bucket = found.get(ent.label_)
        if bucket is not None:
            bucket[ent.text] = None

    orgs = list(found["ORG"])
    persons = list(found["PRS"])
    locations = list(found["LOC"])

    # One increment per distinct entity per call.
    for names, tally in (
        (persons, dict_persons),
        (orgs, dict_orgs),
        (locations, dict_locations),
    ):
        for name in names:
            tally[name] = tally.get(name, 0) + 1

    return (str(orgs), str(persons), str(locations))



#sentence = "Elon Musk och Steven Hawking pratar om AI på Migrationsverket"
#print(NER(sentence))

# Annotate every article with its entities (as a side effect this fills the
# three tally dicts), then store the annotated table.
ner_columns = ['ner_orgs', 'ner_persons', 'ner_loc']
df[ner_columns] = df['alltext'].apply(lambda text: pd.Series(NER(text)))
df.to_csv("results/ner.csv", index=False)

# Turn each tally into an Entity/Count table and write one CSV per entity type.
count_columns = ['Entity', 'Count']
df_persons = pd.DataFrame(dict_persons.items(), columns=count_columns)
df_locations = pd.DataFrame(dict_locations.items(), columns=count_columns)
df_orgs = pd.DataFrame(dict_orgs.items(), columns=count_columns)

for frame, target in (
    (df_persons, "results/person_count.csv"),
    (df_locations, "results/location_count.csv"),
    (df_orgs, "results/org_count.csv"),
):
    frame.to_csv(target, index=False)

## Topic Modelling

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

## Default BERTopic training

In [None]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Step 2 - Reduce dimensionality
# UMAP is stochastic. A fixed random_state keeps the resulting topics (and
# their numeric ids, which the visualisation cells refer to) stable when the
# notebook is re-run.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
# Swedish stop words that are excluded from the topic vocabulary.
SEstopwords = [
    "aderton", "adertonde", "adjö", "aldrig", "alla", "allas", "allt", "alltid",
    "alltså", "andra", "andras", "annan", "annat", "artonde", "artonn", "att",
    "av", "bakom", "bara", "behöva", "behövas", "behövde", "behövt", "beslut",
    "beslutat", "beslutit", "bland", "blev", "bli", "blir", "blivit", "bort",
    "borta", "bra", "bäst", "bättre", "båda", "bådas", "dag", "dagar",
    "dagarna", "dagen", "de", "del", "delen", "dem", "den", "denna", "deras",
    "dess", "dessa", "det", "detta", "dig", "din", "dina", "dit", "ditt",
    "dock", "dom", "du", "där", "därför", "då", "e", "efter", "eftersom", "ej",
    "elfte", "eller", "elva", "emot", "en", "enkel", "enkelt", "enkla",
    "enligt", "ens", "er", "era", "ers", "ert", "ett", "ettusen", "fanns",
    "fem", "femte", "femtio", "femtionde", "femton", "femtonde", "fick", "fin",
    "finnas", "finns", "fjorton", "fjortonde", "fjärde", "fler", "flera",
    "flesta", "fram", "framför", "från", "fyra", "fyrtio", "fyrtionde", "få",
    "får", "fått", "följande", "för", "före", "förlåt", "förra", "första",
    "genast", "genom", "gick", "gjorde", "gjort", "god", "goda", "godare",
    "godast", "gott", "gälla", "gäller", "gällt", "gärna", "gå", "går", "gått",
    "gör", "göra", "ha", "hade", "haft", "han", "hans", "har", "heller",
    "hellre", "helst", "helt", "henne", "hennes", "hit", "hon", "honom",
    "hundra", "hundraen", "hundraett", "hur", "här", "hög", "höger", "högre",
    "högst", "i", "ibland", "icke", "idag", "igen", "igår", "imorgon", "in",
    "inför", "inga", "ingen", "ingenting", "inget", "innan", "inne", "inom",
    "inte", "inuti", "ja", "jag", "jo", "ju", "just", "jämfört", "kan",
    "kanske", "knappast", "kom", "komma", "kommer", "kommit", "kr", "kunde",
    "kunna", "kunnat", "kvar", "legat", "ligga", "ligger", "lika", "likställd",
    "likställda", "lilla", "lite", "liten", "litet", "länge", "längre",
    "längst", "lätt", "lättare", "lättast", "långsam", "långsammare",
    "långsammast", "långsamt", "långt", "låt", "man", "med", "mej", "mellan",
    "men", "mer", "mera", "mest", "mig", "min", "mina", "mindre", "minst",
    "mitt", "mittemot", "mot", "mycket", "många", "måste", "möjlig",
    "möjligen", "möjligt", "möjligtvis", "ned", "nederst", "nedersta", "nedre",
    "nej", "ner", "ni", "nio", "nionde", "nittio", "nittionde", "nitton",
    "nittonde", "nog", "noll", "nr", "nu", "nummer", "när", "nästa", "någon",
    "någonting", "något", "några", "nån", "nånting", "nåt", "nödvändig",
    "nödvändiga", "nödvändigt", "nödvändigtvis", "och", "också", "ofta",
    "oftast", "olika", "olikt", "om", "oss", "på", "rakt", "redan", "rätt",
    "sa", "sade", "sagt", "samma", "sedan", "senare", "senast", "sent", "sex",
    "sextio", "sextionde", "sexton", "sextonde", "sig", "sin", "sina", "sist",
    "sista", "siste", "sitt", "sitta", "sju", "sjunde", "sjuttio",
    "sjuttionde", "sjutton", "sjuttonde", "själv", "sjätte", "ska", "skall",
    "skulle", "slutligen", "små", "smått", "snart", "som", "stor", "stora",
    "stort", "större", "störst", "säga", "säger", "sämre", "sämst", "så",
    "sådan", "sådana", "sådant", "ta", "tack", "tar", "tidig", "tidigare",
    "tidigast", "tidigt", "till", "tills", "tillsammans", "tio", "tionde",
    "tjugo", "tjugoen", "tjugoett", "tjugonde", "tjugotre", "tjugotvå",
    "tjungo", "tolfte", "tolv", "tre", "tredje", "trettio", "trettionde",
    "tretton", "trettonde", "två", "tvåhundra", "under", "upp", "ur", "ursäkt",
    "ut", "utan", "utanför", "ute", "va", "vad", "var", "vara", "varför",
    "varifrån", "varit", "varje", "varken", "vars", "varsågod", "vart", "vem",
    "vems", "verkligen", "vi", "vid", "vidare", "viktig", "viktigare",
    "viktigast", "viktigt", "vilka", "vilkas", "vilken", "vilket", "vill",
    "väl", "vänster", "vänstra", "värre", "vår", "våra", "vårt", "än", "ännu",
    "är", "även", "åt", "åtminstone", "åtta", "åttio", "åttionde", "åttonde",
    "över", "övermorgon", "överst", "övre",
]
vectorizer_model = CountVectorizer(stop_words=SEstopwords)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
)

# BERTopic documents its input as a list of strings. df is a filtered frame
# whose index has gaps, so hand over a plain list instead of the Series to
# keep document positions and ids aligned.
fit_docs = df['text'].tolist()
topics, probs = topic_model.fit_transform(fit_docs)

# Persist the fitted model so it can be inspected without retraining.
topic_model.save("results/defaultBertTopic.pickle", serialization="pickle")

## Build all models (optional)

For the configuration of each model, see `multi_train.py`.

In [None]:
#df = pd.read_csv("results/ner.csv")
#docs = df['alltext'].tolist()
#multi_train.build_all_models(docs)

# Load model from file without training

In [None]:
##list available models
#from bertopic import BERTopic
#print(glob.glob('./results/*.pickle'))

In [None]:
## Reloading a saved topic model may cause problems; the preferred workflow is to re-run the whole Jupyter notebook instead.
#topic_model_file = "results/defaultBertTopic.pickle"
#topic_model= BERTopic.load(topic_model_file)
#freq = topic_model.get_topic_info(); freq.head(50)

In [None]:
#topic_model.get_topic(4)

# Visualizations

In [None]:
# Build the topic overview figure, keep it in `fig`, and show it as the cell output.
fig = topic_model.visualize_topics()
fig

In [None]:
# The model was fitted on df.text, and visualize_documents expects the same
# documents that were used for fitting, so pass those (as a plain list)
# rather than the title+text column.
topic_model.visualize_documents(df['text'].tolist())

In [None]:
# Show the fitted model's topic-hierarchy figure as the cell output.
topic_model.visualize_hierarchy()

In [None]:
# Show the fitted model's topic heatmap figure as the cell output.
topic_model.visualize_heatmap()

In [None]:
# Show the fitted model's term-rank figure as the cell output.
topic_model.visualize_term_rank()

In [None]:
# Show the fitted model's per-topic bar chart figure as the cell output.
topic_model.visualize_barchart()

In [None]:
# Topic frequency over time.
# topics_over_time expects the documents the model was fitted on (df.text),
# paired position by position with one timestamp per document.
timestamps = df.date.to_list()
fitted_docs = df.text.to_list()
# Sanity check: both lists must have the same length.
print(len(timestamps), len(fitted_docs))
topics_over_time = topic_model.topics_over_time(fitted_docs, timestamps, nr_bins=20)
# Only the selected topic ids are drawn.
topic_model.visualize_topics_over_time(topics_over_time, topics=[10, 11, 12])

In [None]:
# Table with one row per document and the topic assigned to it; the bulky
# representation columns are removed before the table is written to CSV.
document_topics = topic_model.get_document_info(df.text)
unwanted_columns = ['Representation', 'Representative_Docs', 'Top_n_words', 'Representative_document']
result = document_topics.drop(columns=unwanted_columns)
result.to_csv('results/topic-docs.csv', index=False)
result