In [1]:
# Data Manipulation
import numpy as np
import pandas as pd
import os

# Data Visualisation
import matplotlib.pyplot as plt
# Pipeline and Column Transformers
from sklearn import set_config

# Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Cross Validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

# Unsupervised Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# STATISTICS
from statsmodels.graphics.gofplots import qqplot

# Text Processing
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# NLTK Downloads
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set pandas display option
pd.set_option('display.max_columns', None)

# Set sklearn display configuration
set_config(display = "diagram")

# Custom Transformers and Model Building
from sklearn.base import BaseEstimator, TransformerMixin

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aryavachin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aryavachin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aryavachin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/aryavachin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
categorical_columns = ['CourseType', 'Gender', 'Ethnicity']
datetime_columns = ['AgeAtCourse']
non_textual = categorical_columns + datetime_columns

In [4]:
import sys
sys.path.append('/home/aryavachin/code/cipobt/breathWorks')
from breathworks.utils import get_data

In [5]:
from breathworks.clustering.preprocessing import build_preprocessor, simple_preprocessor_with_topics
from breathworks.clustering.cleaning import clean_data, clean_textual_columns
from breathworks.clustering.LDA import splitting_into_topics, lda_visual
from breathworks.clustering.plots import corr_plot, plot_clusters, plot_clusters_2d, plot_clusters_3d
from breathworks.clustering.clustering import label_dataframe, fit_kmeans_and_label, plot_lda
from breathworks.clustering.config import drop_columns, textual_columns, categorical_columns, datetime_columns, to_drop, topics_per_column, column_pairs

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)
  np.bool8: (False, True),


In [None]:
# # Fetch and clean data
# dataframe = get_data()
# processed_data = clean_data(dataframe,drop_columns)
# df_transformed = clean_textual_columns(processed_data, textual_columns)

# # # Apply filters
# # df_filtered = df_transformed[(df_transformed['Gender'] == 'Male') &
# #                              (df_transformed['CourseType'].isin(['OMfH','OMfH'])) &
# #                              (df_transformed['Ethnicity'] == 'White')]

# # Apply the transformations for LDA
# df_transformed = df_transformed.drop(columns=to_drop)
# df_split = splitting_into_topics(df_transformed,topics_per_column,textual_columns)
# preprocessor = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
# df_LDA = preprocessor.fit_transform(df_split)

# # final df with correct column names
# transformed_columns = preprocessor.get_feature_names_out()
# df_final = pd.DataFrame(df_LDA, columns=transformed_columns)
# df_final = df_final.apply(pd.to_numeric)

# # df_2d = df_final[[col1b,col2a]]
# # labelling = fit_kmeans_and_label(df_2d,4)
# # label_dataframe(df_2d, labelling)

# # print the clusters with their labels
# plot_lda(df_final,column_pairs)


In [6]:
# get the data
dataframe = get_data()



In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3242 entries, 0 to 3241
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CourseDate       3242 non-null   object
 1   CourseType       3242 non-null   object
 2   Gender           3242 non-null   object
 3   Ethnicity        3242 non-null   object
 4   AgeAtCourse      3242 non-null   int64 
 5   CustomerPurpose  3242 non-null   object
dtypes: int64(1), object(5)
memory usage: 152.1+ KB


In [7]:
df_transformed = dataframe

df_transformed.head(3)

Unnamed: 0,CourseDate,CourseType,Gender,Ethnicity,AgeAtCourse,CustomerPurpose
0,2017-03-12,IMfS,Male,White,21,I would like to learn more and practice mindfu...
1,2017-03-12,IMfS,Male,White,22,I would like to understand mindfulness to a gr...
2,2017-03-12,IMfS,Male,White,22,To establish a greater understanding of mindfu...


In [9]:
df_transformed.isnull().sum()

CourseDate         0
CourseType         0
Gender             0
Ethnicity          0
AgeAtCourse        0
CustomerPurpose    0
dtype: int64

In [8]:
textual_columns = ['CustomerPurpose']

In [9]:
# filter the df
in_person = ['IMfH', 'IMfS', 'I5DMfH']
online = ['OMfH', 'OMfS']
all_course_types = in_person + online
to_drop = ['Gender', 'CourseType', 'Ethnicity']

In [10]:
categorical_columns = [column for column in categorical_columns if column not in to_drop]

In [11]:
topics_per_column = {
    'CustomerPurpose': 5
}

In [12]:
# Apply the transformations for LDA
df_transformed = df_transformed.drop(columns=to_drop, errors='ignore')

In [13]:
# # Assuming 'splitting_into_topics' and 'build_preprocessor' are predefined functions
# df_split, lda_details = splitting_into_topics(df_transformed, topics_per_column, textual_columns)
# preprocessor = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
# df_LDA = preprocessor.fit_transform(df_split)

df_split, lda_details = splitting_into_topics(df_transformed, topics_per_column, textual_columns)

# Update textual_columns to reflect the new LDA topic columns
new_textual_columns = []
for text_column in textual_columns:
    num_topics = topics_per_column[text_column]
    new_textual_columns.extend([f'{text_column}_Topic{i}' for i in range(num_topics)])

# Now new_textual_columns contains the new column names generated from LDA

CustomerPurpose - Topic 0:
[('qualify for', 1.804751341941321), ('to qualify', 1.684866080060762), ('qualify', 1.6848660785126062), ('family bereavement', 1.2765949466909667), ('cease', 1.2082717707749298), ('bereavement', 1.1766814202977445), ('better work', 1.0340927141483693), ('consciousness', 0.9827397918094816), ('suggest', 0.9145874338024798), ('automatic', 0.8773522078824001)]
CustomerPurpose - Topic 1:
[('resilience', 1.2899200693870883), ('curiosity', 1.2010574085168428), ('ssnhl', 1.0836057625395716), ('provided', 1.0130845413182068), ('from friend', 1.009113597302867), ('and happiness', 1.004846506790354), ('anxiety only', 0.9995474907611517), ('msbr', 0.9689400591967874), ('msbr course', 0.9689400591967823), ('glasses', 0.9654248890031432)]
CustomerPurpose - Topic 2:
[('and', 131.62086759625828), ('to', 120.25150906920555), ('the', 93.38864629992732), ('have', 84.76912241635966), ('my', 79.72959679732874), ('in', 71.14869673009154), ('of', 67.70279735477256), ('pain', 62.6

In [17]:
assert 'CourseType' not in df_split.columns, "'CourseType' still exists in df_split"
assert 'CourseType' not in categorical_columns, "'CourseType' still exists in categorical_columns"

In [28]:
dataframe

Unnamed: 0,CourseDate,CourseType,Gender,Ethnicity,AgeAtCourse,CustomerPurpose
0,2017-03-12,IMfS,Male,White,21,I would like to learn more and practice mindfu...
1,2017-03-12,IMfS,Male,White,22,I would like to understand mindfulness to a gr...
2,2017-03-12,IMfS,Male,White,22,To establish a greater understanding of mindfu...
3,2024-02-06,OMfH,Male,White,22,To help me manage the severe ibs pain I have s...
4,2019-04-27,IMfS,Male,Other,23,Recently exited a long term relationship ; as ...
...,...,...,...,...,...,...
3237,2019-01-14,OMfH,Prefer not to say,White,48,Partly personally and professionally . Persona...
3238,2022-05-03,OMfH,Prefer not to say,Other,51,"I have chronic pain , which I 'd like to manag..."
3239,2021-10-05,OMfH,Prefer not to say,Asian,52,asdfasdf fdbx I have experienced persistent pa...
3240,2016-09-05,OMfH,Prefer not to say,White,53,fds


In [33]:
preprocessor = build_preprocessor()
preprocessor
# df_LDA = preprocessor.fit_transform(df_split)

In [84]:
dataframe

Unnamed: 0,CourseDate,CourseType,Gender,Ethnicity,AgeAtCourse,CustomerPurpose
0,2017-03-12,IMfS,Male,White,21,I would like to learn more and practice mindfu...
1,2017-03-12,IMfS,Male,White,22,I would like to understand mindfulness to a gr...
2,2017-03-12,IMfS,Male,White,22,To establish a greater understanding of mindfu...
3,2024-02-06,OMfH,Male,White,22,To help me manage the severe ibs pain I have s...
4,2019-04-27,IMfS,Male,Other,23,Recently exited a long term relationship ; as ...
...,...,...,...,...,...,...
3237,2019-01-14,OMfH,Prefer not to say,White,48,Partly personally and professionally . Persona...
3238,2022-05-03,OMfH,Prefer not to say,Other,51,"I have chronic pain , which I 'd like to manag..."
3239,2021-10-05,OMfH,Prefer not to say,Asian,52,asdfasdf fdbx I have experienced persistent pa...
3240,2016-09-05,OMfH,Prefer not to say,White,53,fds


In [31]:
topic_labelled_df, topic_only_df = simple_preprocessor_with_topics(dataframe, 'CustomerPurpose', 3)

topic_distribution topic_label
0    45.589143
1    39.543492
2    14.867366
Name: count, dtype: float64


In [16]:
topic_labelled_df

Unnamed: 0_level_0,0,1,2,3,4,topic_label
CustomerPurpose,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
learn practice mindfulness study briefly year,0.056396,0.054317,0.057577,0.774499,0.057211,3
understand mindfulness great extend use idea technique see situation objectively possible,0.803508,0.048228,0.049993,0.048539,0.049732,0
establish great understand mindfulness order good equip use technique idea,0.042660,0.041835,0.830183,0.042059,0.043263,2
help manage severe ibs pain suffer year irritable bowel syndrome anxiety depression panic attack adhd experience persistent pain last least last month anxiety depression,0.871835,0.031636,0.032124,0.032317,0.032087,0
recently exit long term relationship result experience intense clarity mind acute awareness miss significantly great attention connection emotion aspiration consciousness awareness self others attend mindfulness session weekly basis university student learn body scan cultivate love kindness calm mind focus breath sound touch eye open shut deepen meditative practice deeply personally important grow person spiritually mentally emotionally please consider acceptance application thank,0.910920,0.021976,0.022360,0.022058,0.022687,0
...,...,...,...,...,...,...
partly personally professionally personally help deal stressful situation professionally become teacher breathworks future pre requisite course,0.038850,0.038091,0.844136,0.038195,0.040727,2
chronic pain manage along learn pace activity rheumatoid arthritis degenerative disc disease tinnitus experience persistent pain last least last month n experience acute debilitate depression mental health condition,0.397907,0.029659,0.511797,0.030411,0.030227,2
fdbx experience persistent pain last least last month,0.057023,0.055668,0.056576,0.773964,0.056769,3
fds,0.200000,0.200000,0.200000,0.200000,0.200000,0


In [32]:
pd.set_option('display.max_colwidth', 500)
topic_only_df

Unnamed: 0,Distribution,Keywords
0,45.589143,'mindfulness''course''teacher''train''practice''teacher train''breathworks''meditation''teach''stress'
1,39.543492,'pain''chronic''condition''chronic pain''pain condition''identify''currently''identify chronic''currently identify''experience'
2,14.867366,'stress''work''anxiety''mindfulness''life''depression''stress work''recommend''help''job'


In [33]:
# only if you want to save this
topic_only_df.to_csv('maybe_3.csv')

In [34]:
topic_labelled_df.to_csv('maybe_3_topic_label.csv')

In [14]:
three_avatars = pd.read_csv('maybe_3_topic_label.csv')

In [15]:
three_avatars_topic_only_df = pd.read_csv('maybe_3.csv')

In [16]:
three_avatars

Unnamed: 0,CustomerPurpose,0,1,2,topic_label
0,learn practice mindfulness study briefly,0.656668,0.098584,0.244748,0
1,understand mindfulness great extend use idea t...,0.818987,0.092523,0.088489,0
2,establish great understand mindfulness order g...,0.527297,0.082640,0.390063,0
3,help manage severe ibs pain suffer irritable b...,0.056013,0.886111,0.057876,1
4,recently exit long term relationship result ex...,0.895908,0.063063,0.041029,0
...,...,...,...,...,...
3237,partly personally professionally personally he...,0.549413,0.070180,0.380407,0
3238,chronic pain manage along learn pace activity ...,0.050209,0.899765,0.050026,1
3239,fdbx experience persistent pain last least las...,0.093855,0.812967,0.093178,1
3240,fds,0.333333,0.333333,0.333333,0


In [17]:
three_avatars_topic_only_df

Unnamed: 0.1,Unnamed: 0,Distribution,Keywords
0,0,45.589143,'mindfulness''course''teacher''train''practice...
1,1,39.543492,'pain''chronic''condition''chronic pain''pain ...
2,2,14.867366,'stress''work''anxiety''mindfulness''life''dep...


## bertopic

In [35]:
!pip install -U bertopic
!pip install -U safetensors

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting numba>=0.51.2 (from umap-learn>=0.5.0->bertopic)
  Downloading numba-0.59.1-cp310-cp310-manylinux2014_x

Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numba-0.59.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pynndescent-0.5.11-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.8 MB)
Building wheels for collected packages: hdbscan, umap-learn
  Building wheel for hdbscan (pyproject.toml) ... [

In [18]:
from bertopic import BERTopic
topic_model = BERTopic.load("MaartenGr/BERTopic_ArXiv")

topic_model.get_topic_info()

2024-03-26 18:59:12.939276: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-26 18:59:14.012381: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-26 18:59:14.273660: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-26 18:59:14.273702: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

Unnamed: 0,Topic,Count,Name,Representation,POS,KeyBERTInspired,MMR,KeyBERT + MMR,OpenAI_Label,OpenAI_Summary,Representative_Docs
0,-1,14247,-1_language_models_model_data,"[language, models, model, data, based, tasks, ...","[language, models, model, data, tasks, text, t...","[language processing, language models, embeddi...","[language, models, model, data, based, tasks, ...","[language processing, language models, embeddi...",[Advancements in Multilingual Language Models ...,[Pre-trained Language Models and Embeddings fo...,
1,0,1833,0_dialogue_dialog_response_responses,"[dialogue, dialog, response, responses, intent...","[dialogue, dialog, response, responses, intent...","[task oriented dialogue, dialogue systems, ori...","[dialogue, dialog, response, responses, intent...","[task oriented dialogue, dialogue systems, ori...",[Challenges and Approaches in Developing Task-...,[Task-oriented dialogue systems and their comp...,
2,1,1369,1_speech_asr_speech recognition_recognition,"[speech, asr, speech recognition, recognition,...","[speech, recognition, acoustic, automatic spee...","[speech recognition asr, automatic speech, spe...","[speech, asr, speech recognition, recognition,...","[speech recognition asr, automatic speech, spe...",[Automatic Speech Recognition Systems for Mult...,"[Speech recognition and transcription, includi...",
3,2,1109,2_tuning_tasks_prompt_models,"[tuning, tasks, prompt, models, language, lang...","[tuning, tasks, prompt, models, language, trai...","[pre trained language, trained language models...","[tuning, tasks, prompt, models, language, lang...","[pre trained language, trained language models...",[Parameter-efficient fine-tuning of language m...,[Pre-trained language models and parameter-eff...,
4,3,893,3_summarization_summaries_summary_abstractive,"[summarization, summaries, summary, abstractiv...","[summarization, summaries, summary, abstractiv...","[summarization models, summarization model, ab...","[summarization, summaries, summary, abstractiv...","[summarization models, summarization model, ab...",[Challenges in Abstractive Text Summarization ...,[Text Summarization Models and Systems\n\nThe ...,
...,...,...,...,...,...,...,...,...,...,...,...
102,101,25,101_coherence_discourse_discourse coherence_co...,"[coherence, discourse, discourse coherence, co...","[coherence, discourse, text, paragraph, models...","[discourse coherence, coherent text, coherence...","[coherence, discourse, discourse coherence, co...","[discourse coherence, coherent text, coherence...",[Coherence modeling in written and spoken disc...,[Modeling and Understanding Discourse Coherenc...,
103,102,25,102_pos_taggers_tagging_tagger,"[pos, taggers, tagging, tagger, pos tagging, t...","[taggers, tagging, tagger, tags, tag, speech, ...","[speech tagging, speech pos tagging, tagged co...","[pos, taggers, tagging, tagger, pos tagging, t...","[speech tagging, speech pos tagging, tagged co...",[Challenges and Approaches in POS Tagging for ...,[This topic is focused on the importance of pa...,
104,103,24,103_drug_social_social media_media,"[drug, social, social media, media, health, ad...","[drug, social, social media, media, health, ad...","[topic modeling, social media data, corpus, me...","[drug, social, social media, media, health, ad...","[topic modeling, social media data, corpus, me...",[Social Media and Drug Safety in Pharmacovigil...,[This topic revolves around the use of social ...,
105,104,21,104_gender_translation_bias_gender bias,"[gender, translation, bias, gender bias, mt, m...","[gender, translation, bias, grammatical gender...","[machine translation, neural machine translati...","[gender, translation, bias, gender bias, mt, m...","[machine translation, neural machine translati...",[Gender Bias in Machine Translation and Gender...,[This topic discusses gender-related issues in...,


In [20]:
!pip install cuml

Collecting cuml
  Downloading cuml-0.6.1.post1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: cuml
  Building wheel for cuml (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[45 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Please avoid running ``setup.py`` directly.
  [31m   [0m         Instead, use pypa/build, pypa/installer or other
  [31m   [0m         standards-based tools.
  [31m   [0m 
  [31m   [0m         See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
  [31m   [0m         ************************************

In [19]:
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance, OpenAI



ModuleNotFoundError: No module named 'cuml'

In [None]:

# Prepare sub-models
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20, verbose=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)

# Summarization with ChatGPT
summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]
In this topic, the following documents are a small but representative subset of all documents in the topic:
[DOCUMENTS]

Based on the information above, please give a description of this topic in the following format:
topic: <description>
"""
summarization_model = OpenAI(model="gpt-3.5-turbo", chat=True, prompt=summarization_prompt, nr_docs=5, exponential_backoff=True, diversity=0.1)

# Representation models
representation_models = {
    "POS": PartOfSpeech("en_core_web_lg"),
    "KeyBERTInspired": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3),
    "KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
    "OpenAI_Label": OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, diversity=0.1),
    "OpenAI_Summary": [KeyBERTInspired(), summarization_model],
}

# Fit BERTopic
topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_models,
        verbose=True
).fit(docs)


In [None]:
# # create v2
# phrase_to_exclude = "No, currently I don't identify as having a chronic pain condition."

# # Create a mask for rows where 'personalhistory' column's value is exactly the phrase_to_exclude
# mask = df_transformed['PersonalHistory'] != phrase_to_exclude

# # Apply the mask to filter out the rows
# df_transformed_filtered = df_transformed[mask]

# # Display the information and the first 3 rows of the filtered DataFrame
# print(df_transformed_filtered.info())
# df_transformed_filtered.head(3)

In [None]:
# # Apply the transformations for LDA for v2
# df_transformed_filtered = df_transformed_filtered.drop(columns=to_drop, errors='ignore')

# df_split_2, lda_details_2 = splitting_into_topics(df_transformed_filtered,topics_per_column,textual_columns)
# preprocessor_2 = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
# df_LDA_2 = preprocessor_2.fit_transform(df_split_2)

In [None]:
# transformed_columns_2 = preprocessor_2.get_feature_names_out()
# df_final_2 = pd.DataFrame(df_LDA_2, columns=transformed_columns_2)
# df_final_2 = df_final_2.apply(pd.to_numeric)

# print(df_final_2.info())
# df_final_2.head(3)

In [None]:
lda_ph =lda_details['PersonalHistory']['lda']
x_ph = lda_details['PersonalHistory']['X']
vect_ph = lda_details['PersonalHistory']['vect']

lda_visual(lda_ph, x_ph, vect_ph)

In [None]:
lda_m =lda_details['Motivation']['lda']
x_m = lda_details['Motivation']['X']
vect_m = lda_details['Motivation']['vect']

lda_visual(lda_m, x_m, vect_m)

In [None]:
transformed_columns = preprocessor.get_feature_names_out()
df_final = pd.DataFrame(df_LDA, columns=transformed_columns)
df_final = df_final.apply(pd.to_numeric)

print(df_final.info())
df_final.head(3)

In [None]:
# corrolation stuff
# df_new = remove_low_variance_features(df_final)
# df_new_2 = remove_high_correlation_features(df_new)

# corr_df = df_final.corr()
# for idx, col in corr_df.iterrows():
#     if abs(col) >= 0.25 :
#         print(col)

# corr_plot(df_final)

# # PCA transform
# df_proj, labels = transform_data(df_final, 4, 2)

# print(df_proj.info())
# df_proj.head(3)

In [None]:
# plot_clusters(df_proj, labels)

In [None]:
col1a='remainder__PersonalHistory_Topic0'
col2a='remainder__Motivation_Topic0'
col1b='remainder__PersonalHistory_Topic1'
col2b='remainder__Motivation_Topic1'
col1c='remainder__PersonalHistory_Topic2'
col2c='remainder__Motivation_Topic2'

In [None]:
df_3d = df_final[[col1a,col2a,col2b]]
labelling_3d = fit_kmeans_and_label(df_3d,4)
label_dataframe(df_3d, labelling_3d)
print(df_3d.info())
df_3d.head(3)

In [None]:
# pd.concat([df_3d,df_final[['PersonalHistory']]])

In [None]:
plot_clusters_3d(df_3d,labelling_3d)

In [None]:
# df_2d = df_final[[col1b,col2a]]
# labelling = fit_kmeans_and_label(df_2d,4)
# label_dataframe(df_2d, labelling)
# print(df_2d.info())
# df_2d.head(3)

In [None]:
# plot_clusters_2d(df_2d,labelling)

In [None]:
column_pairs = [
    (col1a, col2a),
    (col1a, col2b),
    # (col1a, col2c),
    (col1b, col2a),
    (col1b, col2b),
    # (col1b, col2c),
    # (col1c, col2a),
    # (col1c, col2b),
    # (col1c, col2c),
]

In [None]:
plot_lda(df_final,column_pairs)

In [None]:
# plot_lda(df_final_2,column_pairs)

In [None]:
avatars = {}
df_labelled = pd.concat([df_transformed,pd.Series(labelling)],axis=1).rename(columns={0:"label"})

for numero_cluster in np.unique(labelling):
    avatars[numero_cluster] = df_labelled[df_labelled.label == numero_cluster]

for key,value in avatars.items():
    print("-"*50)
    print(f"Here are some people fitting into Avatar {key}")
    print("-"*50)
    display(value.sample(10))

In [None]:
sys.exit()

In [None]:
# df['Location'] = df['Location'].apply(clean_text)

In [None]:
# def get_location_category(location):
#     if 'manchester' in location:
#         return 'Manchester'
#     elif 'liverpool' in location or 'merseyside' in location:
#         return 'Liverpool'
#     elif 'london' in location:
#         return 'London'
#     elif 'united states' in location or 'utah' in location:
#         return 'United States'
#     elif 'denmark' in location or 'croatia' in location or 'poland' in location or 'norway' in location or 'germany' in location or 'barcelona' in location:
#         return 'EUR'
#     elif 'australia' in location:
#         return 'Australia'
#     elif 'india' in location or 'maldives' in location:
#         return 'SAsia'
#     elif 'uruguay' in location:
#         return 'SAmerica'
#     elif 'united kingdom' in location:
#         return 'UK'
#     else:
#         return 'England'

# df_drop['Location_Category'] = df_drop['Location'].apply(get_location_category)
# df_drop.Location_Category.value_counts()


In [None]:
# class TextCleaner(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         cleaned_data = X.applymap(self.clean_text)
#         return cleaned_data

#     def clean_text(self, text):
#         text = str(text)
#         for punctuation in string.punctuation:
#             text = text.replace(punctuation, ' ')  # Remove Punctuation
#         lowercased = text.lower()  # Lower Case
#         tokenized = word_tokenize(lowercased)  # Tokenize
#         words_only = [word for word in tokenized if word.isalpha()]  # Remove numbers

#         stop_words = set(stopwords.words('english'))
#         stop_words.update(['yes','none','nan'])

#         without_stopwords = [word for word in words_only if not word in stop_words]  # Remove Stop Words
#         lemma = WordNetLemmatizer()  # Initiate Lemmatizer
#         lemmatized = [lemma.lemmatize(word) for word in without_stopwords]  # Lemmatize
#         cleaned = ' '.join(lemmatized)  # Join back to a string
#         return cleaned

In [None]:
# pca = PCA()
# pca.fit(df_num)
# threhsold_pca = 4
# with plt.style.context('seaborn-deep'):
#     # figsize
#     plt.figure(figsize=(10,6))
#     # getting axes
#     ax = plt.gca()
#     # plotting
#     explained_variance_ratio_cumulated = np.cumsum(pca.explained_variance_ratio_)
#     x_axis_ticks = np.arange(1,explained_variance_ratio_cumulated.shape[0]+1)
#     ax.plot(x_axis_ticks,explained_variance_ratio_cumulated,label="cumulated variance ratio",color="purple",linestyle=":",marker="D",markersize=10)
#     # customizing
#     ax.set_xlabel('Number of Principal Components')
#     ax.set_ylabel('% cumulated explained variance')
#     ax.legend(loc="upper left")
#     ax.set_title('The Elbow Method')
#     ax.set_xticks(x_axis_ticks)
#     ax.scatter(threhsold_pca,explained_variance_ratio_cumulated[threhsold_pca-1],c='blue',s=400)
#     ax.grid(axis="x",linewidth=0.5)
#     ax.grid(axis="y",linewidth=0.5)

In [None]:
# fig_scaled = px.scatter_3d(df_proj, x = 0, y = 1, z = 2, opacity=0.7, width=500, height=500)
# fig_scaled.show()

In [None]:
# nb_clusters_to_try = np.arange(1,10+1,1)

In [None]:
# wcss = []
# for K in nb_clusters_to_try:
#     kmeans = KMeans(n_clusters = K)
#     kmeans.fit(df_proj)
#     wcss.append(kmeans.inertia_)

In [None]:
# elbow_highlight = 3
# with plt.style.context('seaborn-deep'):
#     # figsize
#     plt.figure(figsize=(20,10))
#     # getting axes
#     ax = plt.gca()
#     # plotting
#     ax.plot(nb_clusters_to_try, wcss,color="blue",linestyle=":",marker="D",label="Inertia")
#     # customizing
#     ax.legend(loc="upper right")
#     ax.set_title('The Elbow Method')
#     ax.set_xticks(nb_clusters_to_try)
#     ax.set_xlabel('Number of clusters')
#     ax.set_ylabel('Within-Cluster Sums of Squares')
#     ax.scatter(elbow_highlight,wcss[elbow_highlight-1],c='red',s=400)

#     ax.grid(axis="y",linewidth=0.5)
#     plt.show()