In [31]:
# Data Manipulation
import numpy as np
import pandas as pd
import os

# Data Visualisation
import matplotlib.pyplot as plt
# Pipeline and Column Transformers
from sklearn import set_config

# Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Cross Validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

# Unsupervised Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# STATISTICS
from statsmodels.graphics.gofplots import qqplot

# Text Processing
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# NLTK Downloads
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Set pandas display option
pd.set_option('display.max_columns', None)

# Set sklearn display configuration
set_config(display = "diagram")

# Custom Transformers and Model Building
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [32]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
drop_columns=['Daily20MPractice', 'ReferralSource','Location','DoB']
textual_columns = ['PersonalHistory', 'Motivation']
categorical_columns = ['CourseType', 'Gender', 'Ethnicity']
datetime_columns = ['Days_Since_EarliestCourse', 'AgeAtCourse']
non_textual = categorical_columns + datetime_columns


In [34]:
import sys
sys.path.append('../')
from utils import get_data
from breathworks.clustering.preprocessing import build_preprocessor
from breathworks.clustering.cleaning import clean_data, clean_textual_columns, advanced_cleaning
from breathworks.clustering.LDA import splitting_into_topics, lda_visual
from breathworks.clustering.plots import corr_plot, plot_clusters, plot_clusters_2d, plot_clusters_3d
from breathworks.clustering.clustering import label_dataframe, fit_kmeans_and_label, plot_lda
from breathworks.clustering.config import drop_columns, textual_columns, categorical_columns, datetime_columns, to_drop, topics_per_column, column_pairs

In [35]:
# # Fetch and clean data
# dataframe = get_data()
# processed_data = clean_data(dataframe,drop_columns)
# df_transformed = clean_textual_columns(processed_data, textual_columns)

# # # Apply filters
# # df_filtered = df_transformed[(df_transformed['Gender'] == 'Male') &
# #                              (df_transformed['CourseType'].isin(['OMfH','OMfH'])) &
# #                              (df_transformed['Ethnicity'] == 'White')]

# # Apply the transformations for LDA
# df_transformed = df_transformed.drop(columns=to_drop)
# df_split = splitting_into_topics(df_transformed,topics_per_column,textual_columns)
# preprocessor = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
# df_LDA = preprocessor.fit_transform(df_split)

# # final df with correct column names
# transformed_columns = preprocessor.get_feature_names_out()
# df_final = pd.DataFrame(df_LDA, columns=transformed_columns)
# df_final = df_final.apply(pd.to_numeric)

# # df_2d = df_final[[col1b,col2a]]
# # labelling = fit_kmeans_and_label(df_2d,4)
# # label_dataframe(df_2d, labelling)

# # print the clusters with their labels
# plot_lda(df_final,column_pairs)


In [36]:
# get the data
dataframe = get_data()



Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.



In [37]:
# clean data
processed_data = clean_data(dataframe,drop_columns)
df_transformed = clean_textual_columns(processed_data, textual_columns)

print(df_transformed.info())
df_transformed.head(3)

<class 'pandas.core.frame.DataFrame'>
Index: 3260 entries, 0 to 3351
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   CourseType                 3260 non-null   object
 1   Gender                     3260 non-null   object
 2   Ethnicity                  3260 non-null   object
 3   AgeAtCourse                3260 non-null   int64 
 4   PersonalHistory            3260 non-null   object
 5   Motivation                 3260 non-null   object
 6   Days_Since_EarliestCourse  3260 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 203.8+ KB
None


Unnamed: 0,CourseType,Gender,Ethnicity,AgeAtCourse,PersonalHistory,Motivation,Days_Since_EarliestCourse
0,IMfH,Male,White,41,mental health problem year learn meditation mi...,volunteer meditation teacher gain ground weste...,2111
1,IMfH,Female,Other,61,suffer complex regional pain syndrome type dis...,read book live well pain illness discomfort ex...,1346
2,IMfH,Female,White,38,chronic stomach pain break back leg pain anxie...,sister find online suggest try course chronic ...,1346


In [38]:
df_transformed.isnull().sum()

CourseType                   0
Gender                       0
Ethnicity                    0
AgeAtCourse                  0
PersonalHistory              0
Motivation                   0
Days_Since_EarliestCourse    0
dtype: int64

In [39]:
# filter the df
in_person = ['IMfH', 'IMfS', 'I5DMfH']
online = ['OMfH', 'OMfS']
all_course_types = in_person + online
to_drop = ['Gender', 'CourseType', 'Ethnicity']
# df_filtered = df_transformed[(df_transformed['Gender'] == 'Male') &
#                                 (df_transformed['CourseType'].isin(all_course_types)) &
#                                 (df_transformed['Ethnicity'] == 'White')]

# df_filtered = df_filtered.drop(columns=to_drop)

# print(df_filtered.info())
# df_filtered.head(3)

In [40]:
categorical_columns = [column for column in categorical_columns if column not in to_drop]

In [41]:
topics_per_column = {
    'PersonalHistory': 2,
    'Motivation': 2,
}

In [42]:
# Apply the transformations for LDA
df_transformed = df_transformed.drop(columns=to_drop, errors='ignore')

df_split, lda_details = splitting_into_topics(df_transformed,topics_per_column,textual_columns)
preprocessor = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
df_LDA = preprocessor.fit_transform(df_split)

PersonalHistory
Topic 0:
[('pain', 123.53561695423708), ('depression', 90.27660913435739), ('health', 84.84040008756523), ('experience', 82.83667504153239), ('condition', 82.53424012551925), ('debilitate', 76.45990836228532), ('acute', 74.58920774120122), ('mental', 73.23708680726075), ('mental health', 68.35914996146715), ('month', 62.88920683675179)]
PersonalHistory
Topic 1:
[('anxiety', 165.91388669406183), ('depression', 76.2599656168603), ('suffer', 74.55722412233256), ('year', 62.81682157948682), ('stress', 59.90963684093044), ('time', 58.482507514359355), ('life', 46.91892478434301), ('work', 45.57992639611044), ('depression anxiety', 45.473189506184866), ('manage', 45.127439276604264)]
Motivation
Topic 0:
[('mindfulness', 181.01983882043902), ('course', 176.57819005752737), ('teacher', 124.43336671847874), ('practice', 123.69591186445787), ('train', 118.99665204358566), ('breathworks', 92.46362969948046), ('teacher train', 83.76982077455592), ('meditation', 77.15882545302178), 

In [43]:
# # create v2
# phrase_to_exclude = "No, currently I don't identify as having a chronic pain condition."

# # Create a mask for rows where 'personalhistory' column's value is exactly the phrase_to_exclude
# mask = df_transformed['PersonalHistory'] != phrase_to_exclude

# # Apply the mask to filter out the rows
# df_transformed_filtered = df_transformed[mask]

# # Display the information and the first 3 rows of the filtered DataFrame
# print(df_transformed_filtered.info())
# df_transformed_filtered.head(3)

In [44]:
# # Apply the transformations for LDA for v2
# df_transformed_filtered = df_transformed_filtered.drop(columns=to_drop, errors='ignore')

# df_split_2, lda_details_2 = splitting_into_topics(df_transformed_filtered,topics_per_column,textual_columns)
# preprocessor_2 = build_preprocessor(textual_columns, categorical_columns, datetime_columns)
# df_LDA_2 = preprocessor_2.fit_transform(df_split_2)

In [45]:
# transformed_columns_2 = preprocessor_2.get_feature_names_out()
# df_final_2 = pd.DataFrame(df_LDA_2, columns=transformed_columns_2)
# df_final_2 = df_final_2.apply(pd.to_numeric)

# print(df_final_2.info())
# df_final_2.head(3)

In [46]:
lda_ph =lda_details['PersonalHistory']['lda']
x_ph = lda_details['PersonalHistory']['X']
vect_ph = lda_details['PersonalHistory']['vect']

lda_visual(lda_ph, x_ph, vect_ph)

In [47]:
lda_m =lda_details['Motivation']['lda']
x_m = lda_details['Motivation']['X']
vect_m = lda_details['Motivation']['vect']

lda_visual(lda_m, x_m, vect_m)

In [48]:
transformed_columns = preprocessor.get_feature_names_out()
df_final = pd.DataFrame(df_LDA, columns=transformed_columns)
df_final = df_final.apply(pd.to_numeric)

print(df_final.info())
df_final.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3260 entries, 0 to 3259
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   num__Days_Since_EarliestCourse     3260 non-null   float64
 1   num__AgeAtCourse                   3260 non-null   float64
 2   remainder__PersonalHistory_Topic0  3260 non-null   float64
 3   remainder__PersonalHistory_Topic1  3260 non-null   float64
 4   remainder__Motivation_Topic0       3260 non-null   float64
 5   remainder__Motivation_Topic1       3260 non-null   float64
dtypes: float64(6)
memory usage: 152.9 KB
None


Unnamed: 0,num__Days_Since_EarliestCourse,num__AgeAtCourse,remainder__PersonalHistory_Topic0,remainder__PersonalHistory_Topic1,remainder__Motivation_Topic0,remainder__Motivation_Topic1
0,0.392517,-0.888889,0.423102,0.576898,0.606181,0.393819
1,-0.109616,1.333333,0.416068,0.583932,0.276188,0.723812
2,-0.109616,-1.222222,0.360618,0.639382,0.298329,0.701671


In [49]:
# corrolation stuff
# df_new = remove_low_variance_features(df_final)
# df_new_2 = remove_high_correlation_features(df_new)

# corr_df = df_final.corr()
# for idx, col in corr_df.iterrows():
#     if abs(col) >= 0.25 :
#         print(col)

# corr_plot(df_final)

# # PCA transform
# df_proj, labels = transform_data(df_final, 4, 2)

# print(df_proj.info())
# df_proj.head(3)

In [50]:
# plot_clusters(df_proj, labels)

In [51]:
col1a='remainder__PersonalHistory_Topic0'
col2a='remainder__Motivation_Topic0'
col1b='remainder__PersonalHistory_Topic1'
col2b='remainder__Motivation_Topic1'
col1c='remainder__PersonalHistory_Topic2'
col2c='remainder__Motivation_Topic2'

In [61]:
df_3d = df_final[[col1a,col2a,col2b]]
labelling_3d = fit_kmeans_and_label(df_3d,4)
label_dataframe(df_3d, labelling_3d)
print(df_3d.info())
df_3d.head(3)





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3260 entries, 0 to 3259
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   remainder__PersonalHistory_Topic0  3260 non-null   float64
 1   remainder__Motivation_Topic0       3260 non-null   float64
 2   remainder__Motivation_Topic1       3260 non-null   float64
dtypes: float64(3)
memory usage: 76.5 KB
None


Unnamed: 0,remainder__PersonalHistory_Topic0,remainder__Motivation_Topic0,remainder__Motivation_Topic1
0,0.423102,0.606181,0.393819
1,0.416068,0.276188,0.723812
2,0.360618,0.298329,0.701671


In [53]:
# pd.concat([df_3d,df_final[['PersonalHistory']]])

In [62]:
plot_clusters_3d(df_3d,labelling_3d)


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [55]:
# df_2d = df_final[[col1b,col2a]]
# labelling = fit_kmeans_and_label(df_2d,4)
# label_dataframe(df_2d, labelling)
# print(df_2d.info())
# df_2d.head(3)

In [56]:
# plot_clusters_2d(df_2d,labelling)

In [57]:
column_pairs = [
    (col1a, col2a),
    (col1a, col2b),
    # (col1a, col2c),
    (col1b, col2a),
    (col1b, col2b),
    # (col1b, col2c),
    # (col1c, col2a),
    # (col1c, col2b),
    # (col1c, col2c),
]

In [58]:
plot_lda(df_final,column_pairs)












distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [59]:
# plot_lda(df_final_2,column_pairs)

In [60]:
avatars = {}
df_labelled = pd.concat([df_transformed,pd.Series(labelling)],axis=1).rename(columns={0:"label"})

for numero_cluster in np.unique(labelling):
    avatars[numero_cluster] = df_labelled[df_labelled.label == numero_cluster]

for key,value in avatars.items():
    print("-"*50)
    print(f"Here are some people fitting into Avatar {key}")
    print("-"*50)
    display(value.sample(10))

NameError: name 'labelling' is not defined

In [None]:
sys.exit()

SystemExit: 


To exit: use 'exit', 'quit', or Ctrl-D.



In [None]:
# df['Location'] = df['Location'].apply(clean_text)

In [None]:
# def get_location_category(location):
#     if 'manchester' in location:
#         return 'Manchester'
#     elif 'liverpool' in location or 'merseyside' in location:
#         return 'Liverpool'
#     elif 'london' in location:
#         return 'London'
#     elif 'united states' in location or 'utah' in location:
#         return 'United States'
#     elif 'denmark' in location or 'croatia' in location or 'poland' in location or 'norway' in location or 'germany' in location or 'barcelona' in location:
#         return 'EUR'
#     elif 'australia' in location:
#         return 'Australia'
#     elif 'india' in location or 'maldives' in location:
#         return 'SAsia'
#     elif 'uruguay' in location:
#         return 'SAmerica'
#     elif 'united kingdom' in location:
#         return 'UK'
#     else:
#         return 'England'

# df_drop['Location_Category'] = df_drop['Location'].apply(get_location_category)
# df_drop.Location_Category.value_counts()


In [None]:
# class TextCleaner(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         cleaned_data = X.applymap(self.clean_text)
#         return cleaned_data

#     def clean_text(self, text):
#         text = str(text)
#         for punctuation in string.punctuation:
#             text = text.replace(punctuation, ' ')  # Remove Punctuation
#         lowercased = text.lower()  # Lower Case
#         tokenized = word_tokenize(lowercased)  # Tokenize
#         words_only = [word for word in tokenized if word.isalpha()]  # Remove numbers

#         stop_words = set(stopwords.words('english'))
#         stop_words.update(['yes','none','nan'])

#         without_stopwords = [word for word in words_only if not word in stop_words]  # Remove Stop Words
#         lemma = WordNetLemmatizer()  # Initiate Lemmatizer
#         lemmatized = [lemma.lemmatize(word) for word in without_stopwords]  # Lemmatize
#         cleaned = ' '.join(lemmatized)  # Join back to a string
#         return cleaned

In [None]:
# pca = PCA()
# pca.fit(df_num)
# threhsold_pca = 4
# with plt.style.context('seaborn-deep'):
#     # figsize
#     plt.figure(figsize=(10,6))
#     # getting axes
#     ax = plt.gca()
#     # plotting
#     explained_variance_ratio_cumulated = np.cumsum(pca.explained_variance_ratio_)
#     x_axis_ticks = np.arange(1,explained_variance_ratio_cumulated.shape[0]+1)
#     ax.plot(x_axis_ticks,explained_variance_ratio_cumulated,label="cumulated variance ratio",color="purple",linestyle=":",marker="D",markersize=10)
#     # customizing
#     ax.set_xlabel('Number of Principal Components')
#     ax.set_ylabel('% cumulated explained variance')
#     ax.legend(loc="upper left")
#     ax.set_title('The Elbow Method')
#     ax.set_xticks(x_axis_ticks)
#     ax.scatter(threhsold_pca,explained_variance_ratio_cumulated[threhsold_pca-1],c='blue',s=400)
#     ax.grid(axis="x",linewidth=0.5)
#     ax.grid(axis="y",linewidth=0.5)

In [None]:
# fig_scaled = px.scatter_3d(df_proj, x = 0, y = 1, z = 2, opacity=0.7, width=500, height=500)
# fig_scaled.show()

In [None]:
# nb_clusters_to_try = np.arange(1,10+1,1)

In [None]:
# wcss = []
# for K in nb_clusters_to_try:
#     kmeans = KMeans(n_clusters = K)
#     kmeans.fit(df_proj)
#     wcss.append(kmeans.inertia_)

In [None]:
# elbow_highlight = 3
# with plt.style.context('seaborn-deep'):
#     # figsize
#     plt.figure(figsize=(20,10))
#     # getting axes
#     ax = plt.gca()
#     # plotting
#     ax.plot(nb_clusters_to_try, wcss,color="blue",linestyle=":",marker="D",label="Inertia")
#     # customizing
#     ax.legend(loc="upper right")
#     ax.set_title('The Elbow Method')
#     ax.set_xticks(nb_clusters_to_try)
#     ax.set_xlabel('Number of clusters')
#     ax.set_ylabel('Within-Cluster Sums of Squares')
#     ax.scatter(elbow_highlight,wcss[elbow_highlight-1],c='red',s=400)

#     ax.grid(axis="y",linewidth=0.5)
#     plt.show()