# Importing the goods

In [36]:
import pandas as pd
import os

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# The stop-word corpus has to exist locally before stopwords.words() can read it;
# without this call a fresh environment raises LookupError on the next line.
# quiet=True keeps the import cell free of download logging.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [39]:
# Fetch the NLTK data packages this notebook relies on. The value of the last
# call is what the cell displays as its result.
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('punkt')  # presumably the tokenizer data word_tokenize needs — confirm for the installed NLTK version
nltk.download('wordnet')  # presumably the lexicon behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')  # presumably companion data required by wordnet — confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jonnyoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jonnyoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonnyoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonnyoh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Loading the data file

In [40]:
# Input CSV, read from the 'raw_data' folder that sits next to the current
# working directory. Alternative input used previously: 'omfh.csv'.
filename = 'combined_courses3.csv'
data_folder_path = os.path.join(os.getcwd(), os.pardir, 'raw_data')
file_path = os.path.join(data_folder_path, filename)

df = pd.read_csv(file_path)


In [41]:
def save_df_to_csv(df, filename):
    """Write ``df`` as CSV (without the index) into the current working directory.

    Args:
        df: DataFrame to persist.
        filename: Name of the target file; it is joined onto ``os.getcwd()``.

    Prints the full path of the file that was written. Returns nothing.
    """
    destination = os.path.join(os.getcwd(), filename)
    df.to_csv(destination, index=False)
    print(f"DataFrame saved to {destination}")

In [42]:
# Write a copy of the loaded frame into the current working directory, reusing the input file name.
save_df_to_csv(df,filename)

DataFrame saved to /home/jonnyoh/code/cipobt/breathworks/notebooks/combined_courses3.csv


In [43]:
# Column index of the loaded frame.
# NOTE(review): `columns` is not referenced in the cells visible here — confirm it is still needed.
columns = df.columns

In [44]:
# Column groups of the dataset, split by the kind of value each column holds.
# Free-text answers: these are the columns that get cleaned and topic-modelled.
textual = [
    'PersonalHistory',
    'Motivation',
    'ReferralSource',
    'Daily20MPractice',
]
# (Spelling of the name kept as-is because other cells may refer to it.)
catagorical = [
    'CourseType',
    'Gender',
    'Ethnicity',
    'Location',
]
# NOTE(review): this name would shadow the stdlib `datetime` module if it were imported.
datetime = [
    'CourseDate',
    'EnrollmentDate',
    'DoB',
]
# Everything that is not free text.
non_textual = [*catagorical, *datetime]

# Define the Text cleaner

In [45]:
class TextCleaner(BaseEstimator, TransformerMixin):
    """Element-wise text normaliser usable as a scikit-learn transformer.

    ``transform`` runs :meth:`clean_text` on every cell of the DataFrame it is
    given. Nothing is learned from the data, so ``fit`` is a no-op.
    """

    # Removed in addition to NLTK's English stop words. str() renders missing
    # values (NaN / None) as 'nan' / 'None', so dropping these tokens also
    # blanks out cells that held no text.
    EXTRA_STOP_WORDS = ('yes', 'none', 'nan')

    # Translation table turning every ASCII punctuation character into a space.
    _PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame shaped like ``X`` with every cell cleaned.

        Args:
            X: pandas DataFrame whose cells are cleaned one by one.
            y: Ignored.
        """
        # DataFrame.applymap is deprecated since pandas 2.1. Mapping each column
        # element-wise gives the same result on both old and new pandas.
        cleaned_data = X.apply(lambda column: column.map(self.clean_text))
        return cleaned_data

    def _language_resources(self):
        """Return ``(stop_words, lemmatizer)``, building them on first use only.

        Reading the stop-word corpus and creating a lemmatizer for every single
        cell is wasted work; both are identical for all cells.
        """
        if getattr(self, '_stop_words', None) is None:
            self._stop_words = set(stopwords.words('english')).union(self.EXTRA_STOP_WORDS)
            self._lemmatizer = WordNetLemmatizer()
        return self._stop_words, self._lemmatizer

    def clean_text(self, text):
        """Normalise one value into a space-separated string of lemmatised words.

        Steps: convert to ``str``, replace punctuation with spaces, lowercase,
        tokenise, keep purely alphabetic tokens, drop stop words, lemmatise.

        Args:
            text: Any value; it is passed through ``str()`` first.

        Returns:
            The cleaned string (empty when no token survives).
        """
        stop_words, lemmatizer = self._language_resources()

        without_punctuation = str(text).translate(self._PUNCTUATION_TO_SPACE)
        tokens = word_tokenize(without_punctuation.lower())
        # isalpha() discards numbers and any token mixing letters with other characters.
        kept = [word for word in tokens if word.isalpha() and word not in stop_words]
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

In [46]:
# Only the free-text columns go through TextCleaner; all remaining columns
# are forwarded unchanged.
preprocessor = ColumnTransformer(
    [('text', TextCleaner(), textual)],
    remainder='passthrough',
)

### Create the new dataframe

In [47]:
# The transformer output carries no column labels, so they are rebuilt here.
# This assumes the cleaned text columns come first, followed by the
# passthrough columns in their original order — confirm against the
# ColumnTransformer documentation if the transformer list changes.
df_transformed = preprocessor.fit_transform(df)
transformed_columns = textual + [col for col in df.columns if col not in textual]
# Reuse the source index so rows of df_cleaned stay aligned with df.
df_cleaned = pd.DataFrame(df_transformed, columns=transformed_columns, index=df.index)
# Keep the cleaned text (and any column not listed in non_textual) for topic modelling.
df_dropped = df_cleaned.drop(columns=non_textual)

In [48]:
# Word-count vectorizer reused (and refit) by the topic-modelling cells;
# stop_words='english' selects scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

### Print the topics

In [49]:
def print_topics(model, vectorizer, num_topics=3, num_words=10):
    """Print the highest-weighted words of every topic of a fitted topic model.

    For each row of ``model.components_`` a ``Topic <n>:`` header is printed,
    followed by a list of ``(word, weight)`` tuples in descending weight order.

    Args:
        model: Object exposing ``components_``, one row of word weights per topic.
        vectorizer: Object exposing ``get_feature_names_out()``; position ``i`` of
            its result names the word behind weight ``i``.
        num_topics: Not used — every topic in ``model.components_`` is printed.
            Kept so existing calls that pass it continue to work.
        num_words: How many top words to show per topic.
    """
    # Fetch the vocabulary once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-num_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])


In [50]:
for column in textual:
    # Word counts for this column only; the shared vectorizer is refit on every pass.
    data_vectorized = vectorizer.fit_transform(df_dropped[column].values.astype('U'))

    # Initialize and fit the LDA model. The seed is fixed so repeated runs
    # print the same topics, matching the seeded combined-text model.
    lda_model = LatentDirichletAllocation(n_components=2, random_state=0)
    lda_vectors = lda_model.fit_transform(data_vectorized)

    # Print the topics found by the LDA model
    print(f"____________________Topics for {column}:_____________________")
    print_topics(lda_model, vectorizer)

____________________Topics for PersonalHistory:_____________________
Topic 0:
[('pain', 2098.4634167563845), ('condition', 2082.697315811877), ('chronic', 2031.201665968884), ('currently', 1825.1894254610982), ('identify', 1731.1338242147767), ('syndrome', 29.985498164230346), ('fibromyalgia', 28.922260090014028), ('fatigue', 25.348274817955772), ('debilitating', 17.876284892257047), ('migraine', 17.231204837177213)]
Topic 1:
[('depression', 952.5171401161873), ('anxiety', 810.6642378003028), ('pain', 480.53658324360447), ('year', 448.3784845639723), ('health', 426.04646326820676), ('experienced', 379.44909356532145), ('experience', 371.367224257769), ('month', 352.45722893103544), ('mental', 331.1912105360553), ('debilitating', 324.1237151077334)]
____________________Topics for Motivation:_____________________
Topic 0:
[('mindfulness', 1832.500541750095), ('course', 1590.46564732921), ('teacher', 861.996040152912), ('practice', 762.0346737509859), ('training', 714.3433375027163), ('li

In [51]:
# One document per row: join that row's non-missing text fields with spaces.
combined_text = df_dropped[textual].apply(
    lambda row: ' '.join(row.dropna().values.astype('U')),
    axis=1,
)
# Refit the shared vectorizer on the combined documents.
data_vectorized = vectorizer.fit_transform(combined_text)

# Seven topics over the combined text, with a fixed seed.
lda_model = LatentDirichletAllocation(random_state=0, n_components=7)
lda_vectors = lda_model.fit_transform(data_vectorized)

# Modified print_topics function
def print_topics(model, vectorizer, num_words=10):
    """Print the highest-weighted words of every topic of a fitted topic model.

    For each row of ``model.components_`` a ``Topic <n>:`` header is printed,
    followed by a list of ``(word, weight)`` tuples in descending weight order.

    Args:
        model: Object exposing ``components_``, one row of word weights per topic.
        vectorizer: Object exposing ``get_feature_names_out()``; position ``i`` of
            its result names the word behind weight ``i``.
        num_words: How many top words to show per topic.
    """
    # Fetch the vocabulary once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-num_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

# Show the top words of each topic fitted on the combined text of all textual columns.
print("____________________Combined Topics:_____________________")
print_topics(model=lda_model, vectorizer=vectorizer)

____________________Combined Topics:_____________________
Topic 0:
[('level', 30.444018749889697), ('stress', 28.459953873615927), ('help', 22.510444166322326), ('time', 18.20889720755411), ('medication', 16.05968327517491), ('diagnosed', 14.438841008658946), ('group', 13.235709713041528), ('suffered', 12.730361680436662), ('need', 12.671311511625706), ('thing', 12.264988080969657)]
Topic 1:
[('pain', 377.25465239943145), ('month', 145.83705867721366), ('experienced', 140.37163625163092), ('persistent', 136.62741357732955), ('lasted', 128.82662265184243), ('year', 127.35155396402565), ('help', 72.50116053691669), ('feel', 71.94432611292183), ('time', 71.52398219827296), ('like', 71.3543917102478)]
Topic 2:
[('mindfulness', 2012.876149947978), ('course', 1531.3735743381533), ('practice', 1210.6285605525848), ('teacher', 704.2978471957664), ('currently', 673.1955012536064), ('day', 644.9272699663189), ('pain', 635.5770667326938), ('able', 614.2426864233403), ('condition', 604.44901923964