**importing the goods**

In [101]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

In [133]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
!pip install nltk



In [104]:
import string
import nltk
string.punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [105]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jonnyoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jonnyoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonnyoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jonnyoh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

**Getting our dataframe**

In [106]:
filename = 'omfh.csv'
data_folder_path = os.path.join(os.getcwd(), '..', 'raw_data')
file_path = os.path.join(data_folder_path, filename)

df = pd.read_csv(file_path)


In [107]:
def save_df_to_csv(df, filename):
    """ Saves the given DataFrame to a CSV file in the current directory"""
    file_path = os.path.join(os.getcwd(), filename)

    df.to_csv(file_path, index=False)
    print(f"DataFrame saved to {file_path}")

In [108]:
save_df_to_csv(df,filename)

DataFrame saved to /home/jonnyoh/code/cipobt/breathworks/notebooks/omfh.csv


In [109]:
columns = df.columns

In [140]:
textual = ['PersonalHistory', 'Motivation', 'ReferralSource', 'Daily20MPractice']
catagorical = ['CourseType', 'Gender', 'Ethnicity', 'Location']
datetime = ['CourseDate', 'EnrollmentDate', 'DoB']
non_textual = catagorical + datetime

In [112]:
class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        cleaned_data = X.applymap(self.clean_text)
        return cleaned_data

    def clean_text(self, text):
        text = str(text)
        for punctuation in string.punctuation:
            text = text.replace(punctuation, ' ')  # Remove Punctuation
        lowercased = text.lower()  # Lower Case
        tokenized = word_tokenize(lowercased)  # Tokenize
        words_only = [word for word in tokenized if word.isalpha()]  # Remove numbers

        stop_words = set(stopwords.words('english'))
        stop_words.update(['yes','none'])

        without_stopwords = [word for word in words_only if not word in stop_words]  # Remove Stop Words
        lemma = WordNetLemmatizer()  # Initiate Lemmatizer
        lemmatized = [lemma.lemmatize(word) for word in without_stopwords]  # Lemmatize
        cleaned = ' '.join(lemmatized)  # Join back to a string
        return cleaned

In [113]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TextCleaner(), textual)
    ],
    remainder='passthrough'  # Non-specified columns will be passed through unchanged
)

In [114]:
df_transformed = preprocessor.fit_transform(df)

In [138]:
transformed_columns = textual + [col for col in df.columns if col not in textual]

In [143]:
df_cleaned = pd.DataFrame(df_transformed, columns=transformed_columns)
df_cleaned

Unnamed: 0,PersonalHistory,Motivation,ReferralSource,Daily20MPractice,CourseDate,CourseType,EnrollmentDate,Gender,Ethnicity,DoB,Location,Communications
0,issue fatigue stress currently identify chroni...,occupational therapist working nh patient long...,website,,2024-04-03,OMfH,2024-02-22,Female,"White (including White British, Irish, Gypsy o...",08/03/1967,"Magdalen Square, Liverpool","Ongoing support communications sign up, I woul..."
1,migraine disc hernia neck undiagnosed knee pai...,joining course hope cn find useful technique h...,internet search,,2024-06-02,OMfH,2024-02-22,Female,"White (including White British, Irish, Gypsy o...",29/11/1981,,"Ongoing support communications sign up, I woul..."
2,auto immune issue experience frequent gastric ...,always high stress consistently advised someth...,internet search,,2024-03-07,OMfH,2024-02-20,Female,"White (including White British, Irish, Gypsy o...",07/03/1975,,Ongoing support communications sign up
3,fm long covid experienced persistent pain last...,looking forward joining course fm also long co...,work work colleague,,2024-03-07,OMfH,2024-02-20,Female,"White (including White British, Irish, Gypsy o...",09/06/1974,Merseyside,"Ongoing support communications sign up, I woul..."
4,experienced persistent pain lasted least last ...,suffering chronic pain hope manage better,book,,2024-03-07,OMfH,2024-02-19,Female,"White (including White British, Irish, Gypsy o...",03/09/1966,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1393,currently identify chronic pain condition,practicing mindfulness since mid also teaching...,website,,2015-01-12,OMfH,2014-12-12,,,,,
1394,currently identify chronic pain condition,taking course pre requisite breathworks course...,friend family colleague,,2015-01-12,OMfH,2014-12-12,,,,,
1395,currently identify chronic pain condition,would like attend breathworks training given c...,friend family colleague,,2015-01-12,OMfH,2014-12-10,,,,,
1396,currently identify chronic pain condition,meditating two year inconsistently using cd bo...,website,,2015-02-02,OMfH,2014-11-24,,,,,


In [144]:
df_dropped = df_cleaned.drop(columns=non_textual)

In [146]:
df_dropped

Unnamed: 0,PersonalHistory,Motivation,ReferralSource,Daily20MPractice,Communications
0,issue fatigue stress currently identify chroni...,occupational therapist working nh patient long...,website,,"Ongoing support communications sign up, I woul..."
1,migraine disc hernia neck undiagnosed knee pai...,joining course hope cn find useful technique h...,internet search,,"Ongoing support communications sign up, I woul..."
2,auto immune issue experience frequent gastric ...,always high stress consistently advised someth...,internet search,,Ongoing support communications sign up
3,fm long covid experienced persistent pain last...,looking forward joining course fm also long co...,work work colleague,,"Ongoing support communications sign up, I woul..."
4,experienced persistent pain lasted least last ...,suffering chronic pain hope manage better,book,,
...,...,...,...,...,...
1393,currently identify chronic pain condition,practicing mindfulness since mid also teaching...,website,,
1394,currently identify chronic pain condition,taking course pre requisite breathworks course...,friend family colleague,,
1395,currently identify chronic pain condition,would like attend breathworks training given c...,friend family colleague,,
1396,currently identify chronic pain condition,meditating two year inconsistently using cd bo...,website,,


In [154]:
vectorizer = CountVectorizer()
df_dropped.PersonalHistory

0       issue fatigue stress currently identify chroni...
1       migraine disc hernia neck undiagnosed knee pai...
2       auto immune issue experience frequent gastric ...
3       fm long covid experienced persistent pain last...
4       experienced persistent pain lasted least last ...
                              ...                        
1393            currently identify chronic pain condition
1394            currently identify chronic pain condition
1395            currently identify chronic pain condition
1396            currently identify chronic pain condition
1397            currently identify chronic pain condition
Name: PersonalHistory, Length: 1398, dtype: object

In [155]:
def print_topics(model, vectorizer, num_topics=3, num_words=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
               for i in topic.argsort()[:-num_words - 1:-1]])


In [159]:
for column in textual:
    # Transform the data
    data_vectorized = vectorizer.fit_transform(df[column].values.astype('U'))

    # Initialize and fit the LDA model
    lda_model = LatentDirichletAllocation(n_components=3, random_state=0)
    lda_vectors = lda_model.fit_transform(data_vectorized)

    # Print the topics found by the LDA model
    print(f"____________________Topics for {column}:_____________________")
    print_topics(lda_model, vectorizer)

!!!***~owo~_____________Topics for PersonalHistory:______________~owo~***!!!
Topic 0:
[('no', 1278.5176865947878), ('condition', 1234.770191432579), ('don', 1224.6630685452412), ('chronic', 1214.56581976115), ('as', 1177.6361010073347), ('currently', 1171.139067297414), ('having', 1170.136342778875), ('identify', 1151.04518953096), ('pain', 1107.9812979853448), ('none', 17.171189481030193)]
Topic 1:
[('or', 436.21233553146703), ('depression', 261.3022154633636), ('health', 246.11451454998672), ('mental', 238.45449750350954), ('acute', 229.29154181766873), ('pain', 228.89819059255817), ('debilitating', 224.47089435761296), ('experience', 223.46187192984056), ('other', 221.42972120240455), ('any', 211.4599299741606)]
Topic 2:
[('and', 924.0229084276606), ('the', 692.171150773056), ('have', 638.5834408659448), ('to', 569.0812238148733), ('my', 477.7134232451674), ('pain', 457.120511422088), ('of', 439.8763404826158), ('in', 347.7425367414486), ('for', 345.6898726294539), ('with', 340.4535

In [162]:
example = ["I am looking for more help with my a connection between my mind and body, and maybe for someone to teach me ways to stay more calm and mindfull, i don't suffer from that much physical pain"]