In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [None]:
# Fetch the NLTK corpora this notebook relies on: 'stopwords' is read via
# stopwords.words('english') and 'wordnet' is used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:

# Read the combined survey responses into a DataFrame.
# NOTE(review): absolute path under /content/drive — presumably a mounted
# Google Drive folder; adjust when running in another environment.
file_path = '/content/drive/MyDrive/Heat Waves/survey_combined -final.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Full Name,Date of Birth,Gender,Current Weight (kg),Health Conditions,Allergies,General Health,Diuretics Medication,Diuretics Name & Dosage,Beta-Blockers Medication,Beta-Blockers Name & Dosage,Other Medications,Medical Records Consent,Typical Workday in Heatwave,Determine Water Intake,Medication Effect on Hydration,Feel When Dehydrated,Severe Dehydration Experience,Hydration Habits,Tools for Monitoring Hydration
0,Sharmila_201,4/8/2001,Other,98,Hypertension;Diabetes,Yes,Poor,No,Med_C 10mg,Yes,Med_Z 10mg,Med_Q 25mg,Yes,"I'm mostly indoors, but I do feel the heat int...",I have a smart bottle that reminds me to drink...,I haven't noticed any significant changes.,My skin feels dry and I get a headache.,I felt extremely weak and had to take a day of...,I drink more water in the morning and evening....,I have a marked water bottle that shows hourly...
1,Nimal_202,11/2/1991,Female,47,Heart Failure,Yes,Very Good,Yes,,Yes,Med_Z 10mg,Med_Q 25mg,No,I usually work outdoors for about 6 hours. The...,I carry a 1-liter bottle and ensure I refill i...,I haven't noticed any significant changes.,I feel tired and have difficulty concentrating.,I had severe cramps and had to be hospitalized...,I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...
2,Rohan_203,5/1/1985,Other,107,None;Hypertension;Diabetes,Yes,Poor,No,Med_A 50mg,Yes,,Med_Q 25mg,No,I usually work outdoors for about 6 hours. The...,I carry a 1-liter bottle and ensure I refill i...,I haven't noticed any significant changes.,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...
3,Rohan_204,11/11/1964,Female,80,Diabetes;Hypertension;Heart Failure,Yes,Good,Yes,Med_A 50mg,Yes,Med_Y 20mg,Med_R 15mg,No,I usually work outdoors for about 6 hours. The...,I have a smart bottle that reminds me to drink...,I feel more thirsty when I'm on my medication ...,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...
4,Priyanka_205,19/2/1966,Male,73,Kidney Disorders,No,Poor,Yes,Med_A 50mg,Yes,,Med_R 15mg,Yes,"I'm mostly indoors, but I do feel the heat int...","I rely on my thirst. When I feel parched, I dr...",I haven't noticed any significant changes.,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I use a mobile app that reminds me to drink wa...


In [None]:
df.columns

Index(['Full Name', 'Date of Birth', 'Gender', 'Current Weight (kg)',
       'Health Conditions', 'Allergies', 'General Health',
       'Diuretics Medication', 'Diuretics Name & Dosage',
       'Beta-Blockers Medication', 'Beta-Blockers Name & Dosage',
       'Other Medications', 'Medical Records Consent',
       'Typical Workday in Heatwave', 'Determine Water Intake',
       'Medication Effect on Hydration', 'Feel When Dehydrated',
       'Severe Dehydration Experience', 'Hydration Habits',
       'Tools for Monitoring Hydration'],
      dtype='object')

In [None]:
# Discard rows in which every column is missing; rows with at least one
# value are kept.

df = df.dropna(axis='index', how='all')


In [None]:
df.shape

(100, 20)

In [None]:
# Free-text survey answers.
text_columns = [
    'Typical Workday in Heatwave',
    'Determine Water Intake',
    'Medication Effect on Hydration',
    'Feel When Dehydrated',
    'Severe Dehydration Experience',
    'Hydration Habits',
    'Tools for Monitoring Hydration',
]
# Yes/No style answers that get one-hot encoded.
categorical_fields = [
    'Gender',
    'Diuretics Medication',
    'Beta-Blockers Medication',
    'Medical Records Consent',
]
# NOTE(review): despite its name, `numerical_fields` also carries the free-text
# columns, so X temporarily holds raw text; a later cell replaces those columns
# with topic features and drops them from X.
numerical_fields = ['Current Weight (kg)'] + text_columns

# Slice the two column groups out of the survey frame.
numerical_data = df[numerical_fields]
categorical_data = df[categorical_fields]

# Expand every categorical answer into indicator columns.
categorical_data_encoded = pd.get_dummies(categorical_data)

# Feature matrix: weight/text columns first, indicator columns to the right.
X = pd.concat([numerical_data, categorical_data_encoded], axis=1)

In [None]:
# Column groups by data type, re-declared for this cell.
# Rebinding `numerical_fields` here does not alter X, which was already
# assembled in the previous cell.
numerical_fields = ['Current Weight (kg)']
categorical_fields = [
    'Gender',
    'Diuretics Medication',
    'Beta-Blockers Medication',
    'Medical Records Consent',
]
text_columns = [
    'Typical Workday in Heatwave', 'Determine Water Intake',
    'Medication Effect on Hydration', 'Feel When Dehydrated',
    'Severe Dehydration Experience', 'Hydration Habits',
    'Tools for Monitoring Hydration',
]

# Function to preprocess text data
_TEXT_RESOURCES = None  # lazily-built (stop-word set, lemmatizer) pair


def _get_text_resources():
    """Build the NLTK stop-word set and lemmatizer once and reuse them."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Normalise one free-text answer for vectorisation.

    Steps: lowercase, strip every character that is not a letter or
    whitespace, drop English stop words, lemmatize the remaining tokens.

    Parameters
    ----------
    text : object
        A single cell value. Non-string values are converted with ``str``;
        missing values (None / NaN / pd.NA) yield an empty string rather than
        the literal token "nan".

    Returns
    -------
    str
        Space-separated lemmatized tokens.
    """
    # A missing answer is an empty document, not the word "nan".
    if pd.isna(text):
        return ''
    # Lowercasing
    text = str(text).lower()
    # Remove non-alphabetic characters.
    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "haven't" becomes "havent" and may no longer match
    # the stop list — confirm whether that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # The stop-word set and lemmatizer are cached instead of rebuilt per call.
    stop_words, lemmatizer = _get_text_resources()
    # Remove stopwords, then lemmatize what is left.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Enhanced extract_topics function with preprocessing
def extract_topics(data, n_topics=3, n_features=1000):
    """Return the LDA topic mixture of every document in a text column.

    Parameters
    ----------
    data : pandas.Series
        Free-text answers (called with ``df[column]``); each value is passed
        through ``preprocess_text`` first.
    n_topics : int, default 3
        Number of LDA components.
    n_features : int, default 1000
        Vocabulary cap forwarded to the vectorizer as ``max_features``.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per topic.
    """
    cleaned = data.apply(preprocess_text)

    # TF-IDF weighting down-weights terms that occur in many answers.
    # NOTE(review): LDA is usually fitted on raw term counts; TF-IDF input is
    # kept here to preserve the existing results — confirm this is intended.
    tfidf = TfidfVectorizer(stop_words='english', max_features=n_features)
    doc_term = tfidf.fit_transform(cleaned)

    # Fit the topic model and project the same documents in one step.
    topic_model = LatentDirichletAllocation(random_state=42, n_components=n_topics)
    return topic_model.fit_transform(doc_term)

# Replace each free-text column in X by its topic-mixture columns,
# named '<column>_Topic_<i>'.
for column in text_columns:
    topics = extract_topics(df[column])
    for i in range(topics.shape[1]):
        X[f'{column}_Topic_{i}'] = topics[:, i]

# errors='ignore' keeps this cell re-runnable: on a second execution the raw
# text columns are already gone and a plain drop would raise KeyError.
X = X.drop(columns=text_columns, errors='ignore')



In [None]:
# `topics` is the variable left over from the loop in the previous cell, so
# this shows the topic mixture of the last text column processed only.
topics

array([[0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10214078, 0.80004323, 0.09781599],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10214078, 0.80004323, 0.09781599],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10214078, 0.80004323, 0.09781599],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.79848189, 0.10162098, 0.09989713],
       [0.10582609, 0.1037842 , 0.79038971],
       [0.

In [None]:
X.columns

Index(['Current Weight (kg)', 'Gender_Female', 'Gender_Male', 'Gender_Other',
       'Gender_Prefer not to say', 'Diuretics Medication_No',
       'Diuretics Medication_Yes', 'Beta-Blockers Medication_No',
       'Beta-Blockers Medication_Yes', 'Medical Records Consent_No',
       'Medical Records Consent_Yes', 'Typical Workday in Heatwave_Topic_0',
       'Typical Workday in Heatwave_Topic_1',
       'Typical Workday in Heatwave_Topic_2', 'Determine Water Intake_Topic_0',
       'Determine Water Intake_Topic_1', 'Determine Water Intake_Topic_2',
       'Medication Effect on Hydration_Topic_0',
       'Medication Effect on Hydration_Topic_1',
       'Medication Effect on Hydration_Topic_2',
       'Feel When Dehydrated_Topic_0', 'Feel When Dehydrated_Topic_1',
       'Feel When Dehydrated_Topic_2', 'Severe Dehydration Experience_Topic_0',
       'Severe Dehydration Experience_Topic_1',
       'Severe Dehydration Experience_Topic_2', 'Hydration Habits_Topic_0',
       'Hydration Habits_

In [None]:
# Standardize the data so every feature contributes on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-Means clustering to the scaled features.
# n_init is pinned explicitly: its default changed across scikit-learn
# releases (10 -> 'auto'), which would otherwise alter the cluster assignment
# between environments and emit a FutureWarning on intermediate versions.
kmeans_combined = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_combined.fit(X_scaled)  # Use X_scaled directly

# Store the cluster id of each respondent as the predicted body water level.
# NOTE(review): K-Means ids are arbitrary labels; nothing here establishes
# which id corresponds to a low / medium / high level.
df['Predicted Body Water Level Combined'] = kmeans_combined.labels_

# Check the distribution of predicted body water levels with combined features
print(df['Predicted Body Water Level Combined'].value_counts())




1    48
2    29
0    23
Name: Predicted Body Water Level Combined, dtype: int64


In [None]:
df.head()

Unnamed: 0,Full Name,Date of Birth,Gender,Current Weight (kg),Health Conditions,Allergies,General Health,Diuretics Medication,Diuretics Name & Dosage,Beta-Blockers Medication,...,Other Medications,Medical Records Consent,Typical Workday in Heatwave,Determine Water Intake,Medication Effect on Hydration,Feel When Dehydrated,Severe Dehydration Experience,Hydration Habits,Tools for Monitoring Hydration,Predicted Body Water Level Combined
0,Sharmila_201,4/8/2001,Other,98,Hypertension;Diabetes,Yes,Poor,No,Med_C 10mg,Yes,...,Med_Q 25mg,Yes,"I'm mostly indoors, but I do feel the heat int...",I have a smart bottle that reminds me to drink...,I haven't noticed any significant changes.,My skin feels dry and I get a headache.,I felt extremely weak and had to take a day of...,I drink more water in the morning and evening....,I have a marked water bottle that shows hourly...,0
1,Nimal_202,11/2/1991,Female,47,Heart Failure,Yes,Very Good,Yes,,Yes,...,Med_Q 25mg,No,I usually work outdoors for about 6 hours. The...,I carry a 1-liter bottle and ensure I refill i...,I haven't noticed any significant changes.,I feel tired and have difficulty concentrating.,I had severe cramps and had to be hospitalized...,I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...,2
2,Rohan_203,5/1/1985,Other,107,None;Hypertension;Diabetes,Yes,Poor,No,Med_A 50mg,Yes,...,Med_Q 25mg,No,I usually work outdoors for about 6 hours. The...,I carry a 1-liter bottle and ensure I refill i...,I haven't noticed any significant changes.,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...,2
3,Rohan_204,11/11/1964,Female,80,Diabetes;Hypertension;Heart Failure,Yes,Good,Yes,Med_A 50mg,Yes,...,Med_R 15mg,No,I usually work outdoors for about 6 hours. The...,I have a smart bottle that reminds me to drink...,I feel more thirsty when I'm on my medication ...,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I have a marked water bottle that shows hourly...,2
4,Priyanka_205,19/2/1966,Male,73,Kidney Disorders,No,Poor,Yes,Med_A 50mg,Yes,...,Med_R 15mg,Yes,"I'm mostly indoors, but I do feel the heat int...","I rely on my thirst. When I feel parched, I dr...",I haven't noticed any significant changes.,My skin feels dry and I get a headache.,"Once, I fainted during a particularly hot day....",I drink water every hour. I also have fruits t...,I use a mobile app that reminds me to drink wa...,1


In [None]:
def most_frequent_non_empty(series):
    """Return the most common non-missing value of ``series``.

    Missing values are ignored. When nothing remains (empty or all-missing
    input) the string 'N/A' is returned instead.
    """
    # value_counts() skips missing entries by default, so an empty result
    # means there was no usable value at all.
    counts = series.value_counts()
    return 'N/A' if counts.empty else counts.idxmax()

# Per-cluster profile: mean weight plus the most common answer to three of
# the free-text questions.
modal_answer_columns = [
    'Hydration Habits',
    'Feel When Dehydrated',
    'Severe Dehydration Experience',
]
cluster_aggregations = {'Current Weight (kg)': 'mean'}
for answer_column in modal_answer_columns:
    cluster_aggregations[answer_column] = most_frequent_non_empty

clusters = df.groupby('Predicted Body Water Level Combined')
cluster_analysis = clusters.agg(cluster_aggregations)

print(cluster_analysis)

                                     Current Weight (kg)  \
Predicted Body Water Level Combined                        
0                                              80.130435   
1                                              76.062500   
2                                              81.344828   

                                                                      Hydration Habits  \
Predicted Body Water Level Combined                                                      
0                                    I drink more water in the morning and evening....   
1                                    I keep a jug of water on my desk and sip throu...   
2                                    I drink water every hour. I also have fruits t...   

                                                                Feel When Dehydrated  \
Predicted Body Water Level Combined                                                    
0                                    I feel tired and have difficulty co

In [None]:
# Write the feature matrix X to 'temp.csv' in the working directory.
# NOTE(review): the original prompt asked to save `df`, but the call below
# writes X — confirm which frame was intended.

X.to_csv('temp.csv')


In [None]:

# Preprocess the text data
# NOTE(review): this re-defines preprocess_text from the earlier
# topic-extraction cell; from this cell on, this definition is the one in effect.
def preprocess_text(text):
    """Normalise one free-text answer into space-separated lemmatized tokens.

    Parameters
    ----------
    text : object
        A single cell value. Non-string values are converted with ``str``;
        missing values (None / NaN / pd.NA) yield an empty string instead of
        raising on ``.lower()``.

    Returns
    -------
    str
        Lowercased, letters-only, stop-word-free, lemmatized text.
    """
    # A missing answer is treated as an empty document.
    if pd.isna(text):
        return ''
    # Convert text to lowercase (str() tolerates non-string cells)
    text = str(text).lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join tokens into a string
    return ' '.join(lemmatized_tokens)

# Free-text survey columns examined in the cells below.
# (This cell only declares the list; it does not run any preprocessing.)
columns_of_interest = [
    'Typical Workday in Heatwave', 'Determine Water Intake',
    'Medication Effect on Hydration', 'Feel When Dehydrated',
    'Severe Dehydration Experience', 'Hydration Habits',
    'Tools for Monitoring Hydration',
]




In [None]:
df['Typical Workday in Heatwave']

0     I'm mostly indoors, but I do feel the heat int...
1     I usually work outdoors for about 6 hours. The...
2     I usually work outdoors for about 6 hours. The...
3     I usually work outdoors for about 6 hours. The...
4     I'm mostly indoors, but I do feel the heat int...
                            ...                        
95    I'm mostly indoors, but I do feel the heat int...
96    I'm mostly indoors, but I do feel the heat int...
97    I usually work outdoors for about 6 hours. The...
98    I'm mostly indoors, but I do feel the heat int...
99    I usually work outdoors for about 6 hours. The...
Name: Typical Workday in Heatwave, Length: 100, dtype: object

In [None]:
# Count the missing values in each free-text column of interest.

df[columns_of_interest].isna().sum()


Typical Workday in Heatwave       0
Determine Water Intake            0
Medication Effect on Hydration    0
Feel When Dehydrated              0
Severe Dehydration Experience     0
Hydration Habits                  0
Tools for Monitoring Hydration    0
dtype: int64

In [None]:
# # Concatenate text from all columns into a single text per row
# df['combined_text'] = df[columns_of_interest].apply(lambda x: ' '.join(str(x), axis=1)
# df['preprocessed_text'] = df['combined_text'].apply(preprocess_text)

# # Display an example of the preprocessed text
# df['preprocessed_text'].head()

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.metrics.pairwise import euclidean_distances

# # Vectorize the preprocessed text
# vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
# dtm = vectorizer.fit_transform(df['preprocessed_text_basic'])

# # Fit LDA model
# # Trying with an initial guess of 3 topics
# n_topics = 3
# lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
# lda.fit(dtm)

# # Function to display the top words for each topic
# def display_topics(model, feature_names, no_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print(f"Topic {topic_idx+1}:")
#         print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# no_top_words = 10
# display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)


In [None]:
# Load the 2024 heatwave forecast table. The variable keeps the spelling
# `forcast_df` because the following cells refer to it by that name.
forcast_df = pd.read_csv('/content/drive/MyDrive/Heat Waves/extended_heatwave_predictions_2024.csv')

In [None]:
forcast_df.head()

Unnamed: 0,Date,Temperature (°C),Humidity (%),Heat Index (°C),Alert Level
0,2024-01-01,37,95,39.594176,High
1,2024-01-02,28,58,29.53405,Low
2,2024-01-03,38,85,40.887715,High
3,2024-01-04,39,63,43.797167,High
4,2024-01-05,30,41,33.227851,Low


In [None]:
# Define the recommendation function
def recommend_work_time(alert_level, body_water_level, low_water_level=1):
    """Turn a heat alert level and a body-water level into a work recommendation.

    Parameters
    ----------
    alert_level : str
        One of "Low", "Moderate", "High" or "Extreme". Surrounding whitespace
        and letter case are ignored; any other value (including non-strings)
        yields the fallback advice.
    body_water_level : int
        Level assigned to the person (the notebook uses cluster ids).
    low_water_level : int, default 1
        The value of ``body_water_level`` that denotes a low level. It is a
        parameter because cluster ids are arbitrary labels; the default keeps
        the previous hard-coded behaviour.

    Returns
    -------
    str
        The recommendation text.
    """
    # Tolerate variants such as "high" or " High " coming from raw CSV data.
    if isinstance(alert_level, str):
        alert_level = alert_level.strip().title()

    # Severe alerts override everything else, whatever the hydration level.
    if alert_level in ("High", "Extreme"):
        return "Not suitable; Avoid strenuous activities"

    if alert_level in ("Low", "Moderate"):
        if body_water_level == low_water_level:  # Low body water level
            return "Avoid peak heat; Stay hydrated"
        # Medium or High body water level
        return "Suitable with caution; Stay hydrated"

    # Unknown or missing alert level.
    return "Check local advisories"

# Derive a recommendation for every forecast day from its alert level.
# The body water level is fixed at 0 for all rows as a demonstration value;
# recommend_work_time treats it as a medium-or-high level.
forcast_df['General Recommendation'] = forcast_df['Alert Level'].apply(
    recommend_work_time, args=(0,)
)

# Show the first rows including the new column.
print(forcast_df.head())

         Date  Temperature (°C)  Humidity (%)  Heat Index (°C) Alert Level  \
0  2024-01-01                37            95        39.594176        High   
1  2024-01-02                28            58        29.534050         Low   
2  2024-01-03                38            85        40.887715        High   
3  2024-01-04                39            63        43.797167        High   
4  2024-01-05                30            41        33.227851         Low   

                     General Recommendation  
0  Not suitable; Avoid strenuous activities  
1      Suitable with caution; Stay hydrated  
2  Not suitable; Avoid strenuous activities  
3  Not suitable; Avoid strenuous activities  
4      Suitable with caution; Stay hydrated  


In [None]:

# Recompute the recommendation column, this time passing body water level 1,
# which recommend_work_time treats as the low level. This overwrites the
# 'General Recommendation' values written by the previous cell.
forcast_df['General Recommendation'] = forcast_df['Alert Level'].apply(
    recommend_work_time, args=(1,)
)

# Show the first rows with the updated recommendations.
forcast_df.head()


Unnamed: 0,Date,Temperature (°C),Humidity (%),Heat Index (°C),Alert Level,General Recommendation
0,2024-01-01,37,95,39.594176,High,Not suitable; Avoid strenuous activities
1,2024-01-02,28,58,29.53405,Low,Avoid peak heat; Stay hydrated
2,2024-01-03,38,85,40.887715,High,Not suitable; Avoid strenuous activities
3,2024-01-04,39,63,43.797167,High,Not suitable; Avoid strenuous activities
4,2024-01-05,30,41,33.227851,Low,Avoid peak heat; Stay hydrated
