## !!! Before committing and pushing to the repo:
Please "Clear All" Jupyter outputs. This is important in order to keep the repo clean and to ensure privacy (so that data shown in output cells is not uploaded to the public repo).

In [None]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to the project's modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import time
from pathlib import Path
import pandas as pd
from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory; its 'src' folder is put first on sys.path so the project
# packages can be imported by the cells below.
cwd = os.getcwd()
root = Path(cwd).parent
src_path = root.joinpath('src')
sys.path.insert(0, os.fspath(src_path))

In [None]:
# Let pandas display up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

In [None]:
import nltk
# Fetch the 'punkt_tab' NLTK resource into the repo-local data/nltk_data folder.
# NOTE(review): NLTK only finds resources in directories on its search path;
# confirm this folder is registered wherever the resource is loaded.
nltk_data_dir = root/'data'/'nltk_data'
nltk.download('punkt_tab', download_dir=nltk_data_dir)


In [None]:
# Locate the questionnaire data and create a fresh output folder for this run.

data_folder = root.joinpath('data', 'questionnaire-data-july-1-2024')
assert data_folder.exists(), f'{data_folder=} is missing.'

# Each run gets its own sub-folder of <data_folder>/output, named after the
# current Unix time in whole seconds.
experiement_name = f'experiment_{int(time.time())}'
main_output_folder = data_folder.joinpath('output')
main_output_folder.mkdir(exist_ok=True)
output_folder = main_output_folder.joinpath(experiement_name)
output_folder.mkdir(exist_ok=True)
print(f'New output folder created:\t{str(output_folder)}')

In [None]:
from preprocessing.data_loader import basic_data_loader
# Load the raw questionnaire files. Later cells use `dfs` as a mapping of
# questionnaire key -> DataFrame and `text_col_names[key]` as the names of
# the columns to analyze in that DataFrame.
dfs, text_col_names = basic_data_loader(data_folder.joinpath('raw'))

In [None]:
from preprocessing.sentence_splitting import explode_sentences
from modeling.topic_modeling import fit_transform_model_topic
from modeling.LLM_queries import summarize_topic
import re

def topic_model_and_summarize_column(col_: pd.Series, save_to_folder: str|Path, verbose=0):
    """Topic-model one text column and summarize every resulting topic.

    The column is passed through ``explode_sentences``, the result is grouped
    into topics by ``fit_transform_model_topic``, and each topic's ``list``
    entry is handed to ``summarize_topic``.

    :param col_: Column to analyze. Its ``name`` is passed to the summarizer
        and, with runs of non-word characters replaced by a space and
        surrounding whitespace stripped, used as the hierarchical plot file name.
    :param save_to_folder: Folder in which the hierarchical plot file is placed
        (a ``str`` or a ``Path``).
    :param verbose: Verbosity level forwarded to the topic model and the summarizer.
    :return: The per-topic table returned by ``fit_transform_model_topic`` with
        two added columns: ``summary`` (the summarizer's response) and
        ``summary_content`` (the response's message content without newlines).
    """
    # str.strip() returns a new string, so it has to be chained; otherwise a
    # column name that starts or ends with punctuation yields a file name
    # padded with spaces.
    col_name = re.sub(r'\W+', ' ', col_.name).strip()
    # Wrap in Path so plain string folders work, as the annotation promises.
    hierarchical_plot_file = Path(save_to_folder)/col_name
    sentences_col = explode_sentences(col_)
    # Only the grouped topics are needed here; the fitted model is discarded.
    grouped_by_topics, _ = fit_transform_model_topic(
        sentences_col, verbose=verbose, hierarchical_plot_file=hierarchical_plot_file
    )
    grouped_by_topics['summary'] = grouped_by_topics['list'].apply(
        lambda topic_sentences: summarize_topic(
            col_.name, topic_sentences, context_length=2500, verbose=verbose
        )
    )
    grouped_by_topics['summary_content'] = grouped_by_topics['summary'].apply(
        lambda reply: reply['message']['content'].replace('\n', '')
    )
    return grouped_by_topics


In [None]:
import time
# Run topic modeling and summarization for every text column of every
# questionnaire. `responses` maps questionnaire key -> {column name ->
# per-topic summary table}.
responses = dict()
for qstr_key, df in dfs.items():
    print(qstr_key)
    qstr_responses = dict()
    for col_name in text_col_names[qstr_key]:
        # Missing answers are dropped before the column is analyzed.
        col = df[col_name].dropna()
        start = time.time()
        col_responses = topic_model_and_summarize_column(col, output_folder, verbose=7)
        # Report the wall-clock time spent on this column.
        print(f'\t{col_name}: done in {time.time() - start:.1f}s')
        qstr_responses[col_name] = col_responses
    responses[qstr_key] = qstr_responses

# Save/Load results to pickle or google spreadsheet

In [None]:
import pickle

### Save to pickle
# Serialize the whole nested `responses` dict into this run's output folder.
responses_pkl = output_folder/'responses.pkl'
with responses_pkl.open('wb') as pkl_out:
    pickle.dump(responses, pkl_out)

###  Load from pickle
# with open(output_folder/'responses.pkl', 'rb') as file:
#     loaded_dfs = pickle.load(file)


In [None]:
# Rebuild the single-line 'summary_content' column of every per-question
# table from the message content stored in its 'summary' column.
for per_question in responses.values():
    for topics_table in per_question.values():
        raw_replies = topics_table['summary']
        topics_table['summary_content'] = raw_replies.apply(
            lambda reply: reply['message']['content'].replace('\n','')
        )

In [None]:
# import gspread
# from oauth2client.service_account import ServiceAccountCredentials
# import pandas as pd
# from gspread_dataframe import set_with_dataframe

# def push_csv_to_google_spreadsheet2(credentials_json_path, df, spreadsheet_name, sheet_name):
#     """
#     Push a DataFrame to a worksheet of an existing Google Spreadsheet.

#     :param credentials_json_path: Path to the JSON credentials file for the Google Service Account.
#     :param df: DataFrame whose contents are written to the worksheet.
#     :param spreadsheet_name: Name of the Google Spreadsheet.
#     :param sheet_name: Written to the first row of the worksheet; a sanitized, 30-character version is used as the worksheet title.
#     """
#     # Define the scope
#     scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]

#     # Add your service account credentials
#     creds = ServiceAccountCredentials.from_json_keyfile_name(credentials_json_path, scope)

#     # Authorize the client
#     client = gspread.authorize(creds)

#     # Open the spreadsheet
#     spreadsheet = client.open(spreadsheet_name)

#     # Create a new sheet titled with the sanitized sheet name, truncated to 30 characters
#     clean_sheet_name = re.sub(r'\W+', ' ', sheet_name)
#     short_sheet_name = clean_sheet_name[:30]
#     try:
#         sheet = spreadsheet.add_worksheet(title=short_sheet_name, rows=df.shape[0], cols=df.shape[1])
#     except gspread.exceptions.APIError:
#         # If the sheet already exists, get the existing sheet
#         sheet = spreadsheet.worksheet(short_sheet_name)


#     # Convert DataFrame to list of lists
#     # data = [df.columns.values.tolist()] + df.values.tolist()

#     # Clear the existing sheet content (if any)
#     sheet.clear()

#     sheet.append_row([sheet_name])
#     set_with_dataframe(sheet, df, row=3)

#     print("Data has been successfully pushed to the spreadsheet.")



In [None]:

# spreadsheet_name = experiement_name
# credentials_json_path = '/Users/sl/dev/madrase/madrase-questionnaire-july-22-51ed22398a1d.json'

# for qstr_key, v in responses.items():
#     for question, summary in v.items():
#         cols_to_csv = ['num_samples', 'num_unique', 'Count', 'Name', 'Representative_Docs', 'keywords' , 'summary_content']
#         # summary[cols_to_csv].to_csv(output_folder/f'{question}.csv')
#         worksheet_name = '.'.join([qstr_key,question])
#         summary[['Representative_Docs', 'keywords']] = summary[['Representative_Docs', 'keywords']].astype(str)
#         push_csv_to_google_spreadsheet2(credentials_json_path, 
#                                         summary[cols_to_csv], 
#                                         spreadsheet_name, 
#                                         worksheet_name)
