In [1]:
import json
from gensim import corpora, models
import re
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import RegexpTokenizer


pathway = "/Users/jeremyfeagan/Library/Mobile Documents/com~apple~CloudDocs/MyGitRepo/ChatGPT Project/conversations.json"

# Load the JSON data from the file
with open(pathway, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [2]:
# Ensure necessary NLTK data is downloaded
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremyfeagan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Custom preprocessing function
def preprocess_text(text):
    # Remove hexadecimal codes and non-ASCII characters
    text = re.sub(r'\b[0-9a-fA-F]{4,}\b', '', text)
    # Keep only words with alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    tokens = tokenizer.tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
    return filtered_tokens

In [5]:
# Extend STOPWORDS with common programming and mathematical terms if necessary
extended_stopwords = STOPWORDS.union(set(['=', '+', '-', '*', '/', '(', ')', '#', '->', 'int', 'float', 'print', 'def']))

# Initialize a lemmatizer
lemmatizer = WordNetLemmatizer()
# Extract conversation texts from the 'mapping' key
text_data = []
for conversation in data:
    for key, value in conversation['mapping'].items():
        message = value.get('message')
        if message:
            content = message.get('content')
            if content:
                parts = content.get('parts')
                if parts and isinstance(parts, list):  # Ensure 'parts' is a list
                    parts_texts = []
                    for part in parts:
                        if isinstance(part, dict) and 'text' in part:
                            parts_texts.append(part.get('text', ''))
                        elif isinstance(part, str):
                            parts_texts.append(part)
                    combined_text = ' '.join(parts_texts).strip()
                    if combined_text:  # Ensure non-empty string
                        text_data.append(combined_text)

In [6]:
# Assuming 'text_data' is a list of your conversation texts
processed_texts = [preprocess_text(text) for text in text_data]

# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary(processed_texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Limit vocabulary based on word frequency
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Apply LDA Model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20, random_state=100)

# Display identified topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.022*"item" + 0.016*"question" + 0.014*"value" + 0.013*"correct" + 0.012*"answer"')
(1, '0.010*"provide" + 0.009*"time" + 0.009*"would" + 0.009*"please" + 0.008*"like"')
(2, '0.012*"company" + 0.009*"business" + 0.009*"agreement" + 0.007*"financial" + 0.006*"may"')
(3, '0.017*"file" + 0.012*"data" + 0.010*"use" + 0.009*"code" + 0.008*"landing"')
(4, '0.021*"healthcare" + 0.017*"patients" + 0.016*"patient" + 0.013*"reimbursement" + 0.013*"access"')
(5, '0.025*"data" + 0.021*"column" + 0.013*"code" + 0.012*"import" + 0.010*"dataframe"')
(6, '0.008*"american" + 0.007*"literature" + 0.005*"world" + 0.005*"african" + 0.004*"literary"')
(7, '0.024*"health" + 0.013*"data" + 0.008*"may" + 0.007*"analysis" + 0.006*"valuebased"')
(8, '0.015*"number" + 0.013*"year" + 0.013*"total" + 0.011*"rate" + 0.010*"calculate"')
(9, '0.014*"content" + 0.014*"visual" + 0.011*"use" + 0.010*"text" + 0.010*"information"')


In [7]:
# Display identified topics and their top words
topics = lda_model.print_topics(num_words=10)
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic}")

# Manually label topics based on your domain knowledge
topic_labels = {
    0: "Programming",
    1: "Business and Finance",
    2: "Data Analysis",
    3: "Healthcare",
    4: "Literature",
    5: "Data Management",
    6: "Arts and Literature",
    7: "Health and Analysis",
    8: "Mathematics",
    9: "Content and Visualization"
}

# Print the labeled topics
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic_labels[topic_num]} - {topic}")

# Additional refinement can be done based on your manual inspection and domain knowledge.

Topic 0: 0.022*"item" + 0.016*"question" + 0.014*"value" + 0.013*"correct" + 0.012*"answer" + 0.011*"equation" + 0.009*"frac" + 0.009*"sentence" + 0.008*"find" + 0.008*"problem"
Topic 1: 0.010*"provide" + 0.009*"time" + 0.009*"would" + 0.009*"please" + 0.008*"like" + 0.007*"questions" + 0.007*"make" + 0.006*"help" + 0.006*"work" + 0.006*"information"
Topic 2: 0.012*"company" + 0.009*"business" + 0.009*"agreement" + 0.007*"financial" + 0.006*"may" + 0.005*"tax" + 0.005*"assets" + 0.005*"insurance" + 0.004*"information" + 0.004*"llc"
Topic 3: 0.017*"file" + 0.012*"data" + 0.010*"use" + 0.009*"code" + 0.008*"landing" + 0.007*"python" + 0.007*"using" + 0.007*"database" + 0.006*"files" + 0.006*"need"
Topic 4: 0.021*"healthcare" + 0.017*"patients" + 0.016*"patient" + 0.013*"reimbursement" + 0.013*"access" + 0.011*"care" + 0.011*"providers" + 0.009*"specialty" + 0.009*"pharmacy" + 0.007*"medications"
Topic 5: 0.025*"data" + 0.021*"column" + 0.013*"code" + 0.012*"import" + 0.010*"dataframe" + 

In [9]:
from gensim import corpora, models

# Prepare the text data (replace 'text_data' with your dataset)
processed_texts = [text.split() for text in text_data]

# Create a dictionary and document-term matrix
dictionary = corpora.Dictionary(processed_texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Filter out infrequent and very common words
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_texts]

# Train an LDA topic model
lda_model = models.LdaModel(doc_term_matrix, num_topics=10, id2word=dictionary, passes=15)

# Generate topic summaries
topic_summaries = {}
for topic_id in range(10):
    topic_words = lda_model.show_topic(topic_id, topn=10)  # Get the top 10 words for each topic
    topic_keywords = [word for word, _ in topic_words]
    topic_summary = ', '.join(topic_keywords)
    topic_summaries[f'Topic {topic_id}'] = topic_summary

topic_summaries


{'Topic 0': '-, +, =, \\(, \\item, we, 2, can, value, \\)',
 'Topic 1': 'The, with, as, by, American, on, African, its, that, into',
 'Topic 2': 'that, it, as, was, not, be, The, with, this, are',
 'Topic 3': 'with, -, healthcare, patient, •, reimbursement, access, that, patients, as',
 'Topic 4': '=, #, import, if, ==, 0, 1, str,, as, return',
 'Topic 5': 'that, can, health, on, The, with, as, or, are, be',
 'Topic 6': 'data, you, that, can, with, The, your, or, as, ```',
 'Topic 7': 'I, my, you, your, me, am, have, this, with, that',
 'Topic 8': '|, NaN, The, on, its, with, by, financial, will, rate',
 'Topic 9': 'your, or, -, you, can, on, with, be, that, are'}