In [1]:
import json
from gensim import corpora, models
import re
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import RegexpTokenizer
import spacy


pathway = "/Users/jeremyfeagan/Library/Mobile Documents/com~apple~CloudDocs/MyGitRepo/ChatGPT Project/conversations.json"

# Load the JSON data from the file
with open(pathway, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [19]:
# Custom preprocessing function
def preprocess_text(text):
    # Remove hexadecimal codes and non-ASCII characters
    text = re.sub(r'\b[0-9a-fA-F]{4,}\b', '', text)
    # Keep only words with alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    tokens = tokenizer.tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
    return filtered_tokens

# Extend STOPWORDS with common programming and mathematical terms if necessary
extended_stopwords = STOPWORDS.union(set(['=', '+', '-', '*', '/', '(', ')', '#', '->', 'int', 'float', 'print', 'def']))

# Initialize a lemmatizer
lemmatizer = WordNetLemmatizer()

# Extract conversation texts from the 'mapping' key
text_data = []
for conversation in data:
    for key, value in conversation['mapping'].items():
        message = value.get('message')
        if message:
            content = message.get('content')
            if content:
                parts = content.get('parts')
                if parts and isinstance(parts, list):  # Ensure 'parts' is a list
                    parts_texts = []
                    for part in parts:
                        if isinstance(part, dict) and 'text' in part:
                            parts_texts.append(part.get('text', ''))
                        elif isinstance(part, str):
                            parts_texts.append(part)
                    combined_text = ' '.join(parts_texts).strip()
                    if combined_text:  # Ensure non-empty string
                        text_data.append(combined_text)

# Apply custom preprocessing to the text data
processed_texts = [preprocess_text(text) for text in text_data]

# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary(processed_texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Limit vocabulary based on word frequency
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Apply LDA Model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20, random_state=100)

# Display identified topics and their top words
topics = lda_model.print_topics(num_words=10)
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic}")

# Manually label topics based on your domain knowledge
topic_labels = {
    0: "Programming",
    1: "Business and Finance",
    2: "Data Analysis",
    3: "Healthcare",
    4: "Literature",
    5: "Data Management",
    6: "Arts and Literature",
    7: "Health and Analysis",
    8: "Mathematics",
    9: "Content and Visualization"
}

# Print the labeled topics
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic_labels[topic_num]} - {topic}")

Topic 0: 0.022*"item" + 0.016*"question" + 0.014*"value" + 0.013*"correct" + 0.012*"answer" + 0.011*"equation" + 0.009*"frac" + 0.009*"sentence" + 0.008*"find" + 0.008*"problem"
Topic 1: 0.010*"provide" + 0.009*"time" + 0.009*"would" + 0.009*"please" + 0.008*"like" + 0.007*"questions" + 0.007*"make" + 0.006*"help" + 0.006*"work" + 0.006*"information"
Topic 2: 0.012*"company" + 0.009*"business" + 0.009*"agreement" + 0.007*"financial" + 0.006*"may" + 0.005*"tax" + 0.005*"assets" + 0.005*"insurance" + 0.004*"information" + 0.004*"llc"
Topic 3: 0.017*"file" + 0.012*"data" + 0.010*"use" + 0.009*"code" + 0.008*"landing" + 0.007*"python" + 0.007*"using" + 0.007*"database" + 0.006*"files" + 0.006*"need"
Topic 4: 0.021*"healthcare" + 0.017*"patients" + 0.016*"patient" + 0.013*"reimbursement" + 0.013*"access" + 0.011*"care" + 0.011*"providers" + 0.009*"specialty" + 0.009*"pharmacy" + 0.007*"medications"
Topic 5: 0.025*"data" + 0.021*"column" + 0.013*"code" + 0.012*"import" + 0.010*"dataframe" + 

In [20]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the visualization data
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Visualize
pyLDAvis.display(vis_data)


In [5]:
def assign_dominant_topic(lda_model, corpus):
    dominant_topics = []
    for doc in corpus:
        doc_topics = lda_model.get_document_topics(doc)
        doc_topics = sorted(doc_topics, key=lambda x: (x[1]), reverse=True)
        # Get the topic number with highest percentage
        dominant_topic_num, prop_topic = doc_topics[0]
        dominant_topics.append(dominant_topic_num)
    return dominant_topics

dominant_topics = assign_dominant_topic(lda_model, corpus)

# Count the occurrences of each dominant topic
from collections import Counter
topic_counts = Counter(dominant_topics)

# Convert counts to percentages
total_conversations = len(dominant_topics)
topic_percentages = {topic: (count / total_conversations * 100) for topic, count in topic_counts.items()}

print(topic_percentages)


{2: 5.926007975188303, 1: 17.634027470093045, 3: 8.894550287992912, 5: 13.236597252990695, 8: 5.2946389011962784, 7: 12.627381479840496, 0: 7.365972529906956, 6: 13.524590163934427, 9: 7.210899424014178, 4: 8.285334514842711}


In [21]:
from textblob import TextBlob

# Sentiment analysis on extracted texts
sentiments = []
for text in text_data:
    blob = TextBlob(text)
    sentiments.append(blob.sentiment.polarity)  # Polarity ranges from -1 (negative) to 1 (positive)

# You can then average the sentiments or analyze them per conversation/text segment.
sentiments

[0.0,
 0.1388888888888889,
 -0.15555555555555559,
 0.12781385281385285,
 0.16666666666666666,
 0.1307973350526542,
 0.0,
 0.23427128427128424,
 0.13333333333333333,
 0.1040784832451499,
 0.13092581660763478,
 0.180257116620753,
 0.1553820127683764,
 0.16312925170068027,
 0.18521858538048297,
 0.12543924696702474,
 0.11113654105457384,
 0.1773809523809524,
 0.0,
 0.2861344537815126,
 0.021428571428571432,
 -0.003109815354713312,
 0.34215686274509816,
 0.443452380952381,
 0.16573377889167362,
 0.1489285714285714,
 0.16155045351473918,
 0.075,
 0.2333333333333333,
 0.1712023460410557,
 0.0,
 0.19926686217008793,
 0.0,
 0.2222159090909091,
 0.26477272727272727,
 0.3408163265306122,
 0.16793650793650794,
 0.20625000000000004,
 0.14285714285714285,
 0.15,
 0.3833333333333333,
 0.4091666666666666,
 0.3022619047619048,
 0.24696969696969695,
 0.5285714285714286,
 0.2818181818181818,
 0.08590067340067341,
 0.32613636363636367,
 0.3056122448979592,
 0.18541666666666667,
 0.26666666666666666,
 0.0

In [22]:
# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

In [23]:
# Assuming text_data is already populated with combined texts from your code
entities_in_conversations = []

for text in text_data:
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    entities_in_conversations.append(entities)

# At this point, entities_in_conversations contains the entities found in each piece of conversation text

In [24]:
entities_in_conversations

[[('Jackson Hewitt Tax Service', 'ORG')],
 [('Jackson Hewitt Tax Service', 'ORG'),
  ('1', 'CARDINAL'),
  ('2', 'CARDINAL'),
  ('3', 'CARDINAL'),
  ('Upgrades', 'PERSON'),
  ('4', 'CARDINAL'),
  ('5', 'CARDINAL'),
  ('Digital Advertising', 'ORG'),
  ('6', 'CARDINAL'),
  ('7', 'CARDINAL'),
  ('State', 'ORG'),
  ('8', 'CARDINAL'),
  ('9', 'CARDINAL'),
  ('10', 'CARDINAL'),
  ('11', 'CARDINAL'),
  ('Jackson Hewitt Tax Service', 'ORG')],
 [('one', 'CARDINAL')],
 [('one', 'CARDINAL'),
  ('Microsoft', 'ORG'),
  ('Excel', 'PRODUCT'),
  ('Google Sheets', 'ORG'),
  ('###', 'MONEY'),
  ('Google Sheets:*', 'ORG'),
  ('1', 'CARDINAL'),
  ('2', 'CARDINAL'),
  ('Data Validation:*', 'ORG'),
  ('3', 'CARDINAL'),
  ('commas', 'PERSON'),
  ('Payroll Expenses', 'ORG'),
  ('4', 'CARDINAL'),
  ('Invalid Data:*', 'ORG'),
  ('5', 'CARDINAL'),
  ('6', 'CARDINAL'),
  ('Save', 'PRODUCT'),
  ('Microsoft', 'ORG'),
  ('Excel', 'PRODUCT'),
  ('1', 'CARDINAL'),
  ('2', 'CARDINAL'),
  ('Data Validation:*', 'ORG'),
  

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming `text_data` is a list of text segments from your conversations
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text_data)

# Get feature names
feature_names = vectorizer.get_feature_names_out()
