In [2]:
import pandas as pd
import numpy as np

# Location of the cleaned modelling dataset, relative to this notebook.
clean_data = "../data/clean_modeling_data.csv"

# Read the whole table, then keep only the free-text "comment" column,
# which is what the later cells feed to the classifier.
data = pd.read_csv(clean_data)
docs = data.loc[:, "comment"]

In [1]:
from transformers import pipeline

# Build the classifier from the facebook/bart-large-mnli checkpoint. The task is
# named explicitly instead of being left for the library to infer from the model
# id, so a reader can see which kind of pipeline `pipe` is.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Smoke test on a single sentence. The result is a dict with the keys
# 'sequence', 'labels' and 'scores'; the labels come back ordered from the
# highest score to the lowest (see the cell output).
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use mps:0


{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5227585434913635,
  0.4581397473812103,
  0.014264755882322788,
  0.0026849983260035515,
  0.0021520687732845545]}

In [4]:
# Candidate roles the zero-shot classifier chooses between for each comment.
labels = ["Advisor", "Seeker"]

# Peek at the first comment. `.iloc` makes the positional lookup explicit
# instead of relying on the Series carrying integer labels that start at 0.
docs.iloc[0]

"So it appears Tesla found a bug in their app after watching this video that accidentally counted all production and usage totals as DOUBLE their actual numbers. Doesn't affect my payback timeline calculations, but it does make more sense that I was seeing ~4000kWh of production in a month, not 8000 🤓"

In [13]:
# Score one comment (the fourth row, looked up by position) against the two
# roles; the labels are passed by keyword, matching the demo call above.
pipe(docs.iloc[3], candidate_labels=labels)

{'sequence': 'Yes, you cannot produce 350 kWh of energy in a day with a ~40kW panel array. At your latitude, four times kWp in the best case scenario makes more sense.',
 'labels': ['Advisor', 'Seeker'],
 'scores': [0.5359474420547485, 0.4640524983406067]}

In [16]:
import random

def classify_comment(comment):
    """Return the top-ranked role label for a single comment.

    Runs the notebook-level ``pipe`` on ``comment`` with the notebook-level
    ``labels`` as candidates and returns the first entry of the ``'labels'``
    list in the result, i.e. the most likely role.
    """
    ranked_labels = pipe(comment, labels)['labels']
    return ranked_labels[0]

# Vectorize the function. Declaring the output type up front means NumPy does
# not run an extra trial classification on the first comment just to infer it,
# and an empty input array no longer raises.
vectorized_classifier = np.vectorize(classify_comment, otypes=[str])

# Classify a reproducible random sample of the comments.
SAMPLE_SIZE = 1000  # upper bound on how many comments are classified
SAMPLE_SEED = 42    # fixed seed so re-running the cell draws the same sample

print("Classifying comments...")

# Missing comments (NaN) cannot be passed to the text pipeline, so drop them
# before sampling. The sample size is capped at the number of available
# comments because random.sample raises ValueError when asked for more.
available_docs = docs.dropna().tolist()
sampled_docs = random.Random(SAMPLE_SEED).sample(
    available_docs, min(SAMPLE_SIZE, len(available_docs))
)
predicted_roles = vectorized_classifier(np.array(sampled_docs))

# Count results (converted to plain Python str/int for clean lookups and printing)
unique, counts = np.unique(predicted_roles, return_counts=True)
role_stats = dict(zip(unique.tolist(), counts.tolist()))

# Compute percentages, guarding against an empty sample
total = int(counts.sum())
advisor_pct = (role_stats.get("Advisor", 0) / total) * 100 if total else 0.0
seeker_pct = (role_stats.get("Seeker", 0) / total) * 100 if total else 0.0

# Display
print(f"\nTotal classified comments: {total}")
print(f"Advisor: {role_stats.get('Advisor', 0)} ({advisor_pct:.1f}%)")
print(f"Seeker: {role_stats.get('Seeker', 0)} ({seeker_pct:.1f}%)")

Classifying comments...

Total classified comments: 1000
Advisor: 680 (68.0%)
Seeker: 320 (32.0%)
