In [None]:
# SENTIMENT ANALYSIS -> QUESTION ANSWERING
# Classify a review, then query a QA model for the reasons behind that sentiment.

from transformers import pipeline

# Both pipelines are created without an explicit model name, so the library
# picks its default checkpoint for each task.
sa_model = pipeline("sentiment-analysis")
qa_model = pipeline("question-answering")

# The review text; it is both the classifier input and the QA context.
review = "I had a great experience with this product. The customer service was excellent and the product exceeded my expectations."

# Label of the first (and only) classification result.
sentiment = sa_model(review)[0]['label']

# Key phrases of interest (hard-coded; not referenced again within this cell).
key_phrases = ['customer service', 'product', 'expectations']

# Follow-up questions keyed by sentiment label.
questions_by_sentiment = {
    'POSITIVE': ['What made the customer service excellent?',
                 'What features of the product exceeded your expectations?'],
    'NEGATIVE': ['What issues did you encounter with the customer service?',
                 'What aspects of the product were disappointing?'],
}
# Any label other than POSITIVE / NEGATIVE falls back to these generic questions.
fallback_questions = ['What can you say about the customer service?',
                      'What can you say about the product?']
questions = questions_by_sentiment.get(sentiment, fallback_questions)

# Ask each question against the review and keep only the answer text.
answers = [qa_model(question=q, context=review)['answer'] for q in questions]

# Report the detected sentiment and the extracted answers.
print('Sentiment:', sentiment)
print('Answers:', answers)

In [None]:
# TOPIC MODELING

!pip install sentence-transformers  # Install the required library

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation

# Load BioBERT model
biobert = SentenceTransformer('gsarti/biobert-nli')

# Load the text data
df = pd.read_csv('path/to/data.csv')

# Convert the text data to sentence embeddings
sentences = df['text'].tolist()
embeddings = biobert.encode(sentences)

# Perform topic modeling using Latent Dirichlet Allocation
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(embeddings)

# Print the topics and the top words in each topic
for idx, topic in enumerate(lda.components_):
    print('Topic %d:' % (idx))
    print(' '.join([biobert.decode([feature]) for feature in topic.argsort()[:-10 - 1:-1]]))
    print()

In [85]:
# KEYWORD EXTRACTION
#
# Picks the `top_n` candidate words of `doc` whose embeddings are most similar
# (cosine similarity) to the embedding of the whole document.

doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input 
      """

from sklearn.feature_extraction.text import CountVectorizer

n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
# get_feature_names() was removed in scikit-learn 1.2. Prefer the replacement
# and convert to a plain list of str; fall back only on older versions.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

from sentence_transformers import SentenceTransformer

# Embed the document and every candidate with the same model so that the
# vectors are comparable.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

from sklearn.metrics.pairwise import cosine_similarity

top_n = 5
# NOTE: despite the name, `distances` holds cosine *similarities*
# (higher = closer); the result has one row (the document) and one column per
# candidate.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
print(type(distances))
# argsort is ascending, so the last `top_n` indices are the most similar
# candidates (listed from least to most similar).
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(len(distances[0]))



<class 'numpy.ndarray'>
7


In [None]:
# Derive NMF-based word embeddings from a TF-IDF representation of `documents`.

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Assume that documents is a list of text documents
# NOTE(review): `documents` is not defined in this cell; it must be provided
# by an earlier cell before this one runs.
vectorizer = TfidfVectorizer(stop_words='english')
# fit_transform returns a documents x terms matrix (one row per document).
tfidf = vectorizer.fit_transform(documents)

# Initialize the NMF object with 100 components.
# NOTE: init='nndsvd' requires n_components <= min(n_documents, n_terms);
# lower n_components for small corpora.
nmf = NMF(n_components=100, init='nndsvd', max_iter=200)

# Factorize the document-term matrix: tfidf ~= W @ nmf.components_
#   W               : documents x topics (document-topic matrix)
#   nmf.components_ : topics x terms     (topic-term matrix)
W = nmf.fit_transform(tfidf)

# The word embeddings are the columns of the topic-term matrix, i.e. one row
# per vocabulary term after transposing (terms x topics). Row i corresponds to
# the i-th feature name of `vectorizer`.
word_embeddings = nmf.components_.T