In [None]:
# SENTIMENT ANALYSIS TO QA PIPELINE
# to find out the reason for a sentiment

from transformers import pipeline

# Load sentiment analysis and question answering models
sa_model = pipeline("sentiment-analysis")
qa_model = pipeline("question-answering")

# Review to analyze
review = "I had a great experience with this product. The customer service was excellent and the product exceeded my expectations."

# Perform sentiment analysis on the review
sentiment = sa_model(review)[0]['label']

# Extract key phrases from the review
key_phrases = ['customer service', 'product', 'expectations']

# Generate questions based on the sentiment and key phrases
if sentiment == 'POSITIVE':
    questions = ['What made the customer service excellent?',
                 'What features of the product exceeded your expectations?']
elif sentiment == 'NEGATIVE':
    questions = ['What issues did you encounter with the customer service?',
                 'What aspects of the product were disappointing?']
else:
    questions = ['What can you say about the customer service?',
                 'What can you say about the product?']

# Use QA to answer the questions
answers = []
for question in questions:
    answer = qa_model(question=question, context=review)
    answers.append(answer['answer'])

# Print the sentiment and answers
print('Sentiment:', sentiment)
print('Answers:', answers)

In [None]:
# TOPIC MODELING

!pip install sentence-transformers  # Install the required library

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation

# Load BioBERT model
biobert = SentenceTransformer('gsarti/biobert-nli')

# Load the text data
df = pd.read_csv('path/to/data.csv')

# Convert the text data to sentence embeddings
sentences = df['text'].tolist()
embeddings = biobert.encode(sentences)

# Perform topic modeling using Latent Dirichlet Allocation
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(embeddings)

# Print the topics and the top words in each topic
for idx, topic in enumerate(lda.components_):
    print('Topic %d:' % (idx))
    print(' '.join([biobert.decode([feature]) for feature in topic.argsort()[:-10 - 1:-1]]))
    print()

In [85]:
# KEYWORD EXTRACTION

doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input 
      """

from sklearn.feature_extraction.text import CountVectorizer

n_gram_range = (1, 1)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names()

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

from sklearn.metrics.pairwise import cosine_similarity

top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
print(type(distances))
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(len(distances[0]))



<class 'numpy.ndarray'>
7


In [14]:
distances.ndim

2

In [3]:
import pandas as pd
comments=pd.read_excel('comments.xlsx')

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

n_gram_range = (1, 1)
stop_words = "english"

sentences=comments['Comments'].values.tolist()
# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit(sentences)
candidates = count.get_feature_names()



In [86]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)
doc_embedding
comments['Doc_Embed']=comments.apply(lambda x: model.encode([x['Comments']]),axis=1)
comments['Distance']=comments.apply(lambda x:cosine_similarity(x['Doc_Embed'],candidate_embeddings),axis=1)
top_n = 5
# distances = cosine_similarity(doc_embedding, candidate_embeddings)
# keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [87]:
comments['Keywords']=comments['Distance'].apply(lambda x: print(x))

[[-0.03689715]
 [ 0.00377444]
 [ 0.21624918]
 [-0.0212972 ]
 [ 0.03549244]
 [ 0.12607025]
 [ 0.01167195]]
[[0.3550511 ]
 [0.30662364]
 [0.4605971 ]
 [0.36754215]
 [0.3787705 ]
 [0.5830076 ]
 [0.3822926 ]]
[[0.06624319]
 [0.08109218]
 [0.26776776]
 [0.07373106]
 [0.14842369]
 [0.26374188]
 [0.11118844]]
[[0.38388085]
 [0.3735308 ]
 [0.51662016]
 [0.37933713]
 [0.32125825]
 [0.6105641 ]
 [0.48755568]]
[[0.07437953]
 [0.10615816]
 [0.2765436 ]
 [0.08318146]
 [0.13505548]
 [0.25954688]
 [0.10121397]]
[[0.37491083]
 [0.46937156]
 [0.5838687 ]
 [0.3889848 ]
 [0.40230277]
 [0.4974962 ]
 [0.50688106]]
[[0.01981166]
 [0.02514253]
 [0.2078799 ]
 [0.04686979]
 [0.11466895]
 [0.22064799]
 [0.07252524]]
[[0.51451814]
 [0.52096224]
 [0.5859443 ]
 [0.5054382 ]
 [0.47318393]
 [0.6962565 ]
 [0.57322437]]
[[0.11411806]
 [0.12221289]
 [0.30409703]
 [0.13269892]
 [0.18024251]
 [0.32534862]
 [0.13431072]]
[[0.32242846]
 [0.3150214 ]
 [0.44536775]
 [0.29361004]
 [0.29850075]
 [0.46683782]
 [0.35305566]]
[[0

In [83]:
comments['Keywords']

0     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
1     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
2     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
3     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
4     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
5     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
6     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
7     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
8     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
9     [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
10    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
11    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
12    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
13    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
14    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
15    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
16    [[0], [0], [0], [0], [0], [0], [0], [0], [0], ...
17    [[0], [0], [0], [0], [0], [0], [0], [0], [

In [63]:
comments['a']=comments['Distance'].apply(lambda x: print(type(x)))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
from sklearn.decomposition import NMF

# Assume that documents is a list of text documents
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(documents)

# Initialize the NMF object with 100 components
nmf = NMF(n_components=100, init='nndsvd', max_iter=200)

# Factorize the term-document matrix into a word-topic matrix and a topic-document matrix
W = nmf.fit_transform(tfidf)

# The resulting word embeddings are the rows of the word-topic matrix
word_embeddings = W.T