# NLP on Covid Narratives using QRMine

## Data
* The dataset used is a collection of 115 interview transcripts available at https://covidstories.omeka.net/


In [None]:
# First install qrmine
# Uncomment the line below to install qrmine, if not already installed
#!pip install qrmine

In [None]:
# Now let us read the data
import os

from qrmine import ReadData

# Folder holding the interview transcripts. Set the COVID_DATA_DIR environment
# variable to point at your own copy; when it is unset the original author's
# local path is used, so existing runs behave exactly as before.
data_dir = os.environ.get("COVID_DATA_DIR", "/home/beapen/Documents/covid/")

# Read the data
corpus = ReadData()
# Comma-separated words handed to read_file as its second argument
# (presumably excluded from the analysis -- confirm against the qrmine docs).
words_to_ignore = "interviewee,interviewer,like,go,um,covid,COVID,think,said,know,feel,yeah,uh,just,start"
corpus.read_file(data_dir, words_to_ignore)

In [None]:
# Let us create the content processor, the NLP processor and the cluster processor

from qrmine import Content
from qrmine import Qrmine
from qrmine import ClusterDocs

# Content() is created without arguments here; the transcripts and titles are
# attached to `cluster` in a later cell, before any topic is generated.
content = Content()
nlp = Qrmine()
cluster = ClusterDocs(content)

In [None]:
# Now let us create a coding dictionary with 10 categories
categories = 10  # number of categories requested from print_dict
# Wrap the corpus text and titles in a Content object, which is what
# print_dict receives as its first argument.
all_interviews = Content(corpus.content, corpus.titles)
nlp.print_dict(all_interviews, categories)

In [None]:
# Now let us generate topics with LDA
# Give the cluster processor the documents and titles read into `corpus`
# before asking it for topics.
cluster.documents = corpus.documents
cluster.titles = corpus.titles
# NOTE: `topics` is bound again in a later cell from build_lda_model().
topics = cluster.print_topics()

In [None]:
# Let us see the documents belonging to each topic
# (uses the `cluster` object that was given documents and titles above)
cluster.print_clusters()

In [None]:
# Build the dominant-topic table for the documents.
df_dominant_topic = cluster.format_topics_sentences(visualize=True)

# Readable column labels, applied in the column order of the table returned
# above (assumes it has exactly six columns -- confirm against qrmine).
topic_columns = [
    "Document_No",
    "Title",
    "Dominant_Topic",
    "Topic_Perc_Contrib",
    "Keywords",
    "Text",
]
df_dominant_topic.columns = topic_columns

# Preview the first five entries.
topic_preview = df_dominant_topic.head(5)
print(topic_preview)

In [None]:
# And the most representative document for each topic
df_sorted = cluster.most_representative_docs()
# Preview only the first five entries (assumes a pandas DataFrame is
# returned -- confirm against qrmine).
print(df_sorted.head(5))

In [None]:
from qrmine import QRVisualize
# Now let us visualize the frequency distribution of words
# The visualizer is built from the dominant-topic table created in an earlier
# cell; `v` is reused by the plotting cells that follow.
v = QRVisualize(df_dominant_topic)
v.plot_distribution_by_topic()

In [None]:
# Draw a word cloud from whatever build_lda_model() returns.
# NOTE: this rebinds `topics`, replacing the value stored from print_topics()
# in the earlier cell.
topics = cluster.build_lda_model()
v.plot_wordcloud(topics=topics)

In [None]:
# Pass the documents and their titles through the cluster processor's
# vectorizer, then plot the result as a cluster chart.
data = cluster.vectorizer(docs=corpus.documents, titles=corpus.titles)
v.cluster_chart(data)

## Now let us identify the sentiment of a few interview transcripts using QRMine. We can also filter the results based on the sentiment score.

In [None]:
from qrmine import Sentiment

# One sentiment analyser, reused for every transcript scored below.
s = Sentiment()

# Only the first few transcripts are scored.
sample_size = 5
sample_titles = corpus.titles[:sample_size]
sample_docs = corpus.documents[:sample_size]

for title, doc in zip(sample_titles, sample_docs):
    # Print the title, then run the sentiment analysis on the whole transcript
    # (sentence=False) with no tag filter and verbose reporting switched on.
    print(f"Title: {title}")
    s.get_sentiment(doc, tags=[], sentence=False, verbose=True)
