In [1]:
import spacy
from spacy.lang.en import English

# Only tokenization and lexical attributes (`orth_`, `lower_`, `like_url`) are
# used by `tokenize`, so a blank English pipeline is sufficient.
# The earlier `spacy.load('en')` call was dropped: its return value was
# discarded, it loaded a full model for nothing, and the 'en' shortcut name is
# no longer resolvable in spaCy v3.
parser = English()
def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is replaced by its lower-cased form.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalise(tok):
        # URL check takes precedence over the '@' check, as in the original chain.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
import nltk
# Make sure the WordNet corpus data is available locally before it is used
# (the recorded output shows it is left alone when already up to date).
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    When ``wn.morphy`` yields ``None`` the input word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Make sure the stop-word corpus is present, then keep the English list as a
# set so membership checks during filtering are cheap.
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of terms fed to the LDA dictionary.

    Tokens come from ``tokenize``. Only tokens longer than 4 characters that
    are not in ``en_stop`` are kept, and each survivor is mapped through
    ``get_lemma``.

    Returns:
        list of str, in original token order.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        # Length filter first, then the stop-word filter, as in the original passes.
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [5]:
import pandas as pd

# Load the labelled training data from the working directory. The recorded
# preview shows `tweet`, `label` and `polarity` columns plus two `Unnamed`
# columns (presumably saved index columns -- TODO confirm).
df = pd.read_csv('train.csv')
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,label,polarity
0,0,This computer is absolutely AMAZING!!!,GENERAL,positive
1,1,10 plus hours of battery...,OPERATION_PERFORMANCE,positive
2,2,super fast processor and really nice graphics ...,OPERATION_PERFORMANCE,positive
3,3,and plenty of storage with 250 gb(though I wil...,DESIGN_FEATURES,positive
4,4,This computer is really fast and I'm shocked a...,OPERATION_PERFORMANCE,positive


In [6]:
# One token list per row of the `tweet` column, in row order.
text_data = [prepare_text_for_lda(sentence) for sentence in df['tweet']]

In [10]:
from gensim import corpora
# Map every distinct term to an integer id, then encode each document as a
# bag-of-words list built from that dictionary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Persist both artefacts so later cells or sessions can reuse them.
# The `with` block guarantees the file handle is flushed and closed; the
# previous bare `open(...)` left it open until garbage collection.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [21]:
import gensim
NUM_TOPICS = 5
# Train LDA with the configured number of topics. NUM_TOPICS was previously
# defined but never passed, so the model silently used gensim's default topic
# count even though it is saved as 'model5.gensim'.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)
ldamodel.save('model5.gensim')
# Each entry pairs a topic id with a formatted 'weight*"word" + ...' string,
# which the next cell parses.
topics = ldamodel.print_topics()

In [33]:
# Flatten the formatted topic strings into a plain list of words: every
# ' + '-separated term looks like weight*"word", so take the part after '*'
# and strip the double quotes. Duplicates across topics are kept.
all_topics = [
    weighted_term.split('*')[1].replace('"', '')
    for topic_entry in topics
    for weighted_term in topic_entry[1].split(' + ')
]

In [60]:
def aspect_topic(aspect, all_topics, data=None):
    """Collect the topic words that occur in tweets labelled with ``aspect``.

    Args:
        aspect: Label value to select rows by (compared with ``==`` against
            the ``label`` column).
        all_topics: Iterable of topic words to look for.
        data: Optional frame with ``tweet`` and ``label`` columns. Defaults to
            the notebook-level ``df`` so existing two-argument calls behave
            as before.

    Returns:
        list of str: matching topic words without duplicates, in order of
        first occurrence (tweets in row order, topics in ``all_topics`` order).

    Notes:
        Matching is a plain, case-sensitive substring test
        (``topic in sentence``), not a token-level comparison.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook

    matched = []
    # Pair tweets and labels positionally, so a non-default index cannot
    # break the lookup the way `df['label'][count]` could.
    for sentence, label in zip(data['tweet'], data['label']):
        if label != aspect:
            continue
        # Missing tweets arrive as float NaN; a substring test on them would
        # raise TypeError, so skip anything that is not text.
        if not isinstance(sentence, str):
            continue
        matched.extend(topic for topic in all_topics if topic in sentence)
    return list(dict.fromkeys(matched))

In [68]:
# De-duplicate the topic words once (keeping first-seen order) and reuse the
# result for every aspect label.
unique_topics = list(dict.fromkeys(all_topics))
g, l, d, p, q = (
    aspect_topic(aspect_name, unique_topics)
    for aspect_name in (
        'GENERAL',
        'LAIN',
        'DESIGN_FEATURES',
        'OPERATION_PERFORMANCE',
        'QUALITY',
    )
)

In [69]:
import matplotlib
# NOTE: the recorded output shows this call had no effect in the notebook
# because the inline backend had already been selected.
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import venn

# Word lists per aspect, in the same order as the display names below.
aspect_word_lists = [g, l, d, p, q]
aspect_names = ['GENERAL', 'LAIN', 'DESIGN_FEATURES', 'OPERATION_PERFORMANCE', 'QUALITY']

labels = venn.get_labels(aspect_word_lists)
fig, ax = venn.venn5(labels, names=aspect_names)
# Write the diagram to disk, then close the current figure.
fig.savefig('venn5.png', bbox_inches='tight')
plt.close()

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(