# generate word clouds to see frequent words/topics to determine stop words

In [1]:
import nltk
import json
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud
from collections import defaultdict
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Split text into runs of word characters (letters, digits, underscore);
# punctuation and whitespace act as separators and are dropped.
WORD_PATTERN = r'\w+'
tokenizer = RegexpTokenizer(WORD_PATTERN)

In [None]:
# Start from NLTK's English stop words and extend them with project-specific terms.
stop_words = set(stopwords.words('english'))

# words used in the filter
filterWords = {"tesla", "elon", "musk", "elonmusk", "tsla", "roadster", "supercharger", "powerwall", "powerpack", "modely",
               "model3", "modelx", "teslamodely", "teslamodels", "teslamodel3", "teslamodelx", "spacex",
               "teslasuv", "teslascience"}

# bigrams used in the filter
filterBigrams = {("model", "y"), ("model", "s"), ("model", "3"), ("model", "x"), ("electric", "vehicle"),
                 ("electric", "car"), ("electric", "suv"), ("electric", "supercar")}

# stop words found using the word cloud (manual extraction)
cloud = {'year', 'amp', 'us', 'at_tesla'}

# add the filter words and the manually extracted cloud words to the stop words
stop_words.update(filterWords, cloud)

# add both halves of every filter bigram: tokens are checked one at a time
# below, so a bigram can only be suppressed through its individual words
for bi1, bi2 in filterBigrams:
    stop_words.update((bi1, bi2))

# token -> number of occurrences across relevant tweets
counts = defaultdict(int)

# The file is read as one JSON object per line. `with` guarantees the handle is
# closed even if a line fails to parse or lacks an expected key.
with open('../classify.json', encoding='utf8') as handle:
    for line in handle:
        item = json.loads(line)
        if item['isRelevant'] == 1:  # get relevant tweets
            # lower-case before tokenizing so the lower-case stop words match
            for token in tokenizer.tokenize(item['text'].lower()):
                if token not in stop_words:
                    counts[token] += 1

In [None]:
# Word cloud on a white 1000x500 canvas, limited to at most 100 words,
# built from the token frequencies collected in `counts`.
wc = WordCloud(
    width=1000,
    height=500,
    max_words=100,
    background_color="white",
)
wc.generate_from_frequencies(counts)

In [None]:
# Draw the current word cloud on a 10x5 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# print every (token, count) pair, most frequent first
counts2 = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
for entry in counts2:
    print(entry)

# look at common words in irrelevant tweets

In [None]:
# token -> number of occurrences across irrelevant tweets
irr = defaultdict(int)

# The file is read as one JSON object per line. `with` guarantees the handle is
# closed even if a line fails to parse or lacks an expected key.
with open('../classify.json', encoding='utf8') as handle:
    for line in handle:
        item = json.loads(line)
        if item['isRelevant'] == 0:  # irrelevant tweets
            # lower-case before tokenizing so the lower-case stop words match
            for token in tokenizer.tokenize(item['text'].lower()):
                if token not in stop_words:
                    irr[token] += 1

In [None]:
# Word cloud on a white 1000x500 canvas, limited to at most 100 words,
# built from the token frequencies collected in `irr`.
wc = WordCloud(
    width=1000,
    height=500,
    max_words=100,
    background_color="white",
)
wc.generate_from_frequencies(irr)

In [None]:
# Draw the current word cloud on a 10x5 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

# look at common words in negative tweets

In [None]:
# token -> number of occurrences across relevant tweets with sentiment == 0
neg = defaultdict(int)

# The file is read as one JSON object per line. `with` guarantees the handle is
# closed even if a line fails to parse or lacks an expected key.
with open('../classify.json', encoding='utf8') as handle:
    for line in handle:
        item = json.loads(line)
        # relevant tweets only; sentiment == 0 appears to be the negative label
        # (this cell feeds the negative-tweet cloud) - confirm against the labelling scheme
        if item['isRelevant'] == 1 and item['sentiment'] == 0:
            # NOTE: the former `var += 1` was dropped: `var` was never
            # initialised (NameError on the first match) and never read.
            for token in tokenizer.tokenize(item['text'].lower()):
                if token not in stop_words:
                    neg[token] += 1

In [None]:
# Word cloud on a white 1000x500 canvas, limited to at most 100 words,
# built from the token frequencies collected in `neg`.
wc = WordCloud(
    width=1000,
    height=500,
    max_words=100,
    background_color="white",
)
wc.generate_from_frequencies(neg)

In [None]:
# Draw the current word cloud on a 10x5 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()

# look at common words in positive tweets

In [None]:
# token -> number of occurrences across relevant tweets with sentiment == 2
pos = defaultdict(int)

# The file is read as one JSON object per line. `with` guarantees the handle is
# closed even if a line fails to parse or lacks an expected key.
with open('../classify.json', encoding='utf8') as handle:
    for line in handle:
        item = json.loads(line)
        # relevant tweets only; sentiment == 2 appears to be the positive label
        # (this cell feeds the positive-tweet cloud) - confirm against the labelling scheme
        if item['isRelevant'] == 1 and item['sentiment'] == 2:
            # NOTE: the former `var += 1` was dropped: `var` was never
            # initialised (NameError on the first match) and never read.
            for token in tokenizer.tokenize(item['text'].lower()):
                if token not in stop_words:
                    pos[token] += 1

In [None]:
# Word cloud on a white 1000x500 canvas, limited to at most 100 words,
# built from the token frequencies collected in `pos`.
wc = WordCloud(
    width=1000,
    height=500,
    max_words=100,
    background_color="white",
)
wc.generate_from_frequencies(pos)

In [None]:
# Draw the current word cloud on a 10x5 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
plt.show()