In [9]:
# nltk.download('twitter_samples')
# nltk.download('punkt')
# nltk.download('wordnet') #lexical db for english language that helps script determine base word 
# nltk.download('averaged_perceptron_tagger') #determines the context of a word in a sentence 
# nltk.download('stopwords')

In [2]:
import nltk
import re, string, random
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [3]:
"""Interface"""

stop_words = stopwords.words('english')

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9963333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2017.3 : 1.0
                     sad = True           Negati : Positi =     24.8 : 1.0
                follower = True           Positi : Negati =     23.9 : 1.0
                followed = True           Negati : Positi =     23.4 : 1.0
                  arrive = True           Positi : Negati =     20.2 : 1.0
                    glad = True           Positi : Negati =     19.1 : 1.0
                     bam = True           Positi : Negati =     19.1 : 1.0
                    blog = True           Positi : Negati =     14.9 : 1.0
               community = True           Positi : Negati =     14.9 : 1.0
                     via = True           Positi : Negati =     14.0 : 1.0
None


In [4]:
#freq distribution test
all_pos_words = get_all_words(negative_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_pos.most_common(10)

[(':(', 4585),
 (':-(', 501),
 ("i'm", 343),
 ('...', 332),
 ('get', 325),
 ('miss', 291),
 ('go', 275),
 ('please', 275),
 ('want', 246),
 ('like', 218)]

In [170]:
"""Scraper"""
# import twint

# c = twint.Config()
# c.Username = "elonmusk"
# c.Search = "tesla"
# c.Limit = 2000
# c.Store_csv = True
# c.Output = "elonmusk_tesla.csv"


# twint.run.Search(c)

In [11]:
import pandas as pd
pd.set_option('max_colwidth', 150)

elonTwts = pd.read_csv("elonmusk_tesla.csv")
nullClms = elonTwts.columns[elonTwts.isna().any()].tolist() #null columns in list 
#elonTwts.columns.values
elonTwts = elonTwts.drop(nullClms,axis=1) #dropping null columns 
#elonTwts.info()

columnsMain = ['user_id','username','date','time','name','tweet','photos','timezone',
               'mentions','replies_count','retweets_count','likes_count','reply_to']
columnsTest = ['user_id','username','date','time','replies_count',
               'retweets_count','likes_count','tweet']

elonTwts = elonTwts[columnsTest]
elonTwts.head()

Unnamed: 0,user_id,username,date,time,replies_count,retweets_count,likes_count,tweet
0,44196397,elonmusk,2020-05-10,00:23:20,228,118,4119,Exactly
1,44196397,elonmusk,2020-05-09,16:17:07,2457,5205,31262,"I’m not messing around. Absurd & medically irrational behavior in violation of constitutional civil liberties, moreover by *unelected* county offi..."
2,44196397,elonmusk,2020-05-09,15:43:26,196,155,5505,"Much appreciated, Mayor Fine!"
3,44196397,elonmusk,2020-05-09,10:18:14,608,932,10070,Tesla is the biggest manufacturer in California & second biggest exporter
4,44196397,elonmusk,2020-05-09,10:14:28,359,537,4599,Exactly! Tesla knows far more about what needs to be done to be safe through our Tesla China factory experience than an (unelected) interim junior...


In [19]:
elonTwts['tokenized'] = [word_tokenize(tweet) for tweet in elonTwts['tweet']]
elonTwts['cleaned'] = [remove_noise(tweet,stop_words) for tweet in elonTwts['tokenized']]
elonTwts['sentiment'] = [classifier.classify(dict([token, True] for token in tweet)) for tweet in elonTwts['cleaned']]    

# elonTwts[['tweet','tokenized','cleaned','sentiment']].head()
elonTwts[['tweet','cleaned','sentiment']].head()
#elonTwts[['tweet','tokenized','cleaned']].loc[1]

Unnamed: 0,tweet,cleaned,sentiment
0,Exactly,[exactly],Positive
1,"I’m not messing around. Absurd & medically irrational behavior in violation of constitutional civil liberties, moreover by *unelected* county offi...","[’, mess, around, absurd, medically, irrational, behavior, violation, constitutional, civil, liberty, moreover, *unelected*, county, official, acc...",Negative
2,"Much appreciated, Mayor Fine!","[much, appreciate, mayor, fine]",Positive
3,Tesla is the biggest manufacturer in California & second biggest exporter,"[tesla, big, manufacturer, california, second, big, exporter]",Positive
4,Exactly! Tesla knows far more about what needs to be done to be safe through our Tesla China factory experience than an (unelected) interim junior...,"[exactly, tesla, know, far, need, safe, tesla, china, factory, experience, unelected, interim, junior, official, alameda, county]",Positive


In [109]:
elonTwts['sentiment'].value_counts()

Positive    801
Negative    598
Name: sentiment, dtype: int64

In [None]:
#model test
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
custom_tokens = remove_noise(word_tokenize(custom_tweet))

etweet = elonTwts['cleaned'][2]
etoken = remove_noise(etweet)

print(etweet)
print(classifier.classify(dict([token, True] for token in etweet)))

In [20]:
#freq distribution test
all_pos_words = get_all_words(elonTwts['cleaned'])
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_pos.most_common(10)

[('tesla', 904),
 ('’', 313),
 ('http', 217),
 ('…', 207),
 ('car', 206),
 ('model', 106),
 ('year', 98),
 ('good', 89),
 ('work', 83),
 ('make', 82)]