In [36]:
import csv
import os
import re, string, random

import nltk
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import pandas as pd
import plotly.graph_objects as go
import tweepy

# Fetch every NLTK resource this notebook uses so that it runs top to bottom
# on a fresh environment; packages that are already installed are simply
# reported as up-to-date.
# NOTE(review): recent NLTK releases may expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
for _resource in (
    'twitter_samples',             # labelled tweets used to train the classifier
    'averaged_perceptron_tagger',  # model behind pos_tag
    'stopwords',                   # stopwords.words('english')
    'wordnet',                     # data behind WordNetLemmatizer
    'punkt',                       # model behind word_tokenize
):
    nltk.download(_resource)

def remove_noise(tweet_tokens, stop_words = ()):
    """Normalise one tokenised tweet into lowercase, lemmatised tokens.

    Every token is POS-tagged, stripped of URLs and @mentions, lemmatised
    according to its tag, and kept only if what remains is non-empty, is not
    punctuation, and is not a stop word.

    Args:
        tweet_tokens: Sequence of string tokens belonging to a single tweet.
        stop_words: Container of lowercase words to drop. The default empty
            tuple disables stop-word filtering.

    Returns:
        list: The surviving tokens, lowercased, in their original order.
    """
    # Raw strings: the URL pattern contains `\(` and `\)`, which are invalid
    # escape sequences in an ordinary string literal. The compiled pattern is
    # character-for-character the same as before.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                             r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    # Loop-invariant: one lemmatizer per tweet rather than one per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = mention_pattern.sub('', token)

        # Map the tagger's tag onto the lemmatizer's part-of-speech codes:
        # nouns -> 'n', verbs -> 'v', everything else is treated as 'a'.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so besides single
        # punctuation characters it also rejects runs such as "()" that occur
        # verbatim inside string.punctuation.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily flatten per-tweet token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: Iterable of token sequences, one per tweet.

    Yields:
        Each token of each tweet, in the order encountered.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

def get_tweets_for_model(cleaned_tokens_list):
    """Turn each tweet's tokens into a feature dict for the classifier.

    Args:
        cleaned_tokens_list: Iterable of token sequences, one per tweet.

    Yields:
        One dict per tweet mapping every distinct token to True.
    """
    for tweet_tokens in cleaned_tokens_list:
        # Presence-only features: repeated tokens collapse into a single key.
        yield dict.fromkeys(tweet_tokens, True)

if __name__ == "__main__":
    # Train and evaluate a Naive Bayes sentiment classifier on NLTK's labelled
    # twitter_samples corpus. The resulting `classifier` and `stop_words` are
    # module-level names that later cells rely on.

    # The next three names are not read anywhere below; they are retained in
    # case other cells inspect them.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tweet: strip URLs/mentions, lemmatise, drop stop words.
    positive_cleaned_tokens_list = [remove_noise(tokens, stop_words)
                                    for tokens in positive_tweet_tokens]
    negative_cleaned_tokens_list = [remove_noise(tokens, stop_words)
                                    for tokens in negative_tweet_tokens]

    # Token frequencies over the positive tweets, kept for ad-hoc inspection
    # (e.g. freq_dist_pos.most_common(10)); not used for training.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffle with a dedicated, seeded generator so the train/test split (and
    # therefore the reported accuracy) is the same on every run, without
    # touching the global `random` state.
    SPLIT_SEED = 42
    random.Random(SPLIT_SEED).shuffle(dataset)

    # The first TRAIN_SIZE shuffled examples train the model; the remainder
    # is held out for evaluation.
    TRAIN_SIZE = 7000
    train_data = dataset[:TRAIN_SIZE]
    test_data = dataset[TRAIN_SIZE:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features prints its table itself and returns
    # None, so wrapping it in print() only appended a stray "None".
    classifier.show_most_informative_features(25)
    
# Collect one day of COVID-19 related tweets around Kandy and score each of
# them with the trained classifier. Produces `df`, `df_loc`, `p`, `n` and
# `date_since` for the cells that follow.

# Twitter API credentials come from the environment so they are never saved
# inside the notebook; each one falls back to "" when the variable is unset.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


search_words = '#covid19 OR corona -filter:retweets'
date_since = "2020-06-15"
date_till = "2020-06-16"

# Cursor(...).items() can only be walked once. Materialise it into a list so
# that BOTH data frames below see every tweet; previously the first
# comprehension exhausted it and `df_loc` was always empty.
# NOTE(review): include_rts=True sits next to "-filter:retweets" in the query —
# confirm which of the two is intended.
tweets = list(tweepy.Cursor(api.search,
                            q=search_words,
                            geocode="7.2844443,80.637474,175mi", #Kandy, Srilanka
                            lang="en",
                            since=date_since, until=date_till, tweet_mode='extended',
                            include_rts=True).items())
df = pd.DataFrame([tweet.full_text for tweet in tweets], columns=['tweets'])
df_loc = pd.DataFrame([tweet.user.location for tweet in tweets], columns=['location'])

# Running tallies of tweets classified as positive / not positive.
p = 0
n = 0

# Classify the tweet text itself. Rendering the whole row with to_string()
# prepended the column label "tweets" to every text, which then leaked into
# the features of every tweet.
for text in df['tweets']:
    # Same stop-word filtering as was applied to the training data.
    custom_tokens = remove_noise(word_tokenize(text), stop_words)
    sentiment = classifier.classify(dict([token, True] for token in custom_tokens))
    print(sentiment, text)
    if sentiment == "Positive":
        p += 1
    else:
        n += 1

print(p,n,(p+n))


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\C\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\C\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Accuracy is: 0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2065.1 : 1.0
                      :) = True           Positi : Negati =   1637.7 : 1.0
                follower = True           Positi : Negati =     35.8 : 1.0
                    glad = True           Positi : Negati =     22.0 : 1.0
                     sad = True           Negati : Positi =     17.2 : 1.0
                     x15 = True           Negati : Positi =     13.2 : 1.0
                   didnt = True           Negati : Positi =     13.2 : 1.0
                    blog = True           Positi : Negati =     12.8 : 1.0
                    damn = True           Negati : Positi =     11.8 : 1.0
                 perfect = True           Positi : Negati =     11.5 : 1.0
               goodnight = True           Positi : Negati =     11.5 : 1.0
                     ugh = True           Negati : Positi =     11.2 : 1.0
                 welcome = True           

In [None]:
# Create the CSV file and write its header row: Day, Positive, Negative, Total
# NOTE(review): this creates SentimentStat_USA.csv, while the append cell
# writes to Sentiment_LK.csv in a different folder — confirm the intended target.

Columns = ['Day', 'Positive','Negative','Total']

# Path: `\/` is not a valid escape sequence; plain forward slashes name the
# same location on Windows.
# newline='' lets the csv module control line endings; without it every row
# is followed by a blank line on Windows.
with open('C:/Users/C/Documents/Python Scripts/SentimentStat_USA.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    csv_writer.writerow(Columns)

In [None]:
# Append one row (day, positive count, negative count, total) to Sentiment_LK.csv

values = [date_since,p,n,(p+n)]

# Path: `\/` is not a valid escape sequence; plain forward slashes name the
# same location on Windows.
# newline='' lets the csv module control line endings; without it every row
# is followed by a blank line on Windows.
with open('C:/Users/C/Documents/Sentiment_LK.csv', 'a+', newline='') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    csv_writer.writerow(values)


In [48]:
# Plot three bar series per day: positive tweets, negative tweets, and the
# total number of COVID-19 related tweets.

# Path: `\/` is not a valid escape sequence; plain forward slashes name the
# same location on Windows.
df = pd.read_csv('C:/Users/C/Documents/Sentiment_LK_copy.csv')

date = df['Day']
# Bars are placed at integer positions 0..len-1; the day labels are attached
# through the tick text below.
x = list(range(len(date)))
bar_plots = [
    go.Bar(x=x, y=df['Positive'], name='Positive', marker=go.bar.Marker(color='#00ff14')),
    go.Bar(x=x, y=df['Negative'], name='Negative', marker=go.bar.Marker(color='#e50000')),
    go.Bar(x=x, y=df['Total'], name='Total', marker=go.bar.Marker(color='#ffaabb'))
    ]
layout = go.Layout(
    title=go.layout.Title(text="Sentiment results", x=0.5),
    yaxis_title="Sentiment",
    xaxis_tickmode="array",
    # One tick per bar. The previous hard-coded range(27) only lined up with
    # the labels when the CSV held exactly 27 rows.
    xaxis_tickvals=x,
    xaxis_ticktext=tuple(date.values),
    )
fig = go.Figure(data=bar_plots, layout=layout)
# NOTE(review): the bars sit at integer x positions, yet this range is given
# as date strings — confirm it has the intended effect.
fig.update_xaxes(range=['6/4/2020','6/16/2020'])
fig.show()

In [56]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "Share market force stoped due to the poor selling"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Negative


In [52]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "curfew in Srilanka is lifted from tomorrow"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Positive


In [58]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "Coronavirus Cases in Sri Lanka is currently 1846 and we are badly reaching 2000"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Negative


In [59]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "The silverlining of #COVID19 was the earth getting cleaner"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Positive


In [60]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "Saudi envoy in SriLanka assures full support for COVID-19 project"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Positive


In [61]:
# Spot-check the classifier on a hand-written tweet.
custom_tweet = "Ten coronavirus patients identified in the country yesterday"

# Same cleaning as the training data, but with stop-word filtering left off.
custom_tokens = remove_noise(word_tokenize(custom_tweet))

# Presence-only features: each distinct token maps to True.
print(classifier.classify(dict.fromkeys(custom_tokens, True)))

Positive


In [49]:
# Stock market behaviour: bar chart of the All Share Price Index (ASPI) per day.

# Path: `\/` is not a valid escape sequence; plain forward slashes name the
# same location on Windows.
df = pd.read_csv('C:/Users/C/Documents/Python Scripts/ASPI_copy.csv')

date = df['Day']
# Bars are placed at integer positions 0..len-1; the day labels are attached
# through the tick text below.
x = list(range(len(date)))
bar_plots = [
    go.Bar(x=x, y=df['Value'], name='ASPI', marker=go.bar.Marker(color='#00aa14'))
    ]
layout = go.Layout(
    title=go.layout.Title(text="Stock market behaviour - Colombo Stock Exchange", x=0.5),
    yaxis_title="All Share Price Index - ASPI",

    xaxis_tickmode="array",
    # One tick per bar. The previous hard-coded range(27) only lined up with
    # the labels when the CSV held exactly 27 rows.
    xaxis_tickvals=x,
    xaxis_ticktext=tuple(date.values),
    )

fig = go.Figure(data=bar_plots, layout=layout)
# Restrict the y axis to a fixed 4750-4950 window.
fig.update_yaxes(range=[4750,4950])
fig.show()