In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from textblob import TextBlob
import numpy as np
import nltk
import re
import flair
import os
import datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
def _scrub_tweet(text, stopwords):
    """Return ``text`` without links, punctuation, digits, coin tickers and stopwords."""
    # remove http links
    text = re.sub(r"(http|https)://\S+", "", text)
    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove coin symbols; word boundaries keep ordinary words intact
    # (without them "method" became "mhod"), and tickers match in any case
    text = re.sub(r"\b(btc|eth|ltc|ico)\b", "", text, flags=re.IGNORECASE)
    # remove stopwords and collapse the leftover whitespace
    return " ".join(word for word in text.split() if word not in stopwords)


def clean_tweet(input_filename, output_filename, stopwords=None):
    """Clean raw tweets from a CSV file and write the result to a new CSV.

    The columns at positions 3, 4 and 10 of the input are taken as date, time
    and tweet text. Date and time are joined into one ``Timestamp`` column and
    a ``Cleaned Tweets`` column is added with the scrubbed tweet text. The
    column is named ``Cleaned Tweets`` because that is the name
    ``setiment_analyze`` reads.

    Args:
        input_filename: Path of the raw tweet CSV.
        output_filename: Path the cleaned CSV is written to (overwritten).
        stopwords: Optional collection of words to drop from each tweet.
            Defaults to NLTK's English stopword list.
    """
    if stopwords is None:
        stopwords = set(nltk.corpus.stopwords.words('english'))

    # opening data and filtering used features
    raw = pd.read_csv(input_filename)
    picked = raw.iloc[:, [3, 4, 10]].copy()
    picked.columns = ["Date", "Time", "Tweet"]
    # The first data row is discarded, exactly as before
    # (presumably a repeated header row in the export -- TODO confirm).
    picked = picked.iloc[1:, :].copy()
    picked['Timestamp'] = ['%s %s' % pair
                           for pair in zip(picked['Date'], picked['Time'])]
    df = picked[['Timestamp', 'Tweet']].copy()

    # Missing tweets are read back as NaN (a float); treat them as empty text
    # so the regex substitutions do not raise.
    tweets = df['Tweet'].fillna('').astype(str)
    df['Cleaned Tweets'] = [_scrub_tweet(tweet, stopwords) for tweet in tweets]
    df.to_csv(output_filename)

In [5]:
# Timestamp format for hourly values: the date and hour come from the
# datetime, while minutes and seconds are always written as the literal "00:00".
fmt = '%Y-%m-%d %H:00:00'
# Module-level VADER analyzer instance; setiment_analyze scores tweets with it.
sid = SentimentIntensityAnalyzer()

2021-08-09 18:59:03,909 loading file C:\Users\azkaz\.flair\models\sentiment-en-mix-distillbert_4.pt


In [9]:
def setiment_analyze(input_filename, output_filename, analyzer=None):
    """Score every cleaned tweet with VADER and append the scores to a CSV.

    Reads the ``Cleaned Tweets`` and ``Timestamp`` columns of the input and
    writes one row per tweet with the columns ``timestamp``,
    ``twitter_positifitas`` (VADER ``pos``) and ``twitter_negatifitas``
    (VADER ``neg``).

    The output is opened in append mode and the header is written only when
    the file does not exist yet, so calling this twice with the same output
    path appends the rows a second time.

    Args:
        input_filename: Path of the cleaned tweet CSV.
        output_filename: Path of the sentiment CSV to create or append to.
        analyzer: Optional object with a ``polarity_scores(text)`` method
            returning a mapping with ``'pos'`` and ``'neg'`` keys. Defaults
            to the module-level ``sid`` analyzer.
    """
    if analyzer is None:
        analyzer = sid

    # opening cleaned data
    df = pd.read_csv(input_filename)
    df = df[['Cleaned Tweets', 'Timestamp']]

    # Missing cells become empty strings; everything else is coerced to str so
    # the analyzer always receives text.
    timestamps = ['' if pd.isna(stamp) else str(stamp) for stamp in df['Timestamp']]
    texts = ['' if pd.isna(text) else str(text) for text in df['Cleaned Tweets']]

    # analyze the tweets using VADER
    scores = [analyzer.polarity_scores(text) for text in texts]

    # Build the whole result once instead of creating frames and re-opening
    # the output file for every single tweet.
    final_senti_df = pd.DataFrame(
        {
            'twitter_positifitas': [score['pos'] for score in scores],
            'twitter_negatifitas': [score['neg'] for score in scores],
        },
        index=pd.Index(timestamps, name='timestamp'),
    )

    keep_header = not os.path.exists(output_filename)
    final_senti_df.to_csv(output_filename, mode='a', header=keep_header)

    return

In [None]:
def categorize_sentiment(input_filename, output_filename, start=None, end=None):
    """Average per-tweet sentiment scores into hourly buckets and save them.

    Each input row is assigned to the bucket obtained by adding one hour to
    its timestamp and zeroing minutes and seconds (a tweet at 10:15 lands in
    the 11:00 bucket). For every hour from ``start`` to ``end`` inclusive the
    output holds the mean ``twitter_positifitas`` and ``twitter_negatifitas``
    of the tweets in that bucket, or 0 when the bucket is empty. Rows whose
    bucket falls outside the window are ignored.

    Args:
        input_filename: CSV with ``timestamp`` (``%Y-%m-%d %H:%M:%S``),
            ``twitter_positifitas`` and ``twitter_negatifitas`` columns.
        output_filename: Path the hourly CSV is written to (overwritten).
        start: First bucket as a ``datetime``; defaults to 2021-08-01 00:00.
        end: Last bucket (inclusive) as a ``datetime``; defaults to
            2021-08-02 00:00.

    Raises:
        ValueError: If an input timestamp does not match the expected format.
    """
    hour_fmt = '%Y-%m-%d %H:00:00'
    one_hour = datetime.timedelta(hours=1)
    if start is None:
        start = datetime.datetime(2021, 8, 1)
    if end is None:
        end = datetime.datetime(2021, 8, 2)

    in_df = pd.read_csv(input_filename)

    # One key per hour in the requested window, in chronological order.
    bucket_keys = []
    cursor = start
    while cursor <= end:
        bucket_keys.append(cursor.strftime(hour_fmt))
        cursor += one_hour

    # bucket key -> [sum of positive scores, sum of negative scores, tweet count]
    totals = {key: [0.0, 0.0, 0] for key in bucket_keys}
    for stamp, pos, neg in zip(in_df['timestamp'],
                               in_df['twitter_positifitas'],
                               in_df['twitter_negatifitas']):
        moment = datetime.datetime.strptime(str(stamp), '%Y-%m-%d %H:%M:%S')
        slot = totals.get((moment + one_hour).strftime(hour_fmt))
        if slot is None:
            # Outside the window: skipped on purpose, not silently swallowed
            # together with every other possible error.
            continue
        slot[0] += float(pos)
        slot[1] += float(neg)
        slot[2] += 1

    # normalize the summed sentiment by the number of tweets in each hour
    rows = []
    for key in bucket_keys:
        pos_sum, neg_sum, count = totals[key]
        if count == 0:
            rows.append((0, 0.0, 0.0))
        else:
            rows.append((0, pos_sum / count, neg_sum / count))

    # 'twitter_flair' is kept as an all-zero column because the output has
    # always carried it -- presumably a placeholder for a flair score; TODO confirm.
    out_df = pd.DataFrame(
        rows,
        columns=['twitter_flair', 'twitter_positifitas', 'twitter_negatifitas'],
        index=pd.Index(bucket_keys, name='timestamp'),
    )
    out_df.to_csv(output_filename)

In [None]:
if __name__ == '__main__':
    # Run the full pipeline: clean -> per-tweet sentiment -> hourly buckets.
    input_filename = 'twitter_data.csv'
    # splitext strips the extension whatever its length; the old [0:-4]
    # slicing only worked for three-letter extensions.
    base_name = os.path.splitext(input_filename)[0]
    output_cleaned_filename = base_name + '_cleaned.csv'
    output_sentiment_filename = base_name + '_sentiment.csv'
    output_categorize_sentiment_filename = (
        os.path.splitext(output_sentiment_filename)[0] + '_bucketized.csv'
    )

    # Cleaning tweets (the function is named clean_tweet; the previous call
    # to cleaning_data raised NameError).
    clean_tweet(input_filename, output_cleaned_filename)

    # Sentiment analysis of the tweet
    setiment_analyze(output_cleaned_filename, output_sentiment_filename)

    # Get all sentiment reports and categorize them into hourly basis data according to the timestamp
    categorize_sentiment(output_sentiment_filename, output_categorize_sentiment_filename)