# Data Preprocessing
This notebook loads financial data, scrapes tweets, computes sentiment scores for the tweets, and preprocesses the data.

The setup of the cells will be as follows:
1. Load modules
1. Load and slice the data
1. Scrape tweets with snscrape
1. Clean tweets
1. Load the VADER sentiment analyzer
1. Add the compound values to the corresponding dataset

### 1. Load modules

In [31]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from datetime import datetime
import snscrape.modules.twitter as sntwitter
import numpy as np
import re

### 2. Load and slice the data

In [7]:

# DOGE: load the financial data and drop the first 1000 rows.
df_DOGE = pd.read_csv('../datasets/DOGE-USD.csv')
df_DOGE = df_DOGE[1000:]
# Search expression handed to the tweet scraper; spaces are written as %20.
keywords_doge = '(doge%20OR%20dogecoin%20OR%20dogecrypto%20)'

# MONA: same treatment as DOGE (first 1000 rows dropped).
df_MONA = pd.read_csv('../datasets/MONA-USD.csv')
df_MONA = df_MONA[1000:]
keywords_mona = '(mona%20OR%20monacoin%20OR%20monacrypto)'

# SHIB: the full file is used, no rows are dropped.
df_SHIB = pd.read_csv('../datasets/SHIB-USD (1).csv')
# NOTE(review): unlike the DOGE/MONA expressions there is no OR between
# 'shibacrypto' and 'shiba inu', and the last term contains a raw space
# instead of %20 -- confirm this is the intended query.
keywords_shib = '(shiba%20OR%20shibacoin%20OR%20shibacrypto%20shiba inu)'


### 3. Scrape tweets with snscrape

In [None]:
def anyOfWords(df, keyword='', filename='shib_tweets.csv', max_tweets=100,
               end_date='2022-11-17'):
    """Scrape English tweets matching ``keyword`` for every date listed in ``df``.

    Each value of ``df['Date']`` opens a search window that runs until the next
    value in the column; the last window is closed by ``end_date``. The query
    sent to the scraper is ``'<keyword> +  since:<start> until:<end> lang:en'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Date'`` column; its values are inserted verbatim after
        the ``since:`` / ``until:`` operators.
    keyword : str, optional
        Search expression placed in front of the date operators.
    filename : str, optional
        CSV file the collected tweets are written to. Defaults to
        ``'shib_tweets.csv'`` so existing two-argument calls keep writing to
        the same file; pass another name to avoid overwriting it when scraping
        a different coin.
    max_tweets : int, optional
        Maximum number of tweets kept per window.
    end_date : str, optional
        Upper bound (``until:``) of the final window.

    Returns
    -------
    list
        One ``[tweet.date, tweet.content]`` entry per collected tweet, in the
        order they were scraped. The same rows are written to ``filename``
        with the columns ``'Datetime'`` and ``'Text'``.
    """
    window_bounds = list(df['Date'])
    window_bounds.append(end_date)

    tweets_list = []
    # Pair every date with its successor: (d0, d1), (d1, d2), ..., (dn, end_date).
    for since, until in zip(window_bounds, window_bounds[1:]):
        query = f'{keyword} +  since:{since} until:{until} lang:en'
        scraper = sntwitter.TwitterSearchScraper(query)

        collected_before = len(tweets_list)
        for i, tweet in enumerate(scraper.get_items()):
            # `i` is zero-based, so `>=` stops after exactly `max_tweets` tweets
            # (a `>` comparison would let one extra tweet through).
            if i >= max_tweets:
                break
            tweets_list.append([tweet.date, tweet.content])

        # One progress line per window instead of one per tweet.
        print(f'{since}: {len(tweets_list) - collected_before} tweets')

    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Text'])
    tweets_df.to_csv(filename)

    return tweets_list

# Scrape the SHIB tweets; the DOGE and MONA runs are left commented out.
#tweets_doge = anyOfWords(df_DOGE, keywords_doge, 'doge_tweets.csv')
#tweets_mona = anyOfWords(df_MONA, keywords_mona)
tweets_shib = anyOfWords(df_SHIB, keywords_shib)



In [None]:
# Load the scraped tweets back from the CSV written by the scraping step.
# The index saved in that file comes back as an 'Unnamed: 0' column.
tw_shib = pd.read_csv('shib_tweets.csv')

### 4. Clean tweets

In [10]:
def tweet_cleaner(text):
    """Strip retweet prefixes, @handles and URLs from a tweet and tidy whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the NaN pandas
        produces for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed to single spaces
        and no leading or trailing whitespace.
    """
    # Missing values are not strings and cannot be passed to re.sub.
    if not isinstance(text, str):
        return ''

    # Raw strings keep `\w` / `\S` as regex escapes rather than (invalid)
    # Python string escapes.
    # remove the retweet prefix ("RT @user:")
    text = re.sub(r"RT @[\w]*:", "", text)
    # remove twitter handles (@user)
    text = re.sub(r"(@[A-Za-z0-9_]+)", "", text)
    # remove url links; `\S+` consumes the whole URL, including characters such
    # as '-', '_', '?', '=' and '%' that would otherwise be left behind
    text = re.sub(r"https?://\S+", "", text)
    # collapse whitespace
    return ' '.join(text.split())

# Clean the text of every tweet.
tw_shib['Text'] = tw_shib['Text'].apply(tweet_cleaner)
# Keep only the calendar date of each timestamp.
tw_shib['Datetime'] = pd.to_datetime(tw_shib['Datetime']).dt.date
# Persist the cleaned tweets and show the result.
tw_shib.to_csv('shib_tweets_clean.csv')
print(tw_shib)

       Unnamed: 0    Datetime  \
0               0  2020-08-01   
1               1  2020-08-01   
2               2  2020-08-01   
3               3  2020-08-01   
4               4  2020-08-01   
...           ...         ...   
82525       82525  2022-11-16   
82526       82526  2022-11-16   
82527       82527  2022-11-16   
82528       82528  2022-11-16   
82529       82529  2022-11-16   

                                                    Text  
0      Home at last! Kuma - Shiba Inu from #Chicago #...  
1      me and her/him did a successful trade they wen...  
2                                The beige shiba inu won  
3       is trusted yey, i gave her 15k for her shiba inu  
4      Trying something a little different #Grounded!...  
...                                                  ...  
82525  Shiba Inu (SHIB) Uncovers Tech Trench HUB For ...  
82526  Shiba Inu (SHIB) Uncovers Tech Trench HUB For ...  
82527  Shiba Inu (SHIB) Uncovers Tech Trench HUB For ...  
82528        

### 5. Load the VADER sentiment analyzer

In [11]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared analyzer instance, created on first use so that scoring many
# sentences does not construct a new analyzer for every call.
_ANALYZER = None


def sentiment_scores(sentence):
    """Return the VADER sentiment dictionary for ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    dict
        The result of ``SentimentIntensityAnalyzer.polarity_scores``, which
        contains the pos, neg, neu and compound scores.
    """
    global _ANALYZER
    if _ANALYZER is None:
        _ANALYZER = SentimentIntensityAnalyzer()
    return _ANALYZER.polarity_scores(sentence)

### 6. Add the compound values to the corresponding dataset

In [None]:
def add_compound(df_tweets, df_off, name):
    """Add the mean compound sentiment of each day's tweets to ``df_off``.

    Every tweet is scored with ``sentiment_scores`` and the compound values are
    averaged per day. A row of ``df_off`` receives the average of the tweets
    whose ``str(Datetime)`` equals that row's ``Date``; days without any tweet
    get NaN.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Tweets with a ``'Datetime'`` and a ``'Text'`` column. Tweets whose text
        is missing (NaN) are skipped, because there is nothing to score.
    df_off : pandas.DataFrame
        Frame with a ``'Date'`` column. It is modified in place: a float
        ``'com'`` column is added (or overwritten).
    name : str
        Prefix of the output file; the result is written to ``'<name>_com.csv'``.

    Returns
    -------
    pandas.DataFrame
        ``df_off`` itself, including the new ``'com'`` column.
    """
    # An empty tweet is read back from CSV as NaN and cannot be scored.
    has_text = df_tweets['Text'].notna()
    tweet_text = df_tweets.loc[has_text, 'Text']
    # Compare on the string form of the tweet date, as 'Date' is matched as-is.
    tweet_day = df_tweets.loc[has_text, 'Datetime'].map(str)

    compound = tweet_text.map(
        lambda text: sentiment_scores(text)['compound']
    ).astype(float)
    # One pass over the tweets instead of rescanning all of them for every day.
    mean_per_day = compound.groupby(tweet_day).mean()

    df_with_com = df_off
    # Days without tweets map to a real NaN, which keeps the column numeric
    # (a 'nan' string would turn every value in the column into text).
    df_with_com['com'] = df_off['Date'].map(mean_per_day).astype(float)
    df_with_com.to_csv(f'{name}_com.csv')
    return df_with_com


# Reload the cleaned tweets from disk and attach the per-day compound score to
# the SHIB frame; add_compound also writes the result to 'shib_com.csv'.
tw_shib = pd.read_csv('shib_tweets_clean.csv')
add_compound(tw_shib, df_SHIB, 'shib')       

