## Import Libraries

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os
import subprocess

import json
import csv

import pandas as pd
import numpy as np

from datetime import datetime, date, time

import snscrape.modules.twitter as sntwitter
import yfinance as yf

import re

import time

import glob

from textblob import TextBlob

## scrapeTweets() and dataWrangle() functions

In [4]:
# Note on the `directory` argument: name it after the stock being scraped, in the form tweets_(STOCK_NAME),
# for example tweets_GME or tweets_TSLA.

def scrapeTweets(start, stop, keyword, directory, tweet_limit=1):
    """Scrape tweets matching ``keyword`` and save them as a CSV file in ``directory``.

    Parameters
    ----------
    start, stop : str
        Bounds inserted into the search query as ``since:{start} until:{stop}``.
    keyword : str
        Search term placed at the front of the query.
    directory : str
        Output folder; created (including missing parents) if it does not exist.
    tweet_limit : int, default 1
        Maximum number of tweets to collect.

    Returns
    -------
    None
        The outcome is reported with a printed message.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair and also copes with nested paths.
    os.makedirs(directory, exist_ok=True)

    # NOTE(review): the file name contains ':' characters, which Windows does not allow in file names.
    file_path = os.path.join(directory, f'keyword:{keyword}__start:{start}_end:{stop}__limit:{tweet_limit}.csv')

    query = f'{keyword} since:{start} until:{stop}'
    tweet_list = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        # `>=` (not `>`): enumerate starts at 0, so `>` would keep tweet_limit + 1 tweets.
        if i >= tweet_limit:
            break
        tweet_list.append([tweet.date,  # One row per tweet, same order as the columns below
                           tweet.id,
                           tweet.content,
                           tweet.user.username,
                           tweet.user.followersCount,
                           tweet.hashtags,
                           tweet.cashtags,
                           tweet.lang])

    df_tweets = pd.DataFrame(tweet_list, columns=['Datetime',
                                                  'Tweet Id',
                                                  'Text',
                                                  'Username',
                                                  'Followers Count',
                                                  'Hashtags',
                                                  'Cashtags',
                                                  'Language'])

    df_tweets.to_csv(file_path, index=False)  # Write the collected tweets to the CSV file

    # Confirm the file is actually on disk before reporting success.
    if os.path.isfile(file_path):
        print(f'Successfully saved DataFrame to {file_path}')
    else:
        print('DataFrame not saved -- possible error has occurred.')
    return None

In [5]:
# Creating a function to clean DataFrames programmatically


def dataWrangle(dataframe_list):
    """Combine scraped-tweet DataFrames into one cleaned DataFrame.

    Steps: concatenate the frames, keep rows whose ``Language`` is ``'en'``,
    keep five columns, drop rows with missing values, cast ``Tweet Id`` to
    ``str``, drop duplicate rows, reset the index, and tidy the tweet text.

    Parameters
    ----------
    dataframe_list : list of pandas.DataFrame
        Frames containing at least the columns ``Datetime``, ``Tweet Id``,
        ``Text``, ``Username``, ``Followers Count`` and ``Language``.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a fresh 0..n-1 index and the five kept columns.
    """
    keep_columns = ['Datetime', 'Tweet Id', 'Text', 'Username', 'Followers Count']

    df_concat = pd.concat(dataframe_list)  # Stack all frames on top of each other
    df_filter = df_concat.loc[df_concat['Language'] == 'en', keep_columns]

    # dropna() must run BEFORE the cast: astype(str) turns a missing id into the
    # literal text 'nan'/'None', which dropna() no longer recognises as missing.
    df_clean = (
        df_filter.dropna()
                 .astype({'Tweet Id': str})
                 .drop_duplicates()
                 .reset_index(drop=True)
    )

    def _tidy(text):
        # Strip anything starting with 'https', then collapse every run of whitespace
        # (newlines included -- str.split() with no argument handles them) to one space.
        return ' '.join(re.sub(r'https\S+', '', text).split())

    df_clean['Text'] = df_clean['Text'].apply(_tidy)

    return df_clean
                                               

## Creating DataFrames of Tweets

In [6]:
# Paths of the scraped tweet files.
#
# The files live in two directories because they were scraped on two separate
# occasions, both covering the same dates.
#
# glob returns paths in arbitrary order, so they are sorted to make the load order
# (and therefore row positions in the combined data) the same on every run.
file_path_tweets_GME = sorted(glob.glob('tweets_GME/*'))
file_path_tweets_scraped = sorted(glob.glob('tweets_scraped/*'))

In [7]:
# Read each file found by glob (both folders) into its own DataFrame
df_list = [pd.read_csv(csv_path) for csv_path in file_path_tweets_GME + file_path_tweets_scraped]

In [8]:
# Date strings in YYYY-MM-DD form (isoformat() produces exactly that layout).
start_time = date(2021, 1, 8).isoformat()
stop_time = date(2021, 2, 12).isoformat()

def toDateTimeIndex(df): # year-month-day
    """Return a copy of ``df`` indexed by day-level, timezone-naive timestamps.

    The ``Datetime`` column is parsed, floored to the day, stripped of any
    timezone, moved into the index and removed from the columns. Rows are
    returned sorted by that index.

    The input frame is left untouched: the previous version overwrote the
    caller's ``Datetime`` column and index as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Datetime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with a sorted ``DatetimeIndex`` named ``Datetime``.
    """
    day_stamps = pd.to_datetime(df['Datetime']).dt.floor('d').dt.tz_localize(tz=None)
    # assign() builds a new frame, so the caller's data is never modified.
    return (
        df.assign(Datetime=day_stamps)
          .set_index('Datetime')
          .sort_index()
    )

In [9]:
# Merge every loaded frame and clean the result (English tweets only, duplicates removed, text tidied).
df = dataWrangle(df_list)

In [11]:
# Parse the timestamps, truncate each one to its day, and make it timezone-naive.
df['Datetime'] = pd.to_datetime(df['Datetime']).dt.floor('d').dt.tz_localize(tz=None)

## Polarity Scores

Goals for this section:

- Compute polarity/subjectivity scores for every tweet
- For now, work only with dates from 01/21/2021 onwards, as a test
- Experiment with the distribution of negative/positive/neutral scores to get a healthy balance of samples
- Break everything up by day
- Move on to bringing in financial data

In [14]:
# Run TextBlob on every tweet; `.sentiment` gives the (polarity, subjectivity) pair stored in the new column.
df['Polarity/Subjectivity Scores'] = df['Text'].apply(lambda text: TextBlob(text).sentiment)

In [17]:
# Discard tweets whose sentiment is exactly (0.0, 0.0), then index the rest by date in chronological order.
is_scored = df['Polarity/Subjectivity Scores'] != (0.0, 0.0)
df_pol_clean = df[is_scored].set_index('Datetime').sort_index()

In [19]:
# Preview the scored tweets (the output below shows the first and last rows of the frame).
df

Unnamed: 0,Datetime,Tweet Id,Text,Username,Followers Count,Polarity/Subjectivity Scores
0,2021-01-08,1347684080905814016,$GME NEW ARTICLE : GameStop Is Caught in a Vic...,StckPro,4198,"(-0.09090909090909091, 0.6886363636363636)"
1,2021-01-08,1347683977327497217,@RamBhupatiraju @richard_chu97 @saxena_puru @F...,tmyrbrgh,263,"(0.0, 0.0)"
2,2021-01-08,1347681621953159169,GameStop Is Caught in a Vicious Cycle $GME $TG...,newsfilterio,20861,"(-1.0, 1.0)"
3,2021-01-08,1347637710866030592,@michaeljburry what are your thought on what s...,JohnMOFOThomas,5,"(0.225, 0.625)"
4,2021-01-08,1347618202612760576,"@ryancohen Can't stop, won't stop, GameStop! C...",AeternumLibera,36,"(0.0, 0.0)"
...,...,...,...,...,...,...
106623,2021-02-05,1357705012919508995,"@carlquintanilla @CNBC They Stopped Us Again, ...",BleezyforSheezy,131,"(0.5, 0.6)"
106624,2021-02-05,1357705010222673923,$GME said,prodigenoir,848,"(0.0, 0.0)"
106625,2021-02-05,1357705006724620290,$GME and $AMC halted 🤨,tweek3634,24,"(0.0, 0.0)"
106626,2021-02-05,1357705004996517891,BUY #AMC and #GME 💎🙌🚀🦍🍌 (not financial advice),YoSheenn,201,"(0.0, 0.0)"


In [35]:
# Keep tweets dated 2021-01-21 or later and only the two columns needed for labelling.
# A single .loc call replaces the chained [] selection, and .copy() makes the result an
# independent frame, so adding a column to it later does not trigger SettingWithCopyWarning.
df_sorted_dates = df_pol_clean.loc["2021-01-21":, ['Text', 'Polarity/Subjectivity Scores']].copy()

In [36]:
# Preview the date-filtered tweets and their sentiment tuples.
df_sorted_dates

Unnamed: 0_level_0,Text,Polarity/Subjectivity Scores
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-21,"So, Dr. Burry, some free advice for a good guy...","(0.15555555555555559, 0.6611111111111111)"
2021-01-21,"@LizClaman @GameStop @ClamanCountdown Liz, And...","(-0.18787878787878784, 0.4055555555555555)"
2021-01-21,@FarisBakkar @Long_GME E-commerce growth for G...,"(0.05833333333333335, 0.4583333333333333)"
2021-01-21,@FarisBakkar @Long_GME You are honestly recycl...,"(0.049999999999999975, 0.5083333333333333)"
2021-01-21,UPDATE: Citron's Andrew Left On GameStop Short...,"(0.075, 0.3989583333333333)"
...,...,...
2021-02-11,@KChampbell @2009Michael1984 Just think if Blo...,"(0.05393939393939393, 0.4242424242424243)"
2021-02-11,"@aurban22 @RocketCatchnBob Yes, you should be ...","(0.45, 0.7866666666666667)"
2021-02-11,"My retarded question is, since diamond hands a...","(-0.2, 0.45)"
2021-02-11,Cannabis stocks lit up as the Reddit rally dro...,"(0.18181818181818182, 0.5)"


In [97]:
def binaryPolarity(tup, threshold=0.1):
    """Label a sentiment tuple as 'Negative', 'Positive' or 'Neutral'.

    Despite the name, three labels are produced.

    Parameters
    ----------
    tup : sequence
        Sentiment result whose first element is the polarity score.
    threshold : float, default 0.1
        Half-width of the neutral band: polarity below ``-threshold`` is
        'Negative', above ``threshold`` is 'Positive', anything else 'Neutral'.

    Returns
    -------
    str
        One of 'Negative', 'Positive', 'Neutral'.

    Notes
    -----
    A later cell in this notebook defines ``binaryPolarity`` again with a zero
    cut-off; whichever cell ran last is the definition in effect.
    """
    polarity = tup[0]
    if polarity < -threshold:
        return 'Negative'
    if polarity > threshold:
        return 'Positive'
    return 'Neutral'

In [105]:
def binaryPolarity(tup, threshold=0.0):
    """Label a sentiment tuple as 'Negative', 'Positive' or 'Neutral'.

    Despite the name, three labels are produced.

    Parameters
    ----------
    tup : sequence
        Sentiment result whose first element is the polarity score.
    threshold : float, default 0.0
        Half-width of the neutral band: polarity below ``-threshold`` is
        'Negative', above ``threshold`` is 'Positive', anything else 'Neutral'.
        With the default, only a polarity of exactly zero is 'Neutral'.

    Returns
    -------
    str
        One of 'Negative', 'Positive', 'Neutral'.

    Notes
    -----
    This definition replaces the one from an earlier cell of this notebook,
    which used a cut-off of 0.1 instead of 0.
    """
    polarity = tup[0]
    if polarity < -threshold:
        return 'Negative'
    if polarity > threshold:
        return 'Positive'
    return 'Neutral'

In [99]:
# Spot-check a single row, selected by position. .iloc states that intent explicitly; a bare
# integer key on this date-indexed Series relies on a positional fallback that pandas has deprecated.
binaryPolarity(df_sorted_dates['Polarity/Subjectivity Scores'].iloc[12312])

'Neutral'

In [106]:
# Label every tweet from its sentiment tuple (the function is passed directly; no wrapper lambda needed).
polarity_scores = df_sorted_dates['Polarity/Subjectivity Scores']
df_sorted_dates['Polarity Categories'] = polarity_scores.apply(binaryPolarity)

In [104]:
# Number of tweets per polarity label.
# NOTE(review): this cell's execution count (104) is lower than those of the cells above it (105, 106),
# so the output below presumably reflects the 0.1-cut-off binaryPolarity -- confirm by re-running in order.
df_sorted_dates.value_counts('Polarity Categories')

Polarity Categories
Positive    34678
Neutral     25397
Negative    13915
dtype: int64

In [107]:
# Number of tweets per polarity label.
# NOTE(review): executed after cells 105 and 106, so these counts presumably reflect the
# zero-cut-off binaryPolarity -- confirm by re-running the notebook in order.
df_sorted_dates.value_counts('Polarity Categories')

Polarity Categories
Positive    44135
Negative    21719
Neutral      8136
dtype: int64