In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os
import subprocess

import json
import csv

import pandas as pd
import numpy as np

from datetime import datetime, date, time

import snscrape.modules.twitter as sntwitter
import yfinance as yf

import re

import time

import glob

In [16]:
# Note on the `directory` argument: name it after the stock being scraped, in the form tweets_(STOCK_NAME)
# example: tweets_GME or tweets_TSLA

def scrapeTweets(start, stop, keyword, directory, tweet_limit=1):
    """Scrape tweets matching ``keyword`` and save them as a CSV inside ``directory``.

    The search string handed to snscrape's ``TwitterSearchScraper`` is
    ``'{keyword} since:{start} until:{stop}'``.

    Parameters
    ----------
    start, stop
        Values interpolated verbatim into the ``since:`` / ``until:`` operators.
    keyword
        Search term placed at the front of the query.
    directory
        Folder that receives the CSV; created (including parents) if missing.
    tweet_limit : int, default 1
        Maximum number of tweets to collect. Values <= 0 collect nothing.

    Returns
    -------
    None
        A status message is printed once the CSV has been written.
    """
    # exist_ok avoids the check-then-create race and also handles nested paths.
    os.makedirs(directory, exist_ok=True)

    # NOTE(review): the ':' characters are not valid in Windows file names; the
    # name is kept unchanged so previously scraped files keep the same naming.
    file_path = os.path.join(directory, f'keyword:{keyword}__start:{start}_end:{stop}__limit:{tweet_limit}.csv')

    scraper = sntwitter.TwitterSearchScraper(f'{keyword} since:{start} until:{stop}')

    tweet_list = []
    if tweet_limit > 0:
        # Counting from 1 and stopping right after the append collects exactly
        # `tweet_limit` tweets (the previous `i > tweet_limit` check kept one
        # extra) and never pulls an unneeded item from the scraper.
        for count, tweet in enumerate(scraper.get_items(), start=1):
            tweet_list.append([tweet.date,
                               tweet.id,
                               tweet.content,
                               tweet.user.username,
                               tweet.user.followersCount,
                               tweet.hashtags,
                               tweet.cashtags,
                               tweet.lang])
            if count >= tweet_limit:
                break

    df_tweets = pd.DataFrame(tweet_list, columns=['Datetime',
                                                  'Tweet Id',
                                                  'Text',
                                                  'Username',
                                                  'Followers Count',
                                                  'Hashtags',
                                                  'Cashtags',
                                                  'Language'])

    df_tweets.to_csv(file_path, index=False)

    # Confirm the file really landed on disk before reporting success.
    if os.path.isfile(file_path):
        print(f'Successfully saved DataFrame to {file_path}')
    else:
        print('DataFrame not saved -- possible error has occurred.')

In [17]:
# Function to clean DataFrames programmatically
# Note: it might be a good idea to combine scraping, saving, and cleaning into one function?
# Check the integrity of each wrangled DataFrame just in case ***


def dataWrangle(dataframe_list):
    """Concatenate raw tweet DataFrames and return a single cleaned DataFrame.

    Steps:
        1. Concatenate every frame in ``dataframe_list``.
        2. Keep rows whose ``Language`` is ``'en'`` and only the columns
           Datetime, Tweet Id, Text, Username, Followers Count.
        3. Drop rows with any missing value.
        4. Convert ``Tweet Id`` to string (to rectify merging issues later).
        5. Drop duplicate rows and reset the index.
        6. Strip ``https...`` links and collapse all whitespace in ``Text``.

    Parameters
    ----------
    dataframe_list : list of pandas.DataFrame
        Frames that each carry at least the columns named above plus
        ``Language``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index.
    """
    kept_columns = ['Datetime', 'Tweet Id', 'Text', 'Username', 'Followers Count']

    df_concat = pd.concat(dataframe_list)
    df_filter = df_concat.loc[df_concat['Language'] == 'en', kept_columns]

    # dropna() must run BEFORE the string cast: casting first turns a missing
    # Tweet Id into a literal string, which dropna() then no longer removes.
    df_clean = (
        df_filter
        .dropna()
        .astype({'Tweet Id': str})
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # str.split() with no arguments already splits on newlines and any other
    # whitespace run, so joining the pieces with single spaces both removes
    # line breaks and squeezes repeated blanks left behind by removed links.
    df_clean['Text'] = df_clean['Text'].apply(
        lambda text: ' '.join(re.sub(r'https\S+', '', text).split())
    )

    return df_clean
                                               

In [14]:
# List of file names

# Files come from two directories -- they were scraped at different real-world times
# but cover the same dates within the data

# Gather every file path found directly inside each of the two scrape folders.
file_path_tweets_GME, file_path_tweets_scraped = (
    glob.glob(folder_pattern)
    for folder_pattern in ('tweets_GME/*', 'tweets_scraped/*')
)

In [19]:
# Creating a list of DataFrames from glob's list of file names
# Read each CSV (GME folder first, then the second scrape folder) into its own DataFrame.
df_list = [
    pd.read_csv(csv_path)
    for csv_path in file_path_tweets_GME + file_path_tweets_scraped
]

In [12]:
# Date bounds rendered as 'YYYY-MM-DD' strings.
start_time, stop_time = (
    bound.strftime('%Y-%m-%d')
    for bound in (date(2021, 1, 8), date(2021, 2, 12))
)

def toDateTimeIndex(df): # year-month-day
    """Return a copy of ``df`` indexed by the calendar day of its ``Datetime`` column.

    Each ``Datetime`` value is parsed, floored to midnight, and stripped of its
    timezone; the result becomes a ``DatetimeIndex`` (named ``Datetime``) and the
    original column is dropped. The returned frame is sorted by that index.

    The input frame is left untouched, so calling this repeatedly on the same
    frame (e.g. when re-running a cell) always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Datetime`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame without the ``Datetime`` column, indexed and sorted by day.
    """
    # 'D' is the documented day alias; tz_localize(None) discards the timezone
    # while keeping the wall-clock value.
    day_stamps = pd.to_datetime(df['Datetime']).dt.floor('D').dt.tz_localize(tz=None)

    # drop() returns a new frame, so assigning its index cannot leak back into `df`.
    indexed = df.drop(columns=['Datetime'])
    indexed.index = pd.DatetimeIndex(day_stamps)
    return indexed.sort_index()

In [20]:
# Combine every per-file DataFrame in df_list into one cleaned frame via dataWrangle.
df_final = dataWrangle(df_list)

In [21]:
# Show the wrangled frame; the rendered output elides the middle rows.
df_final

Unnamed: 0,Datetime,Tweet Id,Text,Username,Followers Count
0,2021-01-08 23:18:18+00:00,1347684080905814016,$GME NEW ARTICLE : GameStop Is Caught in a Vic...,StckPro,4198
1,2021-01-08 23:17:53+00:00,1347683977327497217,@RamBhupatiraju @richard_chu97 @saxena_puru @F...,tmyrbrgh,263
2,2021-01-08 23:08:32+00:00,1347681621953159169,GameStop Is Caught in a Vicious Cycle $GME $TG...,newsfilterio,20861
3,2021-01-08 20:14:02+00:00,1347637710866030592,@michaeljburry what are your thought on what s...,JohnMOFOThomas,5
4,2021-01-08 18:56:31+00:00,1347618202612760576,"@ryancohen Can't stop, won't stop, GameStop! C...",AeternumLibera,36
...,...,...,...,...,...
106623,2021-02-05 14:57:54+00:00,1357705012919508995,"@carlquintanilla @CNBC They Stopped Us Again, ...",BleezyforSheezy,131
106624,2021-02-05 14:57:54+00:00,1357705010222673923,$GME said,prodigenoir,848
106625,2021-02-05 14:57:53+00:00,1357705006724620290,$GME and $AMC halted 🤨,tweek3634,24
106626,2021-02-05 14:57:52+00:00,1357705004996517891,BUY #AMC and #GME 💎🙌🚀🦍🍌 (not financial advice),YoSheenn,201
