### Importing Libraries

In [None]:
# pip install nltk
# pip install contractions
# pip install whatthelang

Python interpreter will be restarted.
Collecting nltk
  Downloading nltk-3.6.7-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Collecting click
  Downloading click-8.0.3-py3-none-any.whl (97 kB)
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (764 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.0.3 nltk-3.6.7 regex-2021.11.10 tqdm-4.62.3
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-1747d502-3667-497c-a78d-a8b814861f77/bin/python -m pip install --upgrade pip' command.
Python interpreter will be restarted.


In [None]:
import pyspark
import pandas as pd

from pyspark.sql.types import *
from pyspark.sql.functions import udf,col

from whatthelang import WhatTheLang
import contractions
import nltk
import regex
import string


# Creating Spark DataFrame

In [None]:
# Copy the raw tweets CSV out of DBFS onto the driver's local filesystem so
# that pandas can open it with a plain (relative) path.
# NOTE(review): the relative 'tweets.csv' below presumably resolves because the
# notebook's working directory is /databricks/driver -- confirm.
dbutils.fs.cp(
    "/FileStore/bitcoin-tweets-2016-2019/tweets.csv",  # **learn reading data directly from dbfs
    "file:/databricks/driver/tweets.csv",
)

# The file is semicolon-separated and rows are split on '\n' only.
# Only the tweet body ("text") is kept, as a single-column DataFrame.
df = pd.read_csv('tweets.csv', sep=';', lineterminator='\n').loc[:, ["text"]]

In [None]:
# Session settings: legacy time parsing, and Arrow to speed up the
# pandas -> Spark conversion below.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Spark DataFrames are immutable: repartition() returns a NEW DataFrame, so its
# result has to be assigned (the bare `df.repartition(8)` call was a no-op).
# Repartition first and persist afterwards so the cached data is the
# 8-partition layout that every later cell works on.
df = spark.createDataFrame(df).repartition(8)

df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

# Data Cleaning

In [None]:

class _LazyLangModel:
    """Holds one WhatTheLang detector, created on first use.

    Building a WhatTheLang() instance for every row is wasteful, so the
    detector is created once and reused. The loaded model is deliberately
    dropped when this object is pickled (e.g. when Spark ships the UDF closure
    to executors); each Python worker then builds its own detector on first use.
    """

    def __init__(self):
        self._model = None

    def __getstate__(self):
        # Never serialise the loaded model; the receiver re-creates it lazily.
        return {'_model': None}

    def predict(self, s):
        if self._model is None:
            self._model = WhatTheLang()
        return self._model.predict_lang(s)


_LANG_MODEL = _LazyLangModel()


def get_lang(s:str)-> str:
    """Return the language code predicted by WhatTheLang for ``s``.

    Args:
        s: Text to classify. May be None (a null cell in the Spark column).

    Returns:
        Whatever ``WhatTheLang.predict_lang`` returns for ``s``, or None when
        ``s`` is None so that null rows do not abort the whole Spark job (they
        are then dropped by the ``lang == 'en'`` filter).
    """
    if s is None:
        return None
    return _LANG_MODEL.predict(s)
  

# Wrap the language detector as a string-returning Spark UDF and register it
# under the same name so it is also callable from Spark SQL.
predict_lang = udf(get_lang, StringType())
spark.udf.register("predict_lang", predict_lang)

# 1. Normalise the column name (a trailing '\r' appears in the header when the
#    CSV was split on '\n' only).
# 2. Tag every row with its detected language and keep English rows only.
# 3. Drop the helper column together with columns not needed downstream.
df = (
    df.withColumnRenamed('text\r', 'text')
    .withColumn('lang', predict_lang('text'))
    .where(col('lang') == 'en')
    .drop('lang', 'id', 'url', 'fullname')
)
  
#df.filter(col("text").rlike("(?i)^*follow$|(?i)^*subscribe$")).show(truncate=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:

def transform(text:str)->str:
  """Normalise a raw tweet for downstream text analysis.

  Steps, in order: lower-case; replace URLs (``www.*`` / ``http(s)://*``) and
  ``@mentions`` with a space; expand contractions word by word via
  ``contractions.fix``; delete digits and ASCII punctuation characters.

  Args:
    text: Raw tweet text. May be None (a null cell in the Spark column).

  Returns:
    The cleaned text, or None when ``text`` is None (previously a null value
    raised AttributeError and failed the whole job).
  """
  if text is None:
    return None

  # Convert to lower case
  text = text.lower()
  # Replace www.* or https?://* links with a single space
  # (raw strings: '\.' and '\s' are not valid escapes in a normal literal)
  text = regex.sub(r'((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
  # Replace @username mentions with a single space
  text = regex.sub(r'@[^\s]+',' ',text)
  # Expand contractions; this also collapses whitespace runs,
  # because the text is split and re-joined on single spaces
  text = ' '.join([contractions.fix(word) for word in text.split()])
  # Remove digits and ASCII punctuation (string.punctuation); this also removes '#'
  text = ''.join([i for i in text if ( not i.isdigit() and i not in string.punctuation)])

  return text


# Expose the tweet cleaner as a string-returning Spark UDF (also registered
# for use from Spark SQL under the same name).
get_transform = udf(transform, StringType())
spark.udf.register("get_transform", get_transform)

# Overwrite the 'text' column with its cleaned version and show the result.
df = df.withColumn('text', get_transform(col('text')))
#df = df.filter(col('text').contains('follow') || col('text').contains('free') || col('text').contains('subscribe'))
df.display()

user,timestamp,replies,likes,retweets,text
workwithai,2019-07-25 20:54:29+00,0,0,0,web design osr recruitment lowestoft united kingdom 📋 more info ai aijobs artificialintelligence php jobs hiring careers lowestoft united kingdom bitcoin eth crypto
crypto__mak,2019-09-17 22:28:02+00,0,0,1,‘master’ of alternative investments does not have a clue about bitcoin bitcoinanalysis cryptocurrency blackstone cryptocurrencynews
Maclovin6618,2019-08-06 18:30:09+00,0,0,0,what if the baby had a shot gunsay bitcoinkatkatbooommmripple😂😂
pemilijan,2019-08-10 08:31:59+00,0,0,0,buysell altcoin changes with up to x leverage at primexbt 🤑💰 join right away and convert your into ✅ ✅ receive money even if btc is falling 📉📉 gto storj cro ltc aion gnt xem fct nex wan
ttcsalam93,2019-07-11 07:09:49+00,0,0,0,drife ieo crypto blockchain ethereum bitcoin ether cryptocurrency tokensale
MoneyhealthF,2019-06-01 05:09:53+00,0,0,0,tariffs on mexico will hurt american business business in the st century is … us chamber of commerce and business roundtable — which represent businesses that … news health bitcoin
cryptoconsulti4,2019-08-30 07:59:31+00,0,0,0,
OmniLabOfficial,2019-11-07 01:09:52+00,0,0,0,well done
crypto__mak,2019-08-28 10:28:33+00,0,0,0,some of my favorite bch related things you can use today btc
Cryptonetwealth,2019-10-12 19:35:41+00,0,0,0,that is right i try my best to hit all those marks xrp bitcoin cryptocurrency trading money


In [None]:


def tag(x:str)->str:
  """Extract the distinct hashtags of a tweet as one string.

  A hashtag is any whitespace-separated token that starts with '#' and has at
  least one character after it; the leading '#' is dropped.

  Args:
    x: Tweet text. May be None (a null cell in the Spark column).

  Returns:
    ``' '`` (a single space) when there is no hashtag or ``x`` is None.
    Otherwise the distinct hashtags, sorted, formatted as ``"[a, b, c]"``.
    The function is declared ``-> str`` and is used in a StringType UDF, so it
    now really returns a string; returning the raw ``set`` left the rendering
    and the element order to the runtime.
  """
  if x is None:
    return ' '

  # len(part) > 1 skips a bare '#', which would otherwise yield an empty tag.
  hashtags = {part[1:] for part in x.split() if part.startswith('#') and len(part) > 1}

  if not hashtags:
    return ' '
  # Sorted so the same tweet always produces the same string.
  return '[' + ', '.join(sorted(hashtags)) + ']'

# Expose the hashtag extractor as a string-returning Spark UDF (also
# registered for use from Spark SQL under the same name).
get_tag = udf(tag, StringType())
spark.udf.register("get_tag", get_tag)

# Add a 'tag' column derived from 'text' and show the result.
# NOTE(review): when the notebook is run top to bottom, 'text' has already been
# cleaned by get_transform (which strips '#') before this cell, so no hashtags
# would be found -- confirm the intended cell order.
df = df.withColumn('tag', get_tag(col('text')))
df.display()

user,fullname,url,timestamp,replies,likes,retweets,text,tag
workwithai,Jobs In AI,,2019-07-25 20:54:29+00,0,0,0,"Web Design - OSR Recruitment ( Lowestoft, United Kingdom ) - [ 📋 More Info https://t.co/f0xwekYpb7 ] #AI #AiJobs #ArtificialIntelligence #PHP #jobs #Hiring #Careers #Lowestoft #United Kingdom #BitCoin #ETH #crypto https://t.co/b0F18O0RaO","[ArtificialIntelligence, BitCoin, Lowestoft, jobs, Careers, Hiring, AI, United, ETH, PHP, AiJobs, crypto]"
crypto__mak,Crypto Mak 🌐,,2019-09-17 22:28:02+00,0,0,1,‘Master’ of Alternative Investments does not Have a Clue About Bitcoin https://t.co/rpWkGUuKi1 #BitcoinAnalysis #Cryptocurrency #blackstone #CryptocurrencyNews,"[Cryptocurrency, BitcoinAnalysis, blackstone, CryptocurrencyNews]"
Maclovin6618,Maclovin,,2019-08-06 18:30:09+00,0,0,0,@Eljaboom @Ripple @MoneyGram What if the baby had a shot gun....say bitcoin...katkat....booommm...ripple😂😂,
pemilijan,Polyn Emilijan,,2019-08-10 08:31:59+00,0,0,0,Buy/Sell altcoin changes with up to 100x Leverage at PrimeXBT! 🤑💰 Join right away and convert your $50 into $12500: ✅ https://t.co/VXhcWd1UX3 ✅ Receive money even if BTC is falling! 📉📉 $GTO - $STORJ - $CRO - $LTC - $AION - $GNT - $XEM - $FCT - $NEX - $WAN https://t.co/G9D8GFtNWF,
ttcsalam93,ttcsalam,,2019-07-11 07:09:49+00,0,0,0,#Drife #IEO #Crypto #Blockchain #ethereum #bitcoin #ether #cryptocurrency #tokensale,"[cryptocurrency, ethereum, IEO, Crypto, ether, Blockchain, Drife, tokensale, bitcoin]"
MoneyhealthF,MoneyHealth &Finance,,2019-06-01 05:09:53+00,0,0,0,Tariffs on Mexico will hurt American #business Business in the 21st century is … US Chamber of Commerce and Business Roundtable — which represent businesses that … https://t.co/mOmDxD8rF5 #news #health #bitcoin https://t.co/QjiolxilHa,"[news, business, health, bitcoin]"
cryptoconsulti4,cryptoconsulting.info.eng,,2019-08-30 07:59:31+00,0,0,0,https://t.co/yCi6xk6iBs,
OmniLabOfficial,OmniLab,,2019-11-07 01:09:52+00,0,0,0,Well done!,
crypto__mak,Crypto Mak 🌐,,2019-08-28 10:28:33+00,0,0,0,Some of my favorite BCH related things you can use today https://t.co/kzS1VsPIxJ #btc,[btc]
Cryptonetwealth,Luis_G,,2019-10-12 19:35:41+00,0,0,0,"that is right, I try my best to hit all those marks.. #XRP #bitcoin #cryptocurrency #trading #money","[trading, cryptocurrency, money, XRP, bitcoin]"


In [None]:
# Collapse to a single partition so the output directory holds one part file,
# then write it to DBFS as CSV with a header row.
(
    df.coalesce(1)
    .write
    .format('com.databricks.spark.csv')
    .option('header', 'true')
    .save('dbfs:/FileStore/df/sample_tweets.csv')
)
# %fs
# ls dbfs:/FileStore/df/sample_tweets.csv/

path,name,size
dbfs:/FileStore/df/sample_tweets.csv/_SUCCESS,_SUCCESS,0
dbfs:/FileStore/df/sample_tweets.csv/_committed_9164375382535667528,_committed_9164375382535667528,111
dbfs:/FileStore/df/sample_tweets.csv/_started_9164375382535667528,_started_9164375382535667528,0
dbfs:/FileStore/df/sample_tweets.csv/part-00000-tid-9164375382535667528-a2d4eb31-0716-4afb-a10a-3eb9fe18d7eb-1-1-c000.csv,part-00000-tid-9164375382535667528-a2d4eb31-0716-4afb-a10a-3eb9fe18d7eb-1-1-c000.csv,2025207


# VADER SENTIMENT EXTRACTION

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import emoji

# VADER analyser whose lexicon is extended below with market-specific terms.
analyser = SentimentIntensityAnalyzer()

# Extra lexicon entries: token -> valence (negative = bearish, positive = bullish).
# Entries are merged into the analyser's lexicon with dict.update, so a token
# that is already present gets the value given here.
new_words = {

    'decrease': -1.5,
    'decreasing': -1.5,
    'decreased': -1.5,
    'increase': 1.5,
    'increasing': 1.5,
    'increased': 1.5,
    'rocket': 1.5,
    'rocketed': 1.5,
    'fire': 1.5,
    'bull':2.0,
    'bulls':2.0,
    'bullish':2.0,
    'bear':-2.0,
    'bears':-2.0,
    'bearish':-2.0,
    'drop':-3.0,
    'dropped':-3.0,
    'dropping':-3.0,  # correct spelling; was missing, so "dropping" never matched
    'droping':-3.0,   # misspelling kept so tweets containing it still match
    'low':-2.5,
    'lower':-2.5,
    'lowest':-3.5,
    'dip':-2.5,
    'dipped':-2.5,    # correct spelling; was missing, so "dipped" never matched
    'diped':-2.5,     # misspelling kept so tweets containing it still match
    'crash': -3.5,
    'crashed': -3.5,
    'crashing': -3.5,
    'up': 1.7,
    'down': -1.8,
    'peak': 2.5,
    'peaked': 2.5
    #'hit':-1.5
}

analyser.lexicon.update(new_words)



def apply_vader(row):
  """Score one row's 'text' with VADER.

  Args:
    row: Any mapping-like row that supports ``row['text']``.

  Returns:
    pandas Series ``[compound, neg, neu, pos]`` taken from
    ``analyser.polarity_scores``.
  """
  score = analyser.polarity_scores(row['text'])
  return pd.Series([score['compound'], score['neg'], score['neu'], score['pos']])


# `df` is a Spark DataFrame at this point (it is written with df.coalesce(...)
# elsewhere in the notebook), so the pandas-only `df.apply(..., axis=1)` and
# `df[[...]] = ...` cannot be used. The scores are computed with a UDF that
# returns a struct, which is then flattened into four columns.
VADER_FIELDS = ['compound', 'neg', 'neu', 'pos']
vader_schema = StructType([StructField(name, DoubleType()) for name in VADER_FIELDS])


def _vader_scores(text):
  """Return (compound, neg, neu, pos) for ``text``; None for null text."""
  if text is None:
    return None
  score = analyser.polarity_scores(text)
  return tuple(float(score[name]) for name in VADER_FIELDS)


get_vader = udf(_vader_scores, vader_schema)

df = df.withColumn('vader', get_vader(col('text')))
df = df.select(
    '*',
    *[col('vader').getField(name).alias(name) for name in VADER_FIELDS]
).drop('vader')

# Uploading DataFrame to S3

In [None]:
import glob
import os

import boto3


# Credentials are read from the environment instead of being typed into the
# notebook. Empty values are passed to boto3 as None so it can fall back to its
# default credential lookup.
AWSAccessKeyId = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWSSecretKey = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
region = 'ap-south-1'


# boto3 needs the RAW secret; the "/" -> "%2F" escaping is only valid inside
# the s3a:// URI further down and would corrupt the key if used here.
s3 = boto3.resource(
    service_name='s3',
    region_name=region,
    aws_access_key_id=AWSAccessKeyId or None,
    aws_secret_access_key=AWSSecretKey or None
)


# boto3 reads from the local filesystem, so the DBFS location has to be given
# through the /dbfs mount rather than as a 'dbfs:/' URI. The part-file name
# contains a run-specific id, so it is looked up instead of hard-coded.
part_files = sorted(glob.glob('/dbfs/FileStore/df/sample_tweets.csv/part-*.csv'))
if not part_files:
    raise FileNotFoundError(
        'No part-*.csv found under /dbfs/FileStore/df/sample_tweets.csv/')

s3.Bucket('bitcoin-tweets').upload_file(Filename=part_files[0], Key='sample_tweets.csv')


# Alternative route: let Spark write straight to the bucket over s3a.
# The secret is escaped here, and only here, because it sits inside a URI.
# NOTE(review): embedding credentials in the URI exposes them in logs; prefer
# cluster-level credentials if available.
df.coalesce(1).write \
.format("com.databricks.spark.csv") \
.option("header", "true") \
.save("s3a://{}:{}@{}/{}".format(AWSAccessKeyId, AWSSecretKey.replace("/", "%2F"), 'bitcoin-tweets', 'sample_data'))