In [50]:
import tweepy #requires pip install
import numpy as np
import pandas as pd
import config

In [51]:
# Pull the Twitter API credentials out of the local config module so that no
# secret is written into the notebook itself.
consumer_key, consumer_secret = config.consumer_key, config.consumer_secret
access_token, access_token_secret = config.access_token, config.access_token_secret

In [52]:
# Build the OAuth handler from the consumer key/secret, then attach the access
# token pair to that same handler before creating the client.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# API client whose `search` method is paged through by the tweet query cell.
api = tweepy.API(auth)

In [53]:
# Station table; the query cell reads its 'Station', 'Latitude' and 'Longitude' columns.
# Adjacent string literals are joined by Python, so the path stays on two short
# lines without a backslash continuation inside the string.
stations = pd.read_csv(
    '/Users/christopherkindl/working/data-engineering-group-assignment/'
    '01_web_scraping/Transport Data.csv'
)

In [78]:
# test query: collect a handful of tweets posted near each of the first stations
number_of_tweets = 20     # maximum tweets requested per station
number_of_stations = 20   # only the first N rows of `stations` are queried

# Parallel result lists: one entry per collected tweet.
tweets, station, date = [], [], []

for row in range(len(stations[:number_of_stations])):
    # Geocode filter in the form "<latitude>,<longitude>,<radius>".
    geocode = ','.join([
        str(stations['Latitude'][row]),
        str(stations['Longitude'][row]),
        '1km',
    ])
    cursor = tweepy.Cursor(api.search, q='london', lang='en', geocode=geocode)
    for status in cursor.items(number_of_tweets):
        tweets.append(status.text)
        station.append(stations['Station'][row])
        date.append(status.created_at)

In [79]:
# Combine the three parallel lists into one frame, one row per collected tweet.
df = pd.DataFrame(dict(tweets=tweets, date=date, station=station))

In [80]:
# Inspect the collected tweets (columns: tweets, date, station).
df

Unnamed: 0,tweets,date,station
0,"In #Newham today, was great to meet with the l...",2021-04-20 20:15:20,Abbey Road
1,While there is massive upheaval going on in th...,2021-04-19 11:26:43,Abbey Road
2,"Just posted a photo @ Abbey Wood, London, SE2 ...",2021-04-20 07:00:25,Abbey Wood
3,Walking on the wild side #LondonBridge #TowerB...,2021-04-21 16:20:23,Acton Central
4,"Just posted a video @ Bedford Park, London htt...",2021-04-19 17:03:53,Acton Central
...,...,...,...
83,"This is what happens when you let ""weeds"" take...",2021-04-21 10:51:04,Archway
84,New Householder Application planning applicati...,2021-04-20 01:38:16,Archway
85,New work. Acrylic on canvas 100cm x100cm. #art...,2021-04-19 08:01:32,Archway
86,"I'm at Norman’s in London, England https://t.c...",2021-04-18 11:51:21,Archway


In [59]:
# pyspark modules
from pyspark.sql import SparkSession
from textblob import TextBlob
from pyspark.sql.functions import udf

In [60]:
!pip install vaderSentiment

You should consider upgrading via the '/Users/christopherkindl/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
def apply_blob(sentence):
    """Map a sentence to a sentiment class using TextBlob.

    The class is derived from the first component of TextBlob's sentiment
    result (its polarity score):

    * ``0.0`` - the score is exactly zero (neutral)
    * ``1.0`` - the score is greater than zero (positive)
    * ``2.0`` - anything else (negative)
    """
    polarity = TextBlob(sentence).sentiment[0]
    if polarity == 0.0:
        return 0.0  # Neutral
    # Zero was handled above, so ">=" only matches strictly positive scores here.
    return 1.0 if polarity >= 0.0 else 2.0  # Positive / Negative

In [61]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One shared VADER analyzer instance, reused by the scoring cells and by apply_vader.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score a small batch of sentences with VADER and keep every result.
# `sentences` is not defined in any visible cell, so the original loop raised a
# NameError on a fresh kernel; a sample batch is defined here to keep the cell
# self-contained. Replace it with the texts you actually want to score.
sentences = [
    'Make sure that you dont sound to cooperate!',
]

sentence_scores = []
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    # Previously each result was overwritten by the next iteration.
    sentence_scores.append(vs)

In [63]:
# Score one hand-written sentence to see what VADER's polarity_scores returns.
sentence = 'Make sure that you dont sound to cooperate!'
vs = analyzer.polarity_scores(sentence)

In [71]:
# Inspect the scores computed for the sample sentence.
vs

{'neg': 0.0, 'neu': 0.73, 'pos': 0.27, 'compound': 0.3802}

In [90]:
def apply_vader(sentence):
    """Return VADER's compound sentiment score for ``sentence``, rounded to 2 decimals.

    Args:
        sentence: Text to score, or ``None`` (e.g. a null cell in the Spark
            column this function is applied to).

    Returns:
        The ``compound`` score as a built-in ``float``, or ``None`` when
        ``sentence`` is ``None`` so that a missing tweet yields a null score
        instead of raising inside the analyzer and failing the whole job.
    """
    if sentence is None:
        return None
    scores = analyzer.polarity_scores(sentence)
    # Cast to a plain Python float so the value can be handed back to Spark.
    return float(round(scores['compound'], 2))

# Register the sentiment function as a user-defined function (UDF).
# udf() defaults to a string return type when none is given, which would turn the
# scores into text; declaring 'double' keeps the sentiment column numeric.
sentiment = udf(apply_vader, returnType='double')

In [91]:
# Start (or reuse) the Spark session used for sentiment scoring.
spark = SparkSession.builder.appName('SentimentAnalysis').getOrCreate()

# The input looks like a pandas export (unnamed index column `_c0`). pandas quotes
# fields by doubling embedded quotes and keeps line breaks inside quoted fields.
# Spark's CSV defaults (backslash escape, single-line records) would split or
# garble such tweets, so both options are set explicitly.
df_raw = (
    spark.read
    .option('header', True)
    .option('multiLine', True)
    .option('escape', '"')
    .csv('/Users/christopherkindl/working/twitter_output_1.csv')
)

# Add one sentiment score per tweet and preview the result.
df_clean = df_raw.withColumn('sentiment', sentiment(df_raw['tweets']))
df_clean.show()

+---+--------------------+-------------------+-------------+---------+
|_c0|              tweets|               date|      station|sentiment|
+---+--------------------+-------------------+-------------+---------+
|  0|In #Newham today,...|2021-04-20 20:15:20|   Abbey Road|     0.62|
|  1|While there is ma...|2021-04-19 11:26:43|   Abbey Road|     0.42|
|  2|Just posted a pho...|2021-04-20 07:00:25|   Abbey Wood|      0.0|
|  3|Walking on the wi...|2021-04-21 16:20:23|Acton Central|      0.0|
|  4|Just posted a vid...|2021-04-19 17:03:53|Acton Central|      0.0|
+---+--------------------+-------------------+-------------+---------+



In [76]:
# Re-read the raw tweet export. Tweets may contain doubled quotes and line breaks
# inside quoted fields, which Spark's default CSV settings mis-parse, so the
# multi-line and quote-escape options are set explicitly.
df_raw = (
    spark.read
    .option('header', True)
    .option('multiLine', True)
    .option('escape', '"')
    .csv('/Users/christopherkindl/working/twitter_output_1.csv')
)

In [104]:
# Export through pandas to get a single CSV file; Spark's own CSV writer would
# create a folder at the target path instead.
header = ["tweets", "date", "station", "sentiment"]
(
    df_clean
    .toPandas()
    .to_csv(
        '/Users/christopherkindl/desktop/twitter_output_20.csv',
        columns=header,
        index=False,
    )
)

In [96]:
# Spark-native export including a header row; this produces a folder at the
# given path rather than a single file.
df_clean.write.csv('/Users/christopherkindl/desktop/twitter_output_6.csv', header=True)