In [18]:
!pip install emoji

Collecting emoji
  Downloading emoji-1.2.0-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 3.8 MB/s eta 0:00:01
[?25hInstalling collected packages: emoji
Successfully installed emoji-1.2.0
You should consider upgrading via the '/Users/christopherkindl/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [64]:
# import pyspark modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lower, col, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql.types import StructType, ArrayType, StringType
#import emoji #requires pip install

In [2]:
# start session
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

In [81]:
# read data
df_raw = spark.read.option("header", True).csv("/Users/christopherkindl/Desktop/twitter_results.csv")

In [35]:
# lowercase text
df_raw = df_raw.select(lower(col('tweets')).alias('tweets'))

In [4]:
# create rdd 
df_rdd=df_raw.rdd

In [None]:
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_clean.withColumn("tweets_clean", stemmer_udf("tweets_clean")).select("tweets_clean", "date", "station")

In [83]:
df_raw.write.mode("overwrite").csv("/Users/christopherkindl/working/london-housing-webapp/test-wed.tsv")

In [110]:

df_raw_2 = spark.read.option("header", True).csv("/Users/christopherkindl/working/london-housing-webapp/test-wed.tsv", sep="\t")

In [111]:
df_raw_2.show()

+--------------------+--------------------+-------------------+---------------+
|                 _c0|              tweets|               date|        station|
+--------------------+--------------------+-------------------+---------------+
|                   0|@noordinarypark @...|2021-04-27 07:30:07|     Abbey Road|
|                   1|@Clarecurtislino ...|2021-04-26 07:49:34|     Abbey Road|
|                   2|In #Newham today,...|2021-04-20 20:15:20|     Abbey Road|
|                   3|Just posted a pho...|2021-04-20 07:00:25|     Abbey Wood|
|                   4|Walking on the wi...|2021-04-21 16:20:23|  Acton Central|
|                   5|Just posted a vid...|2021-04-19 17:03:53|  Acton Central|
|                   6|Walking on the wi...|2021-04-21 16:20:23|Acton Main Line|
|                   7|"Click the link i...|2021-04-27 12:33:20|     Acton Town|
|                   8|Can you recommend...|2021-04-25 15:43:15|     Acton Town|
|                   9|If you're looking.

In [115]:
#df.to_parquet('/Users/christopherkindl/Desktop/df.parquet.gzip', compression='gzip')
input_loc = '/Users/christopherkindl/Desktop/df.parquet.gzip'
df_raw_3 = spark.read.option("header", True).parquet(input_loc, compression='gzip')

In [116]:
df_raw_3.show(25)

+--------------------+-------------------+----------------+
|              tweets|               date|         station|
+--------------------+-------------------+----------------+
|@noordinarypark @...|2021-04-27 08:30:07|      Abbey Road|
|@Clarecurtislino ...|2021-04-26 08:49:34|      Abbey Road|
|In #Newham today,...|2021-04-20 21:15:20|      Abbey Road|
|Just posted a pho...|2021-04-20 08:00:25|      Abbey Wood|
|Walking on the wi...|2021-04-21 17:20:23|   Acton Central|
|Just posted a vid...|2021-04-19 18:03:53|   Acton Central|
|Walking on the wi...|2021-04-21 17:20:23| Acton Main Line|
|Click the link in...|2021-04-27 13:33:20|      Acton Town|
|Can you recommend...|2021-04-25 16:43:15|      Acton Town|
|If you're looking...|2021-04-22 13:15:30|      Acton Town|
|You want to work ...|2021-04-20 13:54:59|      Acton Town|
|This is what #daw...|2021-04-20 09:54:44|      Acton Town|
|First time back i...|2021-04-27 12:54:44|         Aldgate|
|St Dunstan-in-the...|2021-04-27 11:12:3

In [73]:
# lowercase text
#df_clean = df_raw.select('id', (lower(regexp_replace('text', "[^a-zA-Z\\s]", "")).alias('text')))
df_lower = df_raw.select("date", "station", (lower(col("tweets")).alias('tweets')))

# tokenize text
tokenizer = Tokenizer(inputCol="tweets", outputCol="tweets_token")
df_tokens = tokenizer.transform(df_lower).select("tweets_token", "date", "station")

# remove stop words
#stopwordList = ["https://","london"]

remover = StopWordsRemover(inputCol="tweets_token", outputCol="tweets_sw_removed")#, stopWords=stopwordList)
df_clean = remover.transform(df_tokens).select("tweets_sw_removed", "date", "station")

# stemming
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_clean.withColumn("tweets", stemmer_udf("tweets_sw_removed")).select("tweets", "date", "station")

In [77]:
df_stemmed.show()

+--------------------+--------------------+-------------------+
|              tweets|                date|            station|
+--------------------+--------------------+-------------------+
|[#newham, today,,...| 2021-04-20 20:15:20|         Abbey Road|
|[massiv, upheav, ...| 2021-04-19 11:26:43|         Abbey Road|
|[post, photo, @, ...| 2021-04-20 07:00:25|         Abbey Wood|
|[walk, wild, side...| 2021-04-21 16:20:23|      Acton Central|
|[post, video, @, ...| 2021-04-19 17:03:53|      Acton Central|
|[walk, wild, side...| 2021-04-21 16:20:23|    Acton Main Line|
|[london,, back,, ...|                null|               null|
|[#london, #restau...| 2021-04-17 17:18:34|    Acton Main Line|
|["join, intelsat,...| via th… https://...|2021-04-23 10:14:21|
|["starbuck, look,...| 2021-04-23 09:03:37|         Acton Town|
|[look, work, #lon...| 2021-04-22 12:15:30|         Acton Town|
|[want, work, inte...| 2021-04-20 12:54:59|         Acton Town|
|[#dawn, look, lik...| 2021-04-20 08:54:

In [79]:
# run bag of words
bow0 = df_raw.rdd\
    .filter(lambda x: x.tweets)\
    .map( lambda x: x.tweets.replace(',',' ').replace('.',' ').replace('-',' ').lower())\
    .flatMap(lambda x: x.split())\
    .map(lambda x: (x, 1))


In [80]:
bow0.reduceByKey(lambda x,y:x+y).take(20)

[('in', 54),
 ('#newham', 1),
 ('today', 1),
 ('was', 1),
 ('great', 3),
 ('to', 11),
 ('meet', 1),
 ('with', 6),
 ('the', 29),
 ('local', 1),
 ('police', 1),
 ('and', 11),
 ('have', 2),
 ('a', 29),
 ('walkabout', 1),
 ('community', 1),
 ('speak', 1),
 ('chief…', 1),
 ('https://t', 112),
 ('co/6vnnlrqj4o', 1)]

In [22]:
# csv output test
df_raw.write.mode("overwrite").csv("/Users/christopherkindl/Desktop/wed_test.csv")

In [6]:
# run bag of words
bow0 = df_raw.rdd\
    .filter(lambda x: x.tweets)\
    .map( lambda x: x.tweets.replace(',',' ').replace('.',' ').replace('-',' ').lower())\
    .flatMap(lambda x: x.split())\
    .map(lambda x: (x, 1))

In [8]:
bow0.reduceByKey(lambda x,y:x+y).take(20)

[('in', 54),
 ('#newham', 1),
 ('today', 1),
 ('was', 1),
 ('great', 3),
 ('to', 11),
 ('meet', 1),
 ('with', 6),
 ('the', 29),
 ('local', 1),
 ('police', 1),
 ('and', 11),
 ('have', 2),
 ('a', 29),
 ('walkabout', 1),
 ('community', 1),
 ('speak', 1),
 ('chief…', 1),
 ('https://t', 112),
 ('co/6vnnlrqj4o', 1)]

In [86]:
def func(**kwargs):
    return print('test')

In [87]:
func()

test


In [88]:
str(func)

'<function func at 0x11723ee50>'

In [89]:
test = str(func)

In [90]:
test

'<function func at 0x11723ee50>'

In [94]:
from datetime import datetime

In [91]:
step = func.__name__

In [95]:
step = datetime.now()

In [96]:
step

datetime.datetime(2021, 4, 27, 18, 40, 30, 216184)

In [None]:
d = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
d
datetime.datetime(2016, 1, 4, 17, 31, 32, 976902)
>>> print(d)
2016-01-04 17:31:32.976902
>>> print(d.strftime('%Y-%m-%d %H:%M:%S'))
2016-01-04 17:31:32

In [97]:
d = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [98]:
d

'2021-04-27 19:04:38'

In [99]:
datetime.now().strftime('%Y%m%d')

'20210427'

In [100]:
bucket_name = 'london-housing-webapp'
path = 'final/test.csv'

source = 's3://' + bucket_name + path
source

's3://london-housing-webappfinal/test.csv'

In [103]:
obj = []
obj.append([job_nr=datetime.now().strftime('%Y%m%d'),
                timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S')])

SyntaxError: invalid syntax (<ipython-input-103-31b479fd9c81>, line 2)

In [104]:
job_nr=datetime.now().strftime('%Y%m%d')

In [105]:
timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [106]:
num = 0
job_nr = num + 1

In [107]:
job_nr

1