In [1]:
# Import all necessary libraries and setup the environment for matplotlib
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, VectorAssembler
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *
import pyspark.sql.functions as f
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def blank_as_null(x):
    return when(col(x) != "", col(x)).otherwise(None)

In [3]:
spark = SparkSession \
    .builder \
    .appName("Assignment 2") \
    .getOrCreate()
tweeets_data = spark.read.option('multiline','true').json('tweets.json')

In [4]:
tweets_agg = tweeets_data.groupby("user_id").agg(f.concat_ws(",", f.collect_list(tweeets_data.id)).alias('agg_tweets'),
                                    f.concat_ws(",", f.collect_list(tweeets_data.retweet_user_id)).alias('agg_retweet_users'),
                                    f.concat_ws(",", f.collect_list(tweeets_data.retweet_id)).alias('agg_retweets'),
                                    f.concat_ws(",", f.collect_list(tweeets_data.replyto_user_id)).alias('agg_reply_users'),
                                    f.concat_ws(",", f.collect_list(tweeets_data.replyto_id)).alias('agg_replies'))

In [5]:
tweets_agg = tweets_agg.withColumn("agg_reply_users", blank_as_null("agg_reply_users"))
tweets_agg = tweets_agg.withColumn("agg_retweet_users", blank_as_null("agg_retweet_users"))
tweets_agg = tweets_agg.withColumn("agg_retweets", blank_as_null("agg_retweets"))
tweets_agg = tweets_agg.withColumn("agg_replies", blank_as_null("agg_replies"))

In [6]:
tweets_processed = tweets_agg.select('*',concat_ws(',','agg_retweets','agg_replies').alias('agg_tweet_respond'))

In [7]:
tokenizer = Tokenizer(inputCol='agg_tweet_respond',
    outputCol="vectors")
tweets_vectors = tokenizer.transform(tweets_processed)

In [8]:
hashingTF = HashingTF(inputCol="vectors", outputCol="tf")
tf = hashingTF.transform(tweets_vectors)

idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
tfidf = idf.transform(tf)

In [9]:
normalizer = Normalizer(inputCol="feature", outputCol="norm")
data = normalizer.transform(tfidf)

In [10]:
selected_id = 15466159
tweets_user_filtered = data.where(f'user_id = {selected_id}')
compare_vector = tweets_user_filtered.first()['norm']

In [11]:
def cos_sim(a,b=compare_vector):
    return float(a.dot(b) / (a.norm(2) * b.norm(2)))

In [12]:
cos_function = udf(cos_sim, FloatType())

In [13]:
data = data.withColumn("CosineSim",cos_function('norm'))

In [33]:
data.show()

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|   agg_tweet_respond|             vectors|                  tf|             feature|                norm|CosineSim|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|           null|               null| 1390027514332991489|[1390027514332991...|(262144,[23124],[...|(262144,[23124],[...|(262144,[23124],[...|      1.0|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|           null|      

In [14]:
data.filter(data.CosineSim > 0).show(truncate=False)

+-------------------+-------------------+-----------------+-------------------+---------------+-----------+-------------------+---------------------+----------------------+-------------------------------------+----------------------+---------+
|user_id            |agg_tweets         |agg_retweet_users|agg_retweets       |agg_reply_users|agg_replies|agg_tweet_respond  |vectors              |tf                    |feature                              |norm                  |CosineSim|
+-------------------+-------------------+-----------------+-------------------+---------------+-----------+-------------------+---------------------+----------------------+-------------------------------------+----------------------+---------+
|15466159           |1390054010502225922|807095           |1390027514332991489|null           |null       |1390027514332991489|[1390027514332991489]|(262144,[23124],[1.0])|(262144,[23124],[1.4383260192134937])|(262144,[23124],[1.0])|1.0      |
|32947971           |139

In [43]:
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="vectors", outputCol="features", vocabSize=10000, minDF=3.0)

tweets_cv = cv.fit(tweets_vectors)

result = tweets_cv.transform(tweets_vectors)
result.show(truncate=True)

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+-----------------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|   agg_tweet_respond|             vectors|         features|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+-----------------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|           null|               null| 1390027514332991489|[1390027514332991...|  (179,[0],[1.0])|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|           null|               null| 1390023742194061312|[1390023742194061...|  (179,[1],[1.0])|
| 30616018| 1390052446920220675|           931571402| 1390026843068239874|           null|               null| 1390026843068239874|[1390026843068

In [39]:
# Input data: Each row is a bag of words with a ID.
df = spark.createDataFrame([
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" "))
], ["id", "words"])

# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)

model = cv.fit(df)

result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [34]:
tweets_vectors.show()

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|   agg_tweet_respond|             vectors|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|           null|               null| 1390027514332991489|[1390027514332991...|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|           null|               null| 1390023742194061312|[1390023742194061...|
| 30616018| 1390052446920220675|           931571402| 1390026843068239874|           null|               null| 1390026843068239874|[1390026843068239...|
| 32947971| 1390058861415997442|              807095| 1390027514332991489|        

In [15]:
from pyspark.ml.feature import Word2Vec


In [16]:
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="vectors", outputCol="model")
model = word2Vec.fit(tweets_vectors)

In [31]:
test = model.getVectors().join(tweets_vectors, col("word") == col("agg_tweet_respond"))

In [32]:
tweets_vectors.show()

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|   agg_tweet_respond|             vectors|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|           null|               null| 1390027514332991489|[1390027514332991...|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|           null|               null| 1390023742194061312|[1390023742194061...|
| 30616018| 1390052446920220675|           931571402| 1390026843068239874|           null|               null| 1390026843068239874|[1390026843068239...|
| 32947971| 1390058861415997442|              807095| 1390027514332991489|        

In [19]:
import numpy as np
def cos_sim_w2v(a,b=compare_vector):
    return float(np.dot(a,b)/(np.linalg.norm(a)* np.linalg.norm(b)))
cos_function_w2v = udf(cos_sim_w2v, ArrayType(StringType()))

In [27]:
def apply_w2v(row):
    word2vec = Word2Vec()
    return word2vec.fit(row)
w2v_function = udf(apply_w2v,ArrayType(FloatType()))

In [30]:

test = tweets_vectors.withColumn("w2v",w2v_function('agg_tweet_respond'))

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Assignment 2, master=local[*]) created by getOrCreate at <ipython-input-3-0516e3b5d215>:1 

In [29]:
test.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-27-60d6fa5b7a59>", line 2, in apply_w2v
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/__init__.py", line 114, in wrapper
    return func(self, **kwargs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/feature.py", line 4733, in __init__
    self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/wrapper.py", line 62, in _new_java_obj
    java_obj = _jvm()
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/ml/util.py", line 38, in _jvm
    raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")
AttributeError: Cannot load _jvm from SparkContext. Is SparkContext initialized?


In [242]:
selected_id = 202170318
compare_vector = test.where(f'user_id = {selected_id}')
compare_vector = compare_vector.first()['vector']

TypeError: 'NoneType' object is not subscriptable

In [None]:
from pyspark.mllib.feature import Word2Vec

word2vec = Word2Vec()
model = word2vec.fit(inp)

synonyms = model.findSynonyms('1', 5)


In [268]:
test

DataFrame[user_id: bigint, agg_tweets: string, agg_retweet_users: string, agg_retweets: string, agg_reply_users: string, agg_replies: string, agg_tweet_respond: string, vectors: array<string>, CosineSim: array<string>]

In [269]:
test.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-228-07a1b5cd5b60>", line 3, in cos_sim_w2v
  File "<__array_function__ internals>", line 5, in dot
TypeError: Cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'


In [None]:
test = test.withColumn("CosineSim",cos_function_w2v('vector'))

In [214]:
cos_sim(test.select('vector'),compare_vector)

AttributeError: 'DataFrame' object has no attribute 'dot'

In [98]:
model.getVectors().filter(model.getVectors().word == '1390027514332991489').show(truncate=False)

+-------------------+---------------------------------------------------------------------------------------------------------+
|word               |vector                                                                                                   |
+-------------------+---------------------------------------------------------------------------------------------------------+
|1390027514332991489|[-0.0046485066413879395,0.078712597489357,-0.09231849759817123,-0.02973146364092827,-0.03397924825549126]|
+-------------------+---------------------------------------------------------------------------------------------------------+



In [86]:
compare_vector[0]

'1390027514332991489'

In [99]:
model.findSynonymsArray(compare_vector,5)

[('1390033532995751939', 0.9623203873634338),
 ('1390037786573508608', 0.8075522184371948),
 ('1390039341431361539', 0.738990306854248),
 ('1390048625997357056', 0.712664008140564),
 ('1390039279422910471', 0.6842443943023682)]

In [181]:
tweets_vectors.show(truncate=False)

+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+---------------+-------------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|user_id  |agg_tweets                                                                                         |agg_retweet_users                             |agg_retweets                                                                                       |agg_reply_users|agg_replies        |agg_tweet_respond                                                                                  |vectors                                                                                              |
+-----

In [83]:
sent = ("a b " * 100 + "a c " * 10).split(" ")
doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model")
word2Vec.setMaxIter(10)
word2Vec.getMaxIter()
word2Vec.clear(word2Vec.maxIter)
model = word2Vec.fit(doc)
model.getMinCount()
model.setInputCol("sentence")
model.getVectors().show()


model.findSynonymsArray("a", 2)
from pyspark.sql.functions import format_number as fmt
model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show()

+----+--------------------+
|word|              vector|
+----+--------------------+
|   a|[0.09511678665876...|
|   b|[-1.2028766870498...|
|   c|[0.30153277516365...|
+----+--------------------+

+----+----------+
|word|similarity|
+----+----------+
|   b|   0.01586|
|   c|  -0.56808|
+----+----------+



In [84]:
model.findSynonymsArray()

TypeError: findSynonymsArray() missing 2 required positional arguments: 'word' and 'num'

In [271]:
# imports we'll need
import numpy as np
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *

# function to generate a random Spark dense vector
def random_dense_vector(length=10):
    return Vectors.dense([float(np.random.random()) for i in range(length)])

# create a random static dense vector
static_vector = random_dense_vector()

# create a random DF with dense vectors in column
df = spark.createDataFrame([[random_dense_vector()] for x in range(10)], ["myCol"])
df.limit(3).toPandas()

# write our UDF for cosine similarity
def cos_sim(a,b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# apply the UDF to the column
df = df.withColumn("coSim", udf(cos_sim, FloatType())(col("myCol"), array([lit(v) for v in static_vector])))
df.limit(10).toPandas()

Unnamed: 0,myCol,coSim
0,"[0.7405392724490619, 0.8125921027441317, 0.206...",0.681383
1,"[0.8263547949464041, 0.6992740140918257, 0.148...",0.615073
2,"[0.38942154605217305, 0.6725820219955552, 0.13...",0.829424
3,"[0.5405191700628009, 0.7358844818469981, 0.721...",0.71323
4,"[0.38648107561266953, 0.4678309822785245, 0.78...",0.84857
5,"[0.6461375360829296, 0.8339462255566216, 0.490...",0.627346
6,"[0.3755546423826457, 0.4375009033824063, 0.823...",0.908024
7,"[0.6071775077583399, 0.801368943313297, 0.6150...",0.836216
8,"[0.9774961997948872, 0.421262444193812, 0.8469...",0.694931
9,"[0.9683114926983347, 0.5329209586422137, 0.454...",0.750911


In [273]:
df.show()

+--------------------+----------+
|               myCol|     coSim|
+--------------------+----------+
|[0.74053927244906...|0.68138266|
|[0.82635479494640...| 0.6150727|
|[0.38942154605217...| 0.8294242|
|[0.54051917006280...| 0.7132299|
|[0.38648107561266...| 0.8485704|
|[0.64613753608292...|0.62734604|
|[0.37555464238264...|0.90802354|
|[0.60717750775833...| 0.8362161|
|[0.97749619979488...| 0.6949311|
|[0.96831149269833...| 0.7509105|
+--------------------+----------+



In [257]:
array(tweets_vectors.select('vectors'))

AttributeError: 'DataFrame' object has no attribute '_get_object_id'

In [242]:
tweets_user_filtered.select('vectors').show()

+--------------------+
|             vectors|
+--------------------+
|[1390071341894750...|
+--------------------+



In [243]:
tweets_vectors.show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/Users/christiansullivan/opt/anaconda3/lib/python3.8/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-239-8f41def54240>", line 2, in cos_sim
  File "<__array_function__ internals>", line 5, in dot
ValueError: data type must provide an itemsize


In [None]:
202170318

In [231]:
tweets_vectors.show()

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|   agg_tweet_respond|             vectors|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+--------------------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|           null|               null| 1390027514332991489|[1390027514332991...|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|           null|               null| 1390023742194061312|[1390023742194061...|
| 30616018| 1390052446920220675|           931571402| 1390026843068239874|           null|               null| 1390026843068239874|[1390026843068239...|
| 32947971| 1390058861415997442|              807095| 1390027514332991489|        

In [198]:
tweets_processed.where('agg_reply_users is not null and agg_retweet_users is not null').show(truncate=False)

+-------------------+-------------------------------------------------------------------------------+------------------+---------------------------------------+-------------------+---------------------------------------+----------------------------------+
|user_id            |agg_tweets                                                                     |agg_retweet_users |agg_retweets                           |agg_reply_users    |agg_replies                            |agg_tweet_respond_users           |
+-------------------+-------------------------------------------------------------------------------+------------------+---------------------------------------+-------------------+---------------------------------------+----------------------------------+
|16797082           |1390047886830968841,1390037747751137282                                        |947605221416538112|1390046356732104709                    |362105466          |1390037208447471620                    |947605221416

In [None]:
tweets_processed

In [127]:
tokenizer = Tokenizer(inputCol='agg_retweet_users',
    outputCol="words")
tweets_vectors = tokenizer.transform(tweets_agg)

In [128]:
tweets_vectors.show()

+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+
|  user_id|          agg_tweets|   agg_retweet_users|        agg_retweets|agg_reply_users|        agg_replies|               words|
+---------+--------------------+--------------------+--------------------+---------------+-------------------+--------------------+
| 15466159| 1390054010502225922|              807095| 1390027514332991489|               |                   |            [807095]|
| 19652471| 1390061372604514304|            26574283| 1390023742194061312|               |                   |          [26574283]|
| 30616018| 1390052446920220675|           931571402| 1390026843068239874|               |                   |         [931571402]|
| 32947971| 1390058861415997442|              807095| 1390027514332991489|               |                   |            [807095]|
| 33868781| 1390071486094905344|                    |                    |  

In [100]:
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="words", outputCol="tf")
tf = hashingTF.transform(tweets_vectors)

idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
tfidf = idf.transform(tf)

In [103]:
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="feature", outputCol="norm")
data = normalizer.transform(tfidf)

In [105]:
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
mat = IndexedRowMatrix(
    data.select("user_id", "norm")\
        .rdd.map(lambda row: IndexedRow(row.user_id, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

KeyboardInterrupt: 

In [102]:
tfidf.show(truncate = False)

+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------+---------------+-------------------+------------------------------------------------+-----------------------+--------------------------------------+
|user_id  |agg_tweets                                                                                         |agg_retweet_users                             |agg_retweets                                                                                       |agg_reply_users|agg_replies        |words                                           |tf                     |feature                               |
+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------+----------------------------

In [52]:
tweeets_data.select(col('user_mentions').cast('string')).show()

+--------------------+
|       user_mentions|
+--------------------+
| [{807095, [3, 11]}]|
|[{380648579, [3, ...|
|[{191807697, [3, ...|
|[{15115280, [3, 1...|
| [{807095, [3, 11]}]|
|[{20402945, [3, 8...|
|[{26574283, [3, 1...|
|                null|
| [{807095, [3, 11]}]|
|                null|
| [{807095, [3, 11]}]|
|[{15115280, [3, 1...|
|[{26574283, [3, 1...|
|[{36326893, [3, 1...|
|[{26574283, [3, 1...|
|[{3094649957, [3,...|
|[{846411464885747...|
|[{15250661, [119,...|
|[{7788062, [3, 11]}]|
|[{3094649957, [3,...|
+--------------------+
only showing top 20 rows



In [None]:
tweeets_data

In [48]:
tokenizer = Tokenizer(inputCol='user_mentions',
    outputCol="words")
tweets_vectors = tokenizer.transform(tweeets_data)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(tweets_vectors)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select('features').show()

IllegalArgumentException: requirement failed: Input type must be string type but got array<struct<id:bigint,indices:array<bigint>>>.

In [49]:
assembler = VectorAssembler(inputCols=tweeets_data.columns,
    outputCol="features")
tweets_vectors = assembler.transform(tweeets_data).select("features")
tweets_vectors.show(2)

IllegalArgumentException: Data type string of column created_at is not supported.
Data type array<struct<indices:array<bigint>,text:string>> of column hash_tags is not supported.
Data type string of column text is not supported.
Data type array<struct<id:bigint,indices:array<bigint>>> of column user_mentions is not supported.

In [191]:
tweets_processed = tweets_agg.select('*', 
                                     when((tweets_agg.agg_retweet_users.isNull()) & (length(tweets_agg.agg_reply_users) > 0), 
                                          concat(lit(''),tweets_agg.agg_reply_users))\
                                     .when((tweets_agg.agg_retweet_users.isNotNull()) & (length(tweets_agg.agg_reply_users) == 0), 
                                          concat(lit(''),tweets_agg.agg_retweet_users))\
                                     .otherwise(concat(tweets_agg.agg_retweet_users,tweets_agg.agg_reply_users))
                                     .alias('agg_tweet_respond_users'))