In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import explode
from pyspark.sql import functions as F



from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer, IDF
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer


import sys
import numpy as np
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import os.path
%matplotlib inline
pd.set_option('precision',10)

import json
import time
#import simplejson as json
from datetime import datetime
from operator import add

In [2]:
## Helper function to keep track the run time of a spark ops.

def getTime(start):
    sec = time.time() - start
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    print('Spark operation takes - %d:%02d:%02d which is %d seconds in total' % (h,m,s,sec))

# Reading DATA

In [3]:
loading = time.time()
# full
#data_raw = sc.textFile('/mnt/66e695cd-1a0c-4e3b-9a50-55e01b788529/Tweet_Output/Sample')
data_raw = sc.textFile('/mnt/66e695cd-1a0c-4e3b-9a50-55e01b788529/Tweet_Output/small_sample')

data = data_raw.map(lambda line: json.loads(line))  ## change encoding next time!
sample = data.take(1)

getTime(loading)

Spark operation takes - 0:00:01 which is 1 seconds in total


In [4]:
sample

[{u'HashTag_Birthday': 1359737884.0,
  u'from_id': 87151732,
  u'from_user': u'ishiPTI',
  u'hashtag': u'',
  u'location': u'loc_dha_lahore_cantt_',
  u'mention': u'BushraShekhani',
  u'term': u'bushrashekhani yeh ab apke students hi sach bta skte hain p',
  u'tweet_id': 297312861586325504}]

# Turning to dataframe

In [5]:
## Define Dataframe schema.

loading = time.time()
schema = StructType([StructField('HashTag_Birthday', DoubleType(), False),
                     StructField('from_id', IntegerType(), False),
                     StructField('from_user', StringType(), False),
                     StructField('hashtag', StringType(), True),
                     StructField('location', StringType(), True),
                     StructField('mention', StringType(), True),
                     StructField('term', StringType(), True),
                     StructField('tweet_id', StringType(), True)                     
                    ])
df = sqlContext.createDataFrame(data, schema)
getTime(loading)

Spark operation takes - 0:00:03 which is 3 seconds in total


In [6]:
df.show(5)

+----------------+---------+----------+-------+--------------------+--------------+--------------------+------------------+
|HashTag_Birthday|  from_id| from_user|hashtag|            location|       mention|                term|          tweet_id|
+----------------+---------+----------+-------+--------------------+--------------+--------------------+------------------+
|   1.359737884E9| 87151732|   ishiPTI|       |loc_dha_lahore_ca...|BushraShekhani|bushrashekhani ye...|297312861586325504|
|   1.359737885E9|945063858|F5Everyday|       |  loc_brandon_ms_usa|       uziha06|rt uziha06 a man’...|297312865747083264|
|   1.359737885E9|231813876|     k30ji|       |            loc_kmkn|     belongfr7|    belongfr7 belong|297312865747103745|
|   1.359737885E9|404571997|0infelnity|       |                loc_|  konnichimaru|        konnichimaru|297312865751281664|
|   1.359737885E9|377993505|   nftnhni|       |      loc_singapore_|      asmooday|rt asmooday you w...|297312865772240896|
+-------

# Temperal Split

In [7]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import explode
from pyspark.sql import functions as F


term_tokenizer = Tokenizer(inputCol="hashtag", outputCol="each_hashtag")
hashtags_df = term_tokenizer.transform(df)

hashtag =  hashtags_df.select("tweet_id","HashTag_Birthday","each_hashtag")
hash_exploded = hashtag.withColumn('each_hashtag', explode('each_hashtag'))

In [8]:
## Define topical hashtag list
title = sqlContext.createDataFrame(\
[("soccer",1,["princessandgino","migikahitnaatkahitpa","royalmigiending","royalmigiendgame","thankyoumikayandgino","asahi","jipped","news","litbus_anime","ff","mink","lol","happybdayharrystyles","gameinsight","androidgames","teamheat","teamnosleep","android","supportlocalband"])],["topics","topical","hashtags"]\
                                  )

In [9]:
## Join hashtag DF with the original DF to obtain all topical tweets for a particular topic

title_exploded = title.withColumn('hashtags', explode('hashtags'))

Hashtag_set = hash_exploded.join(title_exploded,\
                                 hash_exploded.each_hashtag == title_exploded.hashtags,\
                                 "right").select(hash_exploded.tweet_id,\
                                                 hash_exploded.HashTag_Birthday,\
                                                 hash_exploded.each_hashtag)
## Right join to obtain all topical tweets.

In [10]:
loading = time.time()
Hashtag_set.show(3)
getTime(loading)

+------------------+----------------+------------+
|          tweet_id|HashTag_Birthday|each_hashtag|
+------------------+----------------+------------+
|297312958034350080|   1.359737907E9|     android|
|297312995808276480|   1.359737916E9|     android|
|297313004188467200|   1.359737918E9|     android|
+------------------+----------------+------------+
only showing top 3 rows

Spark operation takes - 0:01:54 which is 114 seconds in total


In [11]:

## Find out the "birthday", or the earliest appearing time of each hashtag. 
## (add an extra column of 1 to mark as topical, will be used in a join later)
Ordered_Hashtag_set = Hashtag_set.\
                      groupby("each_hashtag").\
                      agg({"Hashtag_Birthday": "min"}).\
                      orderBy('min(Hashtag_Birthday)', ascending=True).\
                      withColumn("topical", lit(1))


In [None]:
#forget about monotonic id, stick with pandas for small dataframe

In [12]:
## Find the total lenth of topical tweets.
loading = time.time()
time_span = Ordered_Hashtag_set.count()
getTime(loading)

Spark operation takes - 0:01:56 which is 116 seconds in total


In [13]:
# Get id of the corresponding time split (50% and 60%).

train_val_split_Ht = np.floor(np.multiply(time_span, 0.5)).astype(int)
val_test_split_Ht =  np.floor(np.multiply(time_span, 0.6)).astype(int)

In [14]:
# Converting to Pandas for random row access.

pd_Ordered_Hashtag_set = Ordered_Hashtag_set.toPandas()

In [15]:
# locate the timestamp of te 50% and 60% cutoff point. Will be used later to divide D.

train_val_time = pd_Ordered_Hashtag_set.iloc[train_val_split_Ht]['min(Hashtag_Birthday)']
val_test_time = pd_Ordered_Hashtag_set.iloc[val_test_split_Ht]['min(Hashtag_Birthday)']

In [16]:
# Split Hashtags into H_train, H_valid, H_test

train_hashtags = Ordered_Hashtag_set.select("each_hashtag", "topical").\
                                     where(col("min(Hashtag_Birthday)") <= train_val_time)
    
valid_hashtags = Ordered_Hashtag_set.select("each_hashtag", "topical").\
                                     where((col("min(Hashtag_Birthday)") > train_val_time) & (col("min(Hashtag_Birthday)") <= val_test_time))
    
test_hashtags = Ordered_Hashtag_set.select("each_hashtag", "topical").\
                                     where(col("min(Hashtag_Birthday)") > val_test_time )

In [None]:

loading = time.time()
### create dataframe of hashtag, inner join .
topical = ["gameinsight","androidgames","teamheat","teamnosleep","android","supportlocalband"]
Target = hash_exploded.select(hash_exploded.id, F.when(hash_exploded.each_hashtag.isin(topical), 1).otherwise(0).alias("relavant"))
topics = Target.groupBy(Target.id).agg(F.sum(Target.relavant).alias("topic_count")).orderBy("id")
Y = topics.select(topics.id, F.when(topics.topic_count > 0, 1).otherwise(0).alias("topical"))
Y.show(5)
getTime(loading)

In [None]:
#Dpreciated

Training = Train.join(Y, Train.id == Y.id, "inner"\
                     ).drop("id").select("from_user","hashtag","location",\
                                         "mention","term","topical").where(Y.topical == 1)

In [None]:
#Dpreciated

Training_v2 = Train.join(Y, Train.id == Y.id, "inner"\
                     ).drop("id").select("from_user","hashtag","location",\
                                         "mention","term","topical").where(Y.topical == 0).limit(3000)

In [None]:

Training = Train.join(Y, Train.id == Y.id, "inner"\
                     ).drop("id","HashTag_Birthday","from_id").select("from_user","hashtag","location",\
                                         "mention","term","topical").sampleBy("topical", fractions={0: 0.1, 1: 0.1}, seed=0)

In [None]:

Training = Train.join(Y, Train.id == Y.id, "inner"\
                     ).drop("id").drop("HashTag_Birthday").drop("from_id")

In [None]:
loading = time.time()

#Training.show(5)
Training_v2.show(5)
#result = Training.unionAll(Training_v2)
#result.show(5)
getTime(loading)

In [None]:
Training.count()

# Vectorizing user, hashtag, location, mention, term

In [17]:
term_tokenizer = Tokenizer(inputCol="term", outputCol="words")
term_remover = StopWordsRemover(inputCol=term_tokenizer.getOutputCol(), outputCol="filtered")
term_cv = CountVectorizer(inputCol=term_remover.getOutputCol(), outputCol="term_features", vocabSize=500, minTF= 2, minDF=1)
#term_pipeline = Pipeline(stages=[term_tokenizer, term_remover,term_cv])


hashtag_tokenizer = Tokenizer(inputCol="hashtag", outputCol="tags")
hashtag_cv = CountVectorizer(inputCol=hashtag_tokenizer.getOutputCol(), outputCol="hashtag_features", vocabSize=500, minTF = 2, minDF=1)
#hashtag_pipeline = Pipeline(stages=[hashtag_tokenizer,hashtag_cv])

mention_tokenizer = Tokenizer(inputCol="mention", outputCol="mentions")
mention_cv = CountVectorizer(inputCol=mention_tokenizer.getOutputCol(), outputCol="mention_features", vocabSize=100, minTF= 2, minDF=1)
#mention_pipeline = Pipeline(stages=[mention_tokenizer, mention_cv])

user_tokenizer = Tokenizer(inputCol="from_user", outputCol="users")
user_cv = CountVectorizer(inputCol=user_tokenizer.getOutputCol(), outputCol="user_features", vocabSize=100, minTF= 2, minDF=1)

loc_tokenizer = Tokenizer(inputCol="location", outputCol="locs")
loc_cv = CountVectorizer(inputCol=loc_tokenizer.getOutputCol(), outputCol="loc_features", vocabSize=100, minTF= 2, minDF=1)

pipeline = Pipeline(stages=[term_tokenizer,term_remover,term_cv,hashtag_tokenizer,hashtag_cv,mention_tokenizer, \
                            mention_cv,user_tokenizer, user_cv, loc_tokenizer, loc_cv])

In [18]:
loading = time.time()

model = pipeline.fit(df)
Train_X = model.transform(df)

getTime(loading)


Spark operation takes - 0:11:20 which is 680 seconds in total


In [19]:
feat = Train_X.select("tweet_id","term_features","hashtag_features","mention_features","user_features","loc_features")

In [20]:
feat.show(5)

+------------------+-------------+----------------+----------------+-------------+------------+
|          tweet_id|term_features|hashtag_features|mention_features|user_features|loc_features|
+------------------+-------------+----------------+----------------+-------------+------------+
|297312861586325504|  (500,[],[])|     (500,[],[])|     (100,[],[])|  (100,[],[])| (100,[],[])|
|297312865747083264|  (500,[],[])|     (500,[],[])|     (100,[],[])|  (100,[],[])| (100,[],[])|
|297312865747103745|  (500,[],[])|     (500,[],[])|     (100,[],[])|  (100,[],[])| (100,[],[])|
|297312865751281664|  (500,[],[])|     (500,[],[])|     (100,[],[])|  (100,[],[])| (100,[],[])|
|297312865772240896|  (500,[],[])|     (500,[],[])|     (100,[],[])|  (100,[],[])| (100,[],[])|
+------------------+-------------+----------------+----------------+-------------+------------+
only showing top 5 rows



In [21]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ["term_features","hashtag_features","mention_features","user_features","loc_features"], outputCol="features")
transformed = assembler.transform(Train_X).select("tweet_id","features","HashTag_Birthday")

# Train valid Test split

In [22]:
Train_ids = train_hashtags.join(Hashtag_set,\
                                 train_hashtags.each_hashtag == Hashtag_set.each_hashtag,\
                                 "inner").select(Hashtag_set.tweet_id,\
                                                 Hashtag_set.HashTag_Birthday,\
                                                 train_hashtags.topical)
## Right join to obtain all topical tweets.
Train_ids.show(4)

+------------------+----------------+-------+
|          tweet_id|HashTag_Birthday|topical|
+------------------+----------------+-------+
|297312958034350080|   1.359737907E9|      1|
|297312995808276480|   1.359737916E9|      1|
|297313004188467200|   1.359737918E9|      1|
|297313272619732995|   1.359737982E9|      1|
+------------------+----------------+-------+
only showing top 4 rows



In [24]:
Training_set = transformed.select("tweet_id","features").where(col("HashTag_Birthday") <= train_val_time)

Training_set_labled = Training_set.join(Train_ids, Training_set.tweet_id == Train_ids.tweet_id, "left").\
                           drop("tweet_id").\
                           select(Training_set.features, F.when(Train_ids.topical == 1, 1).otherwise(0).alias("topical"))

In [None]:
Training_set_labled.show(10)

In [None]:
### Depreciated!

Train_ids = hash_exploded.join(train_hashtags,\
                                 hash_exploded.each_hashtag == train_hashtags.each_hashtag,\
                                 "right").select(hash_exploded.id,\
                                                 title_exploded.topical)

Training = transformed.join(Train_ids, Train.tweet_id == Y.tweet_id, "inner"\
                     ).drop("id").select("from_user","hashtag","location",\
                                         "mention","term","topical").where(Y.topical == 1)


In [None]:
## Depreciated!

Training = transformed.join(Train_ids, Train.id == Y.id, "inner"\
                     ).drop("id").select("from_user","hashtag","location",\
                                         "mention","term","topical").where(Y.topical == 1)

In [None]:
train_count = df.groupby("hashtag").agg({"Hashtag_Birthday": "min"}).orderBy('min(Hashtag_Birthday)', ascending=True) #orderBy('HashTag_Birthday', ascending=True)

In [None]:
Hash_birth = train_count.select(col("hashtag"), col("min(Hashtag_Birthday)").cast(DecimalType(11)))

In [None]:
Hash_birth.count()

In [None]:
>>> df.agg({"age": "max"}).collect()
[Row(max(age)=5)]
>>> from pyspark.sql import functions as F
>>> df.agg(F.min(df.age)).collect()
[Row(min(age)=2)]

In [None]:
sorting = df.select("HashTag_Birthday").distinct().orderBy('HashTag_Birthday', ascending=True).withColumn("id", monotonically_increasing_id())

In [None]:
splitpoint = sorting.select("HashTag_Birthday").where(sorting.id == 1000)

In [None]:
splitpoint.show()

In [None]:
Spark operation takes - 0:02:29 which is 149 seconds in total for 1 day
Spark operation takes - 0:10:54 which is 654 seconds in total for 1 day full

Spark operation takes - 0:03:27 which is 207 seconds in total for 4 days
Spark operation takes - 0:34:52 which is 2092 seconds in total for 28 dyas

# Train logistic regression

In [None]:
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg

def as_old(v):
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return mllib_linalg.DenseVector(v.values)
    raise ValueError("Unsupported type {0}".format(type(v)))

In [None]:
from pyspark.sql import Row
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel


TrainingRDD=transformed.rdd.map(lambda row: LabeledPoint(row.topical, as_old(row.features)))
trainRDD,validRDD,testRDD=TrainingRDD.randomSplit([0.7,0.2,0.1])


In [None]:
trainRDD.take(3)

In [None]:
model = LogisticRegressionWithLBFGS.train(trainRDD)

# Evaluation

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

# Compute raw scores on the test set
predictionAndLabels = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

In [None]:
sc.version