### Graph

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
sc =SparkContext()
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf, lit
from pyspark.sql.types import IntegerType
from pyspark import SQLContext
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer
from pyspark.sql import SparkSession

sqlContext = SQLContext(sc)
spark = SparkSession \
    .builder \
    .appName("graph") \
    .getOrCreate()

In [2]:
# dataframe1 = sqlContext.read.json('hdfs://localhost:1234/user/tl2861/hw3/train.json')
df3 = sqlContext.read.json('../../data/AA/wiki_*')
df3 = df3.sample(False, 0.3, 42)
df3.printSchema()

root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [3]:
# df3.createOrReplaceTempView("wikinews")
# sqlDF3 = spark.sql("select count(*) from wikinews")
# sqlDF3.show()

In [4]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
tokenizedData = regexTokenizer.transform(df3)
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filteredData = stopWordsRemover.transform(tokenizedData)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filteredData)
idf= IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
data1 = idfModel.transform(featurizedData)
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(data1)
data = data.sample(False, 0.2, 0)

In [5]:
# data1.show()
data.count()

1303

In [6]:
import numpy as np
feature = np.array(data.select('norm').collect())
# delete the 1 size axis
feature = np.squeeze(feature)
print(feature)

[[0.         0.08541561 0.19625114 ... 0.47727619 0.06370498 0.04417505]
 [0.24487209 0.22203587 0.16324806 ... 0.31902895 0.39743903 0.32152958]
 [0.39469053 0.40082848 0.18418891 ... 0.22854146 0.02135338 0.17768532]
 ...
 [0.32347965 0.16245009 0.1492983  ... 0.10806204 0.26251142 0.30805705]
 [0.08737911 0.28522918 0.34951643 ... 0.26562882 0.07091022 0.14751416]
 [0.32334694 0.07036616 0.06466939 ... 0.11233842 0.1836826  0.25474264]]


In [7]:
print(feature)

[[0.         0.08541561 0.19625114 ... 0.47727619 0.06370498 0.04417505]
 [0.24487209 0.22203587 0.16324806 ... 0.31902895 0.39743903 0.32152958]
 [0.39469053 0.40082848 0.18418891 ... 0.22854146 0.02135338 0.17768532]
 ...
 [0.32347965 0.16245009 0.1492983  ... 0.10806204 0.26251142 0.30805705]
 [0.08737911 0.28522918 0.34951643 ... 0.26562882 0.07091022 0.14751416]
 [0.32334694 0.07036616 0.06466939 ... 0.11233842 0.1836826  0.25474264]]


In [8]:
similarity = np.dot(feature, feature.T)

In [9]:
print(similarity)

[[1.         0.61512723 0.57363609 ... 0.61035521 0.55019456 0.71634007]
 [0.61512723 1.         0.81351014 ... 0.90934705 0.75781345 0.7723351 ]
 [0.57363609 0.81351014 1.         ... 0.7763073  0.72313427 0.69482769]
 ...
 [0.61035521 0.90934705 0.7763073  ... 1.         0.75225772 0.87379146]
 [0.55019456 0.75781345 0.72313427 ... 0.75225772 1.         0.69754186]
 [0.71634007 0.7723351  0.69482769 ... 0.87379146 0.69754186 1.        ]]


In [10]:
import pandas as pd
df_simi = pd.DataFrame(similarity[:,:])
display(df_simi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1293,1294,1295,1296,1297,1298,1299,1300,1301,1302
0,1.000000,0.615127,0.573636,0.719247,0.617574,0.721159,0.653327,0.594321,0.605583,0.624361,...,0.562550,0.804780,0.673529,0.505983,0.549152,0.557301,0.777507,0.610355,0.550195,0.716340
1,0.615127,1.000000,0.813510,0.849005,0.856357,0.626851,0.917166,0.796633,0.814672,0.865663,...,0.876544,0.854368,0.843443,0.735599,0.812885,0.888838,0.928992,0.909347,0.757813,0.772335
2,0.573636,0.813510,1.000000,0.660083,0.820069,0.704701,0.901300,0.817336,0.780569,0.750711,...,0.831436,0.793173,0.842926,0.760074,0.845610,0.799932,0.852319,0.776307,0.723134,0.694828
3,0.719247,0.849005,0.660083,1.000000,0.837863,0.698090,0.815951,0.751154,0.794879,0.839105,...,0.741085,0.807473,0.753754,0.618506,0.695508,0.757023,0.832938,0.856683,0.709638,0.808422
4,0.617574,0.856357,0.820069,0.837863,1.000000,0.701870,0.883835,0.786721,0.757601,0.828855,...,0.831921,0.806332,0.824191,0.664024,0.690799,0.770795,0.869626,0.808214,0.825205,0.672949
5,0.721159,0.626851,0.704701,0.698090,0.701870,1.000000,0.759245,0.666855,0.547077,0.728618,...,0.590289,0.771969,0.773680,0.533730,0.684966,0.587383,0.757903,0.747601,0.566155,0.764876
6,0.653327,0.917166,0.901300,0.815951,0.883835,0.759245,1.000000,0.867920,0.840859,0.909863,...,0.907914,0.902975,0.919982,0.804620,0.835258,0.920652,0.921894,0.908254,0.859110,0.838041
7,0.594321,0.796633,0.817336,0.751154,0.786721,0.666855,0.867920,1.000000,0.804387,0.813344,...,0.726382,0.774277,0.775383,0.609773,0.688596,0.848817,0.827640,0.822892,0.811608,0.740143
8,0.605583,0.814672,0.780569,0.794879,0.757601,0.547077,0.840859,0.804387,1.000000,0.792455,...,0.847388,0.776043,0.706092,0.620840,0.635996,0.866752,0.798685,0.762306,0.715452,0.717794
9,0.624361,0.865663,0.750711,0.839105,0.828855,0.728618,0.909863,0.813344,0.792455,1.000000,...,0.845746,0.851077,0.796083,0.684963,0.759341,0.906050,0.838162,0.897515,0.796821,0.866360


In [11]:
node = df3.select("title","url").toPandas()
node.insert(0,"id",node.index)
display(node)
# save node
node.to_csv("node.csv", sep=",",index = False, encoding="utf-8")

Unnamed: 0,id,title,url
0,0,Test article for dev use,https://en.wikinews.org/wiki?curid=205326
1,1,Another test article for the devs,https://en.wikinews.org/wiki?curid=205328
2,2,Copenhagen hotel explosion may have been terro...,https://en.wikinews.org/wiki?curid=205355
3,3,Nokia appoints Microsoft Business Division Hea...,https://en.wikinews.org/wiki?curid=205528
4,4,Six die in US-Iraqi raid in Fallujah,https://en.wikinews.org/wiki?curid=206241
5,5,Tulsa media erroneously reports San Diego Comi...,https://en.wikinews.org/wiki?curid=206259
6,6,Victoria,https://en.wikinews.org/wiki?curid=206369
7,7,Continental and United Airlines shareholders a...,https://en.wikinews.org/wiki?curid=206718
8,8,Collingwood and St. Kilda draw 2010 AFL Grand ...,https://en.wikinews.org/wiki?curid=207190
9,9,"Leonard Skinner, namesake of rock group Lynyrd...",https://en.wikinews.org/wiki?curid=207432


In [12]:
edge = df_simi.stack().reset_index()
edge.columns = ['src','dst','similarity']
display(edge)
edge = edge.loc[edge['similarity'] > 0.95]
edge = edge.loc[edge['src'] != edge['dst']]
display(edge)
edge.to_csv("edge.csv",sep = ",", index = False, encoding = "utf-8")

Unnamed: 0,src,dst,similarity
0,0,0,1.000000
1,0,1,0.615127
2,0,2,0.573636
3,0,3,0.719247
4,0,4,0.617574
5,0,5,0.721159
6,0,6,0.653327
7,0,7,0.594321
8,0,8,0.605583
9,0,9,0.624361


Unnamed: 0,src,dst,similarity
1583,1,280,0.956902
2300,1,997,0.956527
2539,1,1236,0.956746
7875,6,57,0.962319
7900,6,82,0.966172
7949,6,131,0.960576
8077,6,259,0.957848
8098,6,280,0.966492
8157,6,339,0.956047
8163,6,345,0.956438
