### Graph

In [2]:
import findspark
findspark.init()
from pyspark import SparkContext
sc =SparkContext()
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf, lit
from pyspark.sql.types import IntegerType
from pyspark import SQLContext
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer
from pyspark.sql import SparkSession

sqlContext = SQLContext(sc)
spark = SparkSession \
    .builder \
    .appName("graph") \
    .getOrCreate()

In [3]:
# dataframe1 = sqlContext.read.json('hdfs://localhost:1234/user/tl2861/hw3/train.json')
df3 = sqlContext.read.json('../../data/AA/wiki_*')
df3.printSchema()

root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [4]:
# df3.createOrReplaceTempView("wikinews")
# sqlDF3 = spark.sql("select count(*) from wikinews")
# sqlDF3.show()

In [5]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
tokenizedData = regexTokenizer.transform(df3)
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filteredData = stopWordsRemover.transform(tokenizedData)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filteredData)
idf= IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
data1 = idfModel.transform(featurizedData)
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(data1)
data = data.sample(False, 0.2, 0)

In [6]:
# data1.show()
data.count()

4381

In [7]:
import numpy as np
feature = np.array(data.select('norm').collect())
# delete the 1 size axis
feature = np.squeeze(feature)
print(feature)

[[0.29313942 0.12286993 0.11253153 ... 0.30024093 0.30916789 0.29102504]
 [0.25695014 0.15955723 0.2045847  ... 0.2079407  0.24088874 0.12957297]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.15396869 0.24585243 0.15761632 ... 0.12015139 0.18558571 0.12478215]
 [0.34789535 0.19442797 0.17806861 ... 0.1385702  0.11007534 0.41939731]
 [0.37162815 0.25961438 0.09510807 ... 0.16916938 0.19597414 0.42165444]]


In [8]:
print(feature)

[[0.29313942 0.12286993 0.11253153 ... 0.30024093 0.30916789 0.29102504]
 [0.25695014 0.15955723 0.2045847  ... 0.2079407  0.24088874 0.12957297]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.15396869 0.24585243 0.15761632 ... 0.12015139 0.18558571 0.12478215]
 [0.34789535 0.19442797 0.17806861 ... 0.1385702  0.11007534 0.41939731]
 [0.37162815 0.25961438 0.09510807 ... 0.16916938 0.19597414 0.42165444]]


In [9]:
similarity = np.dot(feature, feature.T)

In [10]:
print(similarity)

[[1.         0.75924657 0.14473213 ... 0.7827979  0.78997464 0.82221378]
 [0.75924657 1.         0.37213542 ... 0.82217253 0.83299421 0.8331656 ]
 [0.14473213 0.37213542 1.         ... 0.05212743 0.24047359 0.33027206]
 ...
 [0.7827979  0.82217253 0.05212743 ... 1.         0.82863578 0.71298453]
 [0.78997464 0.83299421 0.24047359 ... 0.82863578 1.         0.86095263]
 [0.82221378 0.8331656  0.33027206 ... 0.71298453 0.86095263 1.        ]]


In [11]:
import pandas as pd
df_simi = pd.DataFrame(similarity[:,:])
display(df_simi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4371,4372,4373,4374,4375,4376,4377,4378,4379,4380
0,1.000000,0.759247,0.144732,0.846659,0.815170,0.868381,0.797274,0.874680,0.837733,0.718124,...,0.868419,0.773573,0.873615,0.848951,0.775661,0.793298,0.791183,0.782798,0.789975,0.822214
1,0.759247,1.000000,0.372135,0.868236,0.836681,0.840412,0.802626,0.807466,0.806171,0.739824,...,0.807354,0.842742,0.838713,0.823468,0.830315,0.913437,0.819932,0.822173,0.832994,0.833166
2,0.144732,0.372135,1.000000,0.282116,0.399806,0.180409,0.172982,0.277152,0.136045,0.245661,...,0.188525,0.237877,0.049633,0.070740,0.154470,0.135855,0.133318,0.052127,0.240474,0.330272
3,0.846659,0.868236,0.282116,1.000000,0.798356,0.827327,0.761359,0.851359,0.804365,0.763619,...,0.819097,0.762400,0.874849,0.851791,0.848623,0.822213,0.792446,0.828079,0.884410,0.811811
4,0.815170,0.836681,0.399806,0.798356,1.000000,0.836582,0.839750,0.840280,0.818905,0.782594,...,0.793909,0.683567,0.748721,0.838761,0.755696,0.823009,0.777358,0.777451,0.808041,0.811224
5,0.868381,0.840412,0.180409,0.827327,0.836582,1.000000,0.790675,0.878366,0.878086,0.757661,...,0.882400,0.738592,0.888397,0.851913,0.864455,0.898711,0.816614,0.822982,0.797851,0.822605
6,0.797274,0.802626,0.172982,0.761359,0.839750,0.790675,1.000000,0.851968,0.812083,0.755722,...,0.784025,0.707102,0.751801,0.785789,0.837975,0.832671,0.726030,0.838062,0.872038,0.887092
7,0.874680,0.807466,0.277152,0.851359,0.840280,0.878366,0.851968,1.000000,0.806316,0.748136,...,0.861662,0.765892,0.827233,0.774315,0.887335,0.818866,0.805765,0.767350,0.875446,0.884813
8,0.837733,0.806171,0.136045,0.804365,0.818905,0.878086,0.812083,0.806316,1.000000,0.840926,...,0.815466,0.771226,0.842634,0.880066,0.786948,0.810996,0.752734,0.794528,0.790092,0.874678
9,0.718124,0.739824,0.245661,0.763619,0.782594,0.757661,0.755722,0.748136,0.840926,1.000000,...,0.773428,0.747357,0.775097,0.815680,0.769788,0.781545,0.777792,0.703992,0.808531,0.728144


In [15]:
node = df3.select("title","url").toPandas()
node.insert(0,"id",node.index)
display(node)
# save node
node.to_csv("node.csv", sep=",",index = False, encoding="utf-8")

Unnamed: 0,id,title,url
0,0,BP report into Gulf of Mexico disaster lays bl...,https://en.wikinews.org/wiki?curid=204805
1,1,200 candles: Chileans celebrate country's Bice...,https://en.wikinews.org/wiki?curid=204818
2,2,New flotilla planned to set sail for Gaza Strip,https://en.wikinews.org/wiki?curid=204838
3,3,"Large gas main explodes in San Bruno, Californ...",https://en.wikinews.org/wiki?curid=205039
4,4,Air Zimbabwe pilots 'face dismissal' over pay ...,https://en.wikinews.org/wiki?curid=205165
5,5,Police may have killed some of the eight touri...,https://en.wikinews.org/wiki?curid=205171
6,6,Test article for dev use,https://en.wikinews.org/wiki?curid=205326
7,7,Another test article for the devs,https://en.wikinews.org/wiki?curid=205328
8,8,Japanese motorcylist Shoya Tomizawa dies aged ...,https://en.wikinews.org/wiki?curid=205340
9,9,"Up to ten reported dead, 50 injured after pipe...",https://en.wikinews.org/wiki?curid=205345


In [16]:
edge = df_simi.stack().reset_index()
edge.columns = ['src','dst','similarity']
display(edge)
edge = edge.loc[edge['similarity'] > 0.9]
edge = edge.loc[edge['src'] != edge['dst']]
display(edge)
edge.to_csv("edge.csv",sep = ",", index = False, encoding = "utf-8")

Unnamed: 0,src,dst,similarity
0,0,0,1.000000
1,0,1,0.759247
2,0,2,0.144732
3,0,3,0.846659
4,0,4,0.815170
5,0,5,0.868381
6,0,6,0.797274
7,0,7,0.874680
8,0,8,0.837733
9,0,9,0.718124


Unnamed: 0,src,dst,similarity
25,0,25,0.909871
69,0,69,0.910457
127,0,127,0.902974
130,0,130,0.901375
134,0,134,0.927580
165,0,165,0.906115
181,0,181,0.902831
308,0,308,0.906338
350,0,350,0.922003
392,0,392,0.913815
