In [1]:
import findspark
findspark.init('C:/spark')

In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
                          .master("local")\
                          .appName('classifier')\
                          .getOrCreate()
sc=spark.sparkContext

In [3]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
import nltk
from nltk.corpus import stopwords

In [4]:
sqlContext = SQLContext(sc)

In [5]:
fas_df = spark.read.text('Data/Fashion/*')
fas_df = fas_df.withColumn("category",lit("Fashion"))

tech_df = spark.read.text('Data/Technology/*')
tech_df = tech_df.withColumn("category",lit("Technology"))

sci_df = spark.read.text('Data/Science/*')
sci_df = sci_df.withColumn("category",lit("science"))

mov_df = spark.read.text('Data/Movie/*')
mov_df = mov_df.withColumn("category",lit("Movie"))


merge_df1 = fas_df.union(tech_df)
merge_df2 = merge_df1.union(sci_df)
merge_df3 = merge_df2.union(mov_df)

In [6]:
data = merge_df3.select([column for column in merge_df3.columns])


In [7]:
data.show(5)


+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [9]:
Fas_udf = spark.read.text('Data/unknown/Fashion/*')
Fas_udf = Fas_udf.withColumn("category",lit("Fashion"))

science_udf = spark.read.text('Data/unknown/science/*')
science_udf = science_udf.withColumn("category",lit("science"))

tech_udf = spark.read.text('Data/unknown/technology/*')
tech_udf = tech_udf.withColumn("category",lit("technology"))

movie_udf = spark.read.text('Data/unknown/Movie/*')
movie_udf = movie_udf.withColumn("category",lit("Movie"))

merge_udf1 = Fas_udf.union(science_udf)
merge_udf2 = merge_udf1.union(tech_udf)
merge_udf3 = merge_udf2.union(movie_udf)

unknown_data = merge_udf3.select([column for column in merge_udf3.columns])
unknown_data.show(5)

+--------------------+--------+
|               value|category|
+--------------------+--------+
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
|   Sections SEARC...| Fashion|
+--------------------+--------+
only showing top 5 rows



In [9]:
regexTokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prajw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
add_stopwords=nltk.corpus.stopwords.words('english')
add_stopwords_1 = ["nytimes","com","sense","day","common","business","todays","said","food","review","sunday","letters","politics","events","terms","services","years","contributors","companies","listings","applications","tax","trump","president","contributing","make","think","woman","federal","called","system","found","american","sale","headline","arts","times","subscriptions","choices","privacy","take","jobs","books","account","accounts","television","nyc","writers","multimedia","journeys","editorials","photography","automobiles","paper","city","tool","sports","weddings","columnists","contribution","even","nyt","obituary","state","travel","advertise","pm","street","go","corrections","saturday","company","dance","states","real","movies","estate","percent","music","tech","living","science","fashion","please","opinion","art","new","york","time","u","wa","reading","ha","video","image","photo","credit","edition","magazine","oped","could","crossword","mr","term","feedback","index","get","also","b","help","year","health","united","education","week","think","guide","event","two","first","subscription","service","cut","is","nytimescom","section","sections","Sections","Home","home","Search","search","Skip","skip","content","navigation","View","view","mobile","version","Subscribe","subscribe","Now","now","Log","log","In","in","setting","settings","Site","site","Loading","loading","article","next","previous","Advertisement","ad","advertisement","Supported","supported","by","Share","share","Page","page","Continue","continue","main","story","newsletter","Sign","Up","Manage","email","preferences","Not","you","opt","out","contact","us","anytime","thank","subscribing","see","more","email"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered1").setStopWords(add_stopwords)
stopwordsRemover1 = StopWordsRemover(inputCol="filtered1", outputCol="filtered").setStopWords(add_stopwords_1)


In [12]:
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover,stopwordsRemover1, hashingTF, idf, label_stringIdx])

In [13]:
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|               value|category|               words|           filtered1|            filtered|         rawFeatures|            features|label|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, biz, like...|(1000,[19,24,26,7...|(1000,[19,24,26,7...|  1.0|
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, hold, sho...|(1000,[15,19,22,7...|(1000,[15,19,22,7...|  1.0|
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, banks, he...|(1000,[0,10,69,75...|(1000,[0,10,69,75...|  1.0|
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, mayor, us...|(1000,[10,19,21,2...|(1000,[10,19,21,2...|  1.0|

In [14]:
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions.show(10)

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9754632623041882,0.01057...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9431747402417925,0.02006...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9172397458630736,0.02471...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9053576601908154,0.02005...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8962780254409616,0.02195...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8507710146790355,0.01021...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8454293725021643,0.05217...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.8197670154180591,0.01462...|  0.0|       0.0|
|   Sectio

In [15]:
pipelineFit2 = pipeline.fit(unknown_data)
unknown_dataset = pipelineFit2.transform(unknown_data)

In [16]:
predictions2 = lrModel.transform(unknown_dataset)
predictions2.filter(predictions2['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions2.show(10)

+-----+--------+-----------+-----+----------+
|value|category|probability|label|prediction|
+-----+--------+-----------+-----+----------+
+-----+--------+-----------+-----+----------+

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|               value|category|               words|           filtered1|            filtered|         rawFeatures|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|   Sections SEARC...| Fashion|[sections, search...|[sections, search...|[today, industry,...|(1000,[10,12,43,5...|(1000,[10,12,43,5...|  3.0|[0.01492090532267...|[0.21386014488012...|       1.0|
|   Sections SEARC...| Fashion|

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
#evaluator.evaluate(predictions)
print("-------Accuracy of test data using logistic_regression-----: " + str(evaluator.evaluate(predictions)*100)+"%")


from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
#evaluator.evaluate(predictions2)
print("-------Accuracy of unknown data using logistic_regression-----: " + str(evaluator.evaluate(predictions2)*100)+"%")

#training the data -- Naive Bayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)


-------Accuracy of test data using logistic_regression-----: 64.6386141204794%
-------Accuracy of unknown data using logistic_regression-----: 13.2128740824393%


In [18]:
predictions3 = model.transform(testData)
predictions3.filter(predictions3['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions3.show(10)

+------------------------------+----------+------------------------------+-----+----------+
|                         value|  category|                   probability|label|prediction|
+------------------------------+----------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Fashion|[1.0,1.7849695897712397E-17...|  1.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,5.286020787858083E-18,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,4.703611074610709E-22,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,6.492341445916501E-27,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,4.248773392621757E-29,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,7.00353830988496E-31,2...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,8.202852029206588E-34,...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|     Movie|[1.0,2.7184097459394804E-39...|  0.0|

In [19]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of test data using naive_bayes-----: " + str(evaluator.evaluate(predictions3)*100)+"%")


-------Accuracy of test data using naive_bayes-----: 58.8554632688887%


In [20]:
predictions4 = model.transform(unknown_dataset)
predictions4.filter(predictions4['prediction'] == 0) \
    .select("value","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions4.show(10)

+------------------------------+--------+------------------------------+-----+----------+
|                         value|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|   Sections SEARCH Skip to ...|   Movie|[0.9572707190900996,0.03799...|  0.0|       0.0|
|   Sections SEARCH Skip to ...|   Movie|[0.9200992619781946,0.03093...|  0.0|       0.0|
|   Sections SEARCH Skip to ...| science|[0.8060914009544183,0.00356...|  1.0|       0.0|
+------------------------------+--------+------------------------------+-----+----------+

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|               value|category|               words|           filtered1|            filtered|         rawFeatures|            features|label|       rawPrediction|         p

In [21]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("-------Accuracy of unknown data using naive_bayes-----: " + str(evaluator.evaluate(predictions4)*100)+"%")

-------Accuracy of unknown data using naive_bayes-----: 31.242077171133104%
