In [1]:
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame, Row
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import desc
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, expr
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
import numpy 

from csv import reader

In [2]:
def init_spark():
    """Build (or fetch) the SparkSession used throughout the notebook.

    Returns:
        The SparkSession produced by ``getOrCreate()`` for the configured
        application name and option.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

<Font size=5 color=red>Investigate the original dataset (it cannot be used as-is). Take a look at https://stackoverflow.com/questions/13793529/r-error-invalid-type-list-for-variable to see how unhelpful the Body column can be!

The point here is that the Body column consists mostly of code snippets and other noisy patterns that are not useful for our purpose. The most important information is the relationship between the question titles and their tags, so I removed the Body column from the dataset.</Font>

In [None]:
# Exploratory look at the original dataset (the one that still has the Body column).
# This is a best-effort cell: a failure is reported instead of being silently
# swallowed, so a missing or unreadable file is visible to the reader.
filename1 = "./Train.csv"
try:
    spark = init_spark()

    df2 = spark.read.option("multiLine", 'true').option("escape","\'").csv(filename1, header=True)
    print(df2.count())
    # show() prints the table itself and returns None, so it is not wrapped in print().
    df2.show(10)
except Exception as exc:
    print("Could not load {}: {!r}".format(filename1, exc))

<Font size=5 color=red>To remove the Body column, I read the whole dataset once with the pandas library, dropped the column, and exported the result to a new file that serves as our dataset. This step has been omitted from the notebook.</Font>

In [73]:
# Read the Body-less export and keep only the complete rows.
spark = init_spark()

filename = "./TrainWithoutBody.csv"
df1 = (
    spark.read
    .option("multiLine", 'true')
    .option("escape", "\'")
    .csv(filename, header=True)
    .drop("_c0")   # presumably the index column written by the pandas export -- TODO confirm
    .dropna()      # discard rows that have a null in any column
)

# RDD view of the Tags column only; used below to count tag frequencies.
rddTags = df1.select("Tags").rdd

df1.count()

+---+--------------------+--------------------+--------------------+
| _1|                  _2|                  _3|                  _4|
+---+--------------------+--------------------+--------------------+
| Id|               Title|                Body|                Tags|
|  1|How to check if a...|<p>I'd like to ch...|php image-process...|
|  2|How can I prevent...|<p>In my favorite...|             firefox|
|  3|R Error Invalid t...|<p>I am import ma...|r matlab machine-...|
|  4|How do I replace ...|<p>This is probab...|     c# url encoding|
+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [69]:
# Preview the first five rows of df1.
df1.show(5)

+--------------------+--------------------+--------------------+--------------------+
|                  Id|               Title|                Body|                Tags|
+--------------------+--------------------+--------------------+--------------------+
|                   1|How to check if a...|<p>I'd like to ch...|php image-process...|
|                   2|How can I prevent...|<p>In my favorite...|             firefox|
|      rf_model = ...| data=expert_data...|     importance=TRUE|      do.trace=100);|
|   ..$ NA: num [1:74| 1:12] 3 9 3 0 1 ...| 1:12] 5 7 3 30 0...| 1:12] 0 0 13 0 0...|
|   ..$ NA: num [1:75| 1:12] 1 7 0 1 2 ...| 1:12] 10 7 8 15 ...| 1:12] 0 6 3 1 5 ...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



<Font size=5 color=green>Finding the 50 most used tags (one DT per most used tag)</Font>

In [44]:
# Count how often each individual tag is used.
# Every row holds one space-separated tag string; rows without tags are skipped.
splittedTags = (
    rddTags
    .filter(lambda r: r[0] is not None)
    .flatMap(lambda r: r[0].split(" "))
    .map(lambda r: (r, 1))
    .reduceByKey(lambda x, y: x + y)
)

# (tag, count) pairs ordered by count, most used first (collect to inspect the counts).
splittedTags = splittedTags.sortBy(lambda r: r[1], False)

# Keep only the tag names; skip this step if you want to see the usage counts.
splittedTagsSorted = splittedTags.map(lambda r: r[0])

# take(50) returns the same first 50 tags as collect()[0:50], but without
# shipping every distinct tag to the driver first.
mostUsedTags = splittedTagsSorted.take(50)



KeyboardInterrupt: 

In [6]:
# Display the list of most used tags, most frequent first.
mostUsedTags

['c#',
 'java',
 'php',
 'javascript',
 'android',
 'jquery',
 'c++',
 'python',
 'iphone',
 'asp.net',
 'mysql',
 'html',
 '.net',
 'ios',
 'objective-c',
 'sql',
 'css',
 'linux',
 'ruby-on-rails',
 'windows',
 'c',
 'sql-server',
 'ruby',
 'wpf',
 'xml',
 'ajax',
 'database',
 'regex',
 'windows-7',
 'asp.net-mvc',
 'xcode',
 'django',
 'osx',
 'arrays',
 'vb.net',
 'eclipse',
 'json',
 'facebook',
 'ruby-on-rails-3',
 'ubuntu',
 'performance',
 'networking',
 'string',
 'multithreading',
 'winforms',
 'security',
 'asp.net-mvc-3',
 'visual-studio-2010',
 'bash',
 'homework']

In [99]:
# Number of rows in the Tags RDD.
print(rddTags.count())

6017243


In [9]:
# Peek at the first ten rows; each Row carries one space-separated Tags string.
rddTags.take(10)

[Row(Tags='php image-processing file-upload upload mime-types'),
 Row(Tags='firefox'),
 Row(Tags='r matlab machine-learning'),
 Row(Tags='c# url encoding'),
 Row(Tags='php api file-get-contents'),
 Row(Tags='proxy active-directory jmeter'),
 Row(Tags='core-plot'),
 Row(Tags='c# asp.net windows-phone-7'),
 Row(Tags='.net javascript code-generation'),
 Row(Tags='sql variables parameters procedure calls')]

<Font size=5.5 color="purple">Here, I clean the Tags column so that it only contains the most used tags. For example, I omitted the "upload" tag from the tags of the first question, because it is not one of the most used tags.</Font>

In [101]:
def replaceNoneWithString(x):
    """Map a missing value to the literal string "None".

    Args:
        x: any value, typically a tag string or None.

    Returns:
        The string "None" when ``x`` is None; otherwise ``x`` itself, unchanged.
    """
    # Identity check: ``x == None`` would defer to a custom ``__eq__`` and could
    # misclassify objects that compare equal to everything.
    if x is None:
        return "None"
    return x

In [None]:
def _keep_most_used(row):
    """Split one row's tag string on spaces and keep only tags found in mostUsedTags."""
    tag_string = replaceNoneWithString(row[0])
    return [tag for tag in tag_string.split(" ") if tag in mostUsedTags]


rrr = rddTags.map(_keep_most_used)

# Only the first ten cleaned rows are materialised here, as a preview.
cleanedTags = rrr.take(10)
cleanedTags

# Subject titles to TF-IDF

In [103]:
# Turn every question title into a TF-IDF feature vector.

# 1) Split the title into word tokens.
tokenizer = Tokenizer(inputCol="Title", outputCol="transformed_tfidf")
wordsData = tokenizer.transform(df1)

# 2) Hash the tokens into a fixed-size term-frequency vector.
# NOTE(review): numFeatures=20 gives only 20 hash buckets, so many different
# words will share a bucket -- confirm this small size is intentional.
hashingTF = HashingTF(inputCol="transformed_tfidf", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# (A stray `featurizedData.take(1)` used to sit here: it ran a Spark job and
# threw the result away, so it has been dropped.)

# 3) Rescale the raw term frequencies by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [12]:
# Print id, tags and the TF-IDF features for the first rows, then return one
# full row (all columns) as the cell's output.
rescaledData.select("id", "tags", "features").show()
rescaledData.take(1)

+---+--------------------+--------------------+
| id|                tags|            features|
+---+--------------------+--------------------+
|  1|php image-process...|(20,[1,3,4,8,9,10...|
|  2|             firefox|(20,[1,2,3,7,12,1...|
|  3|r matlab machine-...|(20,[2,4,5,6,7,17...|
|  4|     c# url encoding|(20,[3,6,7,10,13,...|
|  5|php api file-get-...|(20,[3,5,8,13,15,...|
|  6|proxy active-dire...|(20,[0,3,4,7,13,1...|
|  7|           core-plot|(20,[3,6,7,8,10,1...|
|  8|c# asp.net window...|(20,[0,3,7,8,10,1...|
|  9|.net javascript c...|(20,[0,1,3,4],[1....|
| 10|sql variables par...|(20,[0,3,6,12,16,...|
| 11|.net obfuscation ...|(20,[0,3,6,8,11,1...|
| 12|algorithm languag...|(20,[0,1,5,19],[1...|
| 13|postfix migration...|(20,[1,3,8,18,19]...|
| 14|documentation lat...|(20,[10,13,16,17,...|
| 15|           windows-7|(20,[1,2,12,13,15...|
| 16|php url-routing c...|(20,[2,4,10,18],[...|
| 17|   r temporary-files|(20,[3,5,8,13,15,...|
| 18|         wpf binding|(20,[0,8,12,13

[Row(Id='1', Title='How to check if an uploaded file is an image without mime type?', Tags='php image-processing file-upload upload mime-types', transformed_tfidf=['how', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'without', 'mime', 'type?'], rawFeatures=SparseVector(20, {1: 1.0, 3: 3.0, 4: 1.0, 8: 2.0, 9: 1.0, 10: 2.0, 12: 2.0, 16: 1.0}), features=SparseVector(20, {1: 1.1357, 3: 1.8792, 4: 1.1753, 8: 1.4617, 9: 1.1974, 10: 2.0765, 12: 2.0979, 16: 0.9553}))]

# Word2Vec

### Utility Functions and Constants

In [145]:
# Number of tag labels; the label-column and training loops below iterate
# range(0, num_labels) and index mostUsedTags with it.
num_labels = 50
# Filled by the training loop with one fitted model per label, in label order.
classifier_array = []

def my_function(r, tag_vocabulary=None):
    """Attach a multi-hot label vector to one row.

    Args:
        r: row-like object exposing ``Id``, ``Title``, ``Tags``, ``features``
            and ``labels`` attributes; ``labels`` holds the row's individual
            tags (or None when the row has no tags).
        tag_vocabulary: optional sequence of tag names, one per output label.
            When omitted, the notebook-level ``mostUsedTags`` list is used
            (looked up at call time), which is the original behaviour.

    Returns:
        A tuple ``(Id, Title, Tags, features, labels)`` where ``labels`` is a
        list of 0/1 ints, one per vocabulary entry and in vocabulary order:
        1 when that tag is among the row's tags, 0 otherwise.
    """
    if tag_vocabulary is None:
        tag_vocabulary = mostUsedTags
    # A row without tags yields an all-zero vector instead of a TypeError.
    tags = r.labels if r.labels is not None else ()
    labels = [1 if t in tags else 0 for t in tag_vocabulary]
    return (r.Id, r.Title, r.Tags, r.features, labels)

### Load Data

In [152]:
# Load the raw CSV as an RDD of text lines and parse each partition with csv.reader.
# NOTE(review): textFile() yields one record per physical line, so a quoted field
# that spans several lines would be split into separate records, and the header
# line is parsed as an ordinary row -- confirm, and consider the
# spark.read...csv(..., header=True) reader used in the earlier cells.
spark = init_spark()
rdd = spark.sparkContext.textFile("./Train.csv")

w2v_data = rdd.mapPartitions(lambda x: csv.reader(x))

colnames = ['Id', 'Title', 'Body', 'Tags']

w2v_data = w2v_data.toDF(colnames)
# select() returns a new DataFrame; without the assignment the cast was discarded.
w2v_data = w2v_data.select([col(c).cast("string") for c in w2v_data.columns])

# Show a single example row WITHOUT narrowing w2v_data itself: rebinding w2v_data
# to this filter would leave only one row for the sampling step that follows.
w2v_data.filter(w2v_data.Id == '1').show()

# Another example
    

StructType(List(StructField(Id,StringType,true),StructField(Title,StringType,true),StructField(Body,StringType,true),StructField(Tags,StringType,true)))

### Data Featurization, Sampling and Structure

In [149]:
# Draw a small random subset of the rows (without replacement, fixed seed)
# and print the sampled rows.
sampled_df = w2v_data.sample(withReplacement=False, fraction=0.00083, seed=42)
sampled_df.show()


TypeError: 'StructType' object is not callable

In [None]:
# Tokenize the sampled titles and learn 100-dimensional Word2Vec embeddings on them.
tokenizer = Tokenizer(inputCol="Title", outputCol="tokenized_text")
tokenized_df = tokenizer.transform(sampled_df)

# A fixed seed makes the learned embeddings repeatable across runs.
# NOTE(review): the model is fitted on the whole sample, i.e. before the
# train/test split done in the next cell -- confirm that is intended.
word2Vec = Word2Vec(inputCol="tokenized_text", outputCol="features", vectorSize=100, seed=42)
fitted_word2Vec = word2Vec.fit(tokenized_df)

In [33]:
# 70 % train; the remaining 30 % is halved into test and validation.
train_df, test_val_df = tokenized_df.randomSplit([0.7, 0.3], seed=1234)
test_df, val_df = test_val_df.randomSplit([0.5, 0.5], seed=1234)

# Add the Word2Vec "features" column to each of the three splits.
train_df, test_df, val_df = (
    fitted_word2Vec.transform(part) for part in (train_df, test_df, val_df)
)

In [22]:
# Inspect one training row, which now carries the Word2Vec "features" column.
train_df.take(1)

[Row(Id='1003396', Title='Show values of custom field created with ACF on a page', Tags='php custom-field', tokenized_text=['show', 'values', 'of', 'custom', 'field', 'created', 'with', 'acf', 'on', 'a', 'page'], features=DenseVector([-0.006, 0.0165, 0.0094, 0.0057, -0.005, -0.0341, 0.0205, 0.0053, -0.0411, 0.0487, 0.0354, -0.0176, 0.0342, 0.0171, 0.0229, -0.0165, -0.0026, 0.0021, 0.0123, -0.0327, -0.0641, -0.005, 0.0215, 0.0275, -0.0462, 0.0096, 0.006, -0.0434, -0.0325, -0.003, 0.0003, 0.0542, 0.0001, -0.0471, -0.004, -0.014, -0.0304, 0.0454, 0.0259, -0.0076, 0.0203, 0.026, 0.0021, -0.0337, 0.0053, -0.0317, -0.0224, -0.0503, -0.0287, -0.0272, 0.0217, -0.023, -0.0239, -0.0136, 0.011, -0.0, -0.0021, -0.0026, 0.0045, 0.0184, 0.05, -0.024, 0.005, -0.0266, 0.0077, 0.0487, 0.0266, 0.0347, 0.0107, -0.0184, 0.0069, -0.0351, -0.0154, 0.0061, -0.0069, 0.0033, 0.0048, 0.0201, -0.002, -0.0182, 0.021, -0.0109, 0.0147, 0.0252, 0.0099, 0.0104, -0.0092, 0.0124, -0.0152, -0.0287, -0.0105, -0.0677, -0.

### Setting up target Labels on Train Data

In [34]:
# Split each row's space-separated Tags string into a "labels" array column.
target_tags = train_df.withColumn('labels', split(col("Tags"), " "))

# Replace the tag array with the 0/1 vector built by my_function, then go back
# to a DataFrame with explicit column names.
train_cols = ["Id", "Title", "Tags", "features", "labels"]
train_df = target_tags.rdd.map(my_function).toDF(train_cols)

train_df.take(1)

[Row(Id='1003396', Title='Show values of custom field created with ACF on a page', Tags='php custom-field', features=DenseVector([-0.006, 0.0165, 0.0094, 0.0057, -0.005, -0.0341, 0.0205, 0.0053, -0.0411, 0.0487, 0.0354, -0.0176, 0.0342, 0.0171, 0.0229, -0.0165, -0.0026, 0.0021, 0.0123, -0.0327, -0.0641, -0.005, 0.0215, 0.0275, -0.0462, 0.0096, 0.006, -0.0434, -0.0325, -0.003, 0.0003, 0.0542, 0.0001, -0.0471, -0.004, -0.014, -0.0304, 0.0454, 0.0259, -0.0076, 0.0203, 0.026, 0.0021, -0.0337, 0.0053, -0.0317, -0.0224, -0.0503, -0.0287, -0.0272, 0.0217, -0.023, -0.0239, -0.0136, 0.011, -0.0, -0.0021, -0.0026, 0.0045, 0.0184, 0.05, -0.024, 0.005, -0.0266, 0.0077, 0.0487, 0.0266, 0.0347, 0.0107, -0.0184, 0.0069, -0.0351, -0.0154, 0.0061, -0.0069, 0.0033, 0.0048, 0.0201, -0.002, -0.0182, 0.021, -0.0109, 0.0147, 0.0252, 0.0099, 0.0104, -0.0092, 0.0124, -0.0152, -0.0287, -0.0105, -0.0677, -0.0223, -0.0276, 0.079, -0.0432, 0.0051, 0.0224, 0.0429, -0.0407]), labels=[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0

### Separate Labels into Columns

In [35]:
# Expand the labels array into one column per label position.
base_cols = ['Id', 'Title', 'features', 'labels']
label_exprs = [expr('labels[' + str(idx) + ']') for idx in range(num_labels)]
train_df = train_df.select(base_cols + label_exprs)

# Strip dots from the tag names (e.g. 'asp.net' -> 'aspnet') before they are
# used as column names. This edits mostUsedTags in place.
for idx in range(num_labels):
    mostUsedTags[idx] = mostUsedTags[idx].replace('.', '')

# Rename the expanded label columns to the (dot-free) tag names.
colnames = base_cols + [str(tag) for tag in mostUsedTags[:num_labels]]
train_df = train_df.toDF(*colnames)

train_df.take(1)

[Row(Id='1003396', Title='Show values of custom field created with ACF on a page', features=DenseVector([-0.006, 0.0165, 0.0094, 0.0057, -0.005, -0.0341, 0.0205, 0.0053, -0.0411, 0.0487, 0.0354, -0.0176, 0.0342, 0.0171, 0.0229, -0.0165, -0.0026, 0.0021, 0.0123, -0.0327, -0.0641, -0.005, 0.0215, 0.0275, -0.0462, 0.0096, 0.006, -0.0434, -0.0325, -0.003, 0.0003, 0.0542, 0.0001, -0.0471, -0.004, -0.014, -0.0304, 0.0454, 0.0259, -0.0076, 0.0203, 0.026, 0.0021, -0.0337, 0.0053, -0.0317, -0.0224, -0.0503, -0.0287, -0.0272, 0.0217, -0.023, -0.0239, -0.0136, 0.011, -0.0, -0.0021, -0.0026, 0.0045, 0.0184, 0.05, -0.024, 0.005, -0.0266, 0.0077, 0.0487, 0.0266, 0.0347, 0.0107, -0.0184, 0.0069, -0.0351, -0.0154, 0.0061, -0.0069, 0.0033, 0.0048, 0.0201, -0.002, -0.0182, 0.021, -0.0109, 0.0147, 0.0252, 0.0099, 0.0104, -0.0092, 0.0124, -0.0152, -0.0287, -0.0105, -0.0677, -0.0223, -0.0276, 0.079, -0.0432, 0.0051, 0.0224, 0.0429, -0.0407]), labels=[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [37]:
# Train one binary logistic-regression model per tag column of train_df.
# Start from an empty list so that re-running this cell does not append a
# second set of models behind the first one.
classifier_array.clear()
for i in range(0, num_labels):
    label_col = str(mostUsedTags[i])
    lr = LogisticRegression(maxIter=100, featuresCol='features', labelCol=label_col, predictionCol='prediction')
    lrModel = lr.fit(train_df)
    classifier_array.append(lrModel)
    print(label_col)  # progress: name of the tag whose model was just fitted


c#
java
php
javascript
android
jquery
c++
python
iphone
aspnet
mysql
html
net
ios
objective-c
sql
css
linux
ruby-on-rails
windows
c
sql-server
ruby
wpf
xml
ajax
database
regex
windows-7
aspnet-mvc
xcode
django
osx
arrays
vbnet
eclipse
json
facebook
ruby-on-rails-3
ubuntu
performance
networking
string
multithreading
winforms
security
aspnet-mvc-3
visual-studio-2010
bash
homework


In [41]:
# Score the test split with the 'php' classifier.
# The model is looked up by tag name rather than by the hard-coded position 2,
# so the cell stays correct if the tag ordering ever changes.
php_index = mostUsedTags.index('php')
predictions = classifier_array[php_index].transform(test_df)
# The original `predictions.take(5)` sat mid-cell, so its result was never
# displayed; print a compact preview instead.
predictions.select('Id', 'Title', 'prediction').show(5)

# Class balance of the 'php' label in the training data.
train_df.groupBy('php').count().show()

+---+-----+
|php|count|
+---+-----+
|  0| 3308|
|  1|  223|
+---+-----+

