In [94]:
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import desc
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, udf
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType

from csv import reader

In [47]:
def init_spark():
    """Build the notebook's SparkSession, or return the already-running one.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
    )
    return builder.getOrCreate()

<Font size=5 color=red>First, inspect the original dataset (it cannot be used as is). See https://stackoverflow.com/questions/13793529/r-error-invalid-type-list-for-variable for an example of how uninformative the Body column can be.

The point is that the Body column consists mostly of code snippets and noisy patterns that are not useful for our purpose. The most important information is the relationship between each question's title and its tags, so I removed the Body column from the dataset.</Font>

In [3]:
# Best-effort look at the original (unusable) dataset. The cell must not stop
# the notebook when Train.csv is unavailable, but the reason is reported
# instead of being silently swallowed.
filename1 = "./Train.csv"
try:
    spark = init_spark()

    df2 = spark.read.option("multiLine", 'true').option("escape","\'").csv(filename1, header=True)
    print(df2.count())
    # show() prints the table itself and returns None, so it is not wrapped in print().
    df2.show(10)
except Exception as exc:
    print("Skipping inspection of {}: {}".format(filename1, exc))

<Font size=5 color=red>To remove the Body column, I read the whole dataset once using the Pandas library, dropped the column, and exported the result to a new file that serves as our dataset. This step has been omitted from the notebook.</Font>

In [4]:
spark = init_spark()

filename = "./TrainWithoutBody.csv"

# Read the Body-less export, discard the "_c0" column and drop every row
# that contains a null.
# NOTE(review): a title shown later in the notebook (Id 17) still carries
# doubled quotes, which may mean the escape character below does not match
# the file's quoting — confirm against the CSV before changing it.
df1 = (
    spark.read
    .option("multiLine", 'true')
    .option("escape", "\'")
    .csv(filename, header=True)
    .drop("_c0")
    .dropna()
)

# RDD view of just the Tags column, used for the tag-frequency analysis.
rddTags = df1.select("Tags").rdd

df1.count()

In [5]:
# Preview the first five rows of the loaded DataFrame.
df1.show(5)

+---+--------------------+--------------------+
| Id|               Title|                Tags|
+---+--------------------+--------------------+
|  1|How to check if a...|php image-process...|
|  2|How can I prevent...|             firefox|
|  3|R Error Invalid t...|r matlab machine-...|
|  4|How do I replace ...|     c# url encoding|
|  5|How to modify who...|php api file-get-...|
+---+--------------------+--------------------+
only showing top 5 rows



<Font size=5 color=green>Finding the 100 most used tags (one decision tree, DT, will be trained per most-used tag)</Font>

In [6]:
# Count how many questions use each individual tag: split the space-separated
# Tags string, emit (tag, 1) pairs and sum them per tag.
splittedTags = (
    rddTags
    .filter(lambda r: r[0] is not None)
    .flatMap(lambda r: r[0].split(" "))
    .map(lambda r: (r, 1))
    .reduceByKey(lambda x, y: x + y)
)

# Sort by usage count, most used first (collect/take this RDD to see the counts).
splittedTags = splittedTags.sortBy(lambda r: r[1], False)

# Keep only the tag names; delete this line if you want to see the counts too.
splittedTagsSorted = splittedTags.map(lambda r: r[0])

# take(100) returns the first 100 elements of the sorted RDD without pulling
# every distinct tag to the driver the way collect()[0:100] did.
mostUsedTags = splittedTagsSorted.take(100)



In [7]:
# Display the list of most-used tags gathered above.
mostUsedTags

['c#',
 'java',
 'php',
 'javascript',
 'android',
 'jquery',
 'c++',
 'python',
 'iphone',
 'asp.net',
 'mysql',
 'html',
 '.net',
 'ios',
 'objective-c',
 'sql',
 'css',
 'linux',
 'ruby-on-rails',
 'windows',
 'c',
 'sql-server',
 'ruby',
 'wpf',
 'xml',
 'ajax',
 'database',
 'regex',
 'windows-7',
 'asp.net-mvc',
 'xcode',
 'django',
 'osx',
 'arrays',
 'vb.net',
 'eclipse',
 'json',
 'facebook',
 'ruby-on-rails-3',
 'ubuntu',
 'performance',
 'networking',
 'string',
 'multithreading',
 'winforms',
 'security',
 'asp.net-mvc-3',
 'visual-studio-2010',
 'bash',
 'homework',
 'image',
 'wcf',
 'html5',
 'wordpress',
 'web-services',
 'visual-studio',
 'forms',
 'algorithm',
 'sql-server-2008',
 'linq',
 'oracle',
 'git',
 'query',
 'perl',
 'apache2',
 'flash',
 'actionscript-3',
 'ipad',
 'spring',
 'apache',
 'silverlight',
 'email',
 'r',
 'cocoa-touch',
 'cocoa',
 'swing',
 'hibernate',
 'excel',
 'entity-framework',
 'file',
 'shell',
 'flex',
 'api',
 'list',
 'internet-explo

In [8]:
# Number of rows in the Tags-only RDD.
print(rddTags.count())

6017243


In [9]:
# Peek at the first ten rows of the Tags-only RDD.
rddTags.take(10)

[Row(Tags='php image-processing file-upload upload mime-types'),
 Row(Tags='firefox'),
 Row(Tags='r matlab machine-learning'),
 Row(Tags='c# url encoding'),
 Row(Tags='php api file-get-contents'),
 Row(Tags='proxy active-directory jmeter'),
 Row(Tags='core-plot'),
 Row(Tags='c# asp.net windows-phone-7'),
 Row(Tags='.net javascript code-generation'),
 Row(Tags='sql variables parameters procedure calls')]

<Font size=5.5 color="purple">Here, I clean the Tags column so that it only contains the most used tags. For example, I omitted the "upload" tag from the tags of the first question, because it is not one of the most used tags.</Font>

In [10]:
def replaceNoneWithString(x):
    """Return the string "None" when ``x`` is None, otherwise ``x`` unchanged.

    The check uses identity (``is None``) rather than ``==`` so that objects
    with a custom ``__eq__`` are never mistaken for None.
    """
    return "None" if x is None else x

# Subject titles to TF-IDF

In [16]:
# Step 1: tokenize each title into a list of words.
tokenizer = Tokenizer(inputCol="Title", outputCol="transformed_tfidf")
wordsData = tokenizer.transform(df1)

# Step 2: hash the words into a fixed-length term-frequency vector.
# NOTE(review): numFeatures=20 yields 20-dimensional vectors (see the
# "(20,[...])" values in the output); this is presumably too small to keep
# distinct words apart — confirm before training on these features.
hashingTF = HashingTF(inputCol="transformed_tfidf", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# The former mid-cell `featurizedData.take(1)` was removed: its result was
# neither displayed nor stored, so it only triggered an extra Spark job.

# Step 3: fit IDF weights on the term frequencies and rescale them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [100]:
# Show the id, title and rescaled feature vector of the first rows.
rescaledData.select("id", "title", "features").show()

+---+--------------------+--------------------+
| id|               title|            features|
+---+--------------------+--------------------+
|  1|How to check if a...|(20,[1,3,4,8,9,10...|
|  2|How can I prevent...|(20,[1,2,3,7,12,1...|
|  3|R Error Invalid t...|(20,[2,4,5,6,7,17...|
|  4|How do I replace ...|(20,[3,6,7,10,13,...|
|  5|How to modify who...|(20,[3,5,8,13,15,...|
|  6|setting proxy in ...|(20,[0,3,4,7,13,1...|
|  7|How to draw barpl...|(20,[3,6,7,8,10,1...|
|  8|How to fetch an X...|(20,[0,3,7,8,10,1...|
|  9|.NET library for ...|(20,[0,1,3,4],[1....|
| 10|SQL Server : proc...|(20,[0,3,6,12,16,...|
| 11|How do commercial...|(20,[0,3,6,8,11,1...|
| 12|Crappy Random Num...|(20,[0,1,5,19],[1...|
| 13|Migrate from Mdae...|(20,[1,3,8,18,19]...|
| 14|Where can I find ...|(20,[10,13,16,17,...|
| 15|Can I stop window...|(20,[1,2,12,13,15...|
| 16|PHP framework URL...|(20,[2,4,10,18],[...|
| 17|"What creates ""....|(20,[3,5,8,13,15,...|
| 18|WPF: multiple con...|(20,[0,8,12,13

## Clean up tags to include only most-used tags

In [101]:
# Build the lookup once: set membership is constant-time, whereas testing
# against the mostUsedTags list scanned it for every tag of every row.
mostUsedTagSet = frozenset(mostUsedTags)


def cleanUpTags(tags):
    """Keep only the tags of a question that are among the most-used tags.

    Args:
        tags: Space-separated tag string, or None.

    Returns:
        List of the tags (in their original order) that appear in
        ``mostUsedTags``; an empty list when ``tags`` is None.
    """
    if tags is None:
        return []
    return [tag for tag in tags.split(" ") if tag in mostUsedTagSet]


cleanUpTagsUDF = udf(cleanUpTags, ArrayType(StringType()))

# Add the filtered tag list as a new "cleantags" column.
df = rescaledData.withColumn("cleantags", cleanUpTagsUDF(col("tags")))

In [106]:
def filterEmptyRows(tags):
    """Return the number of entries in ``tags``."""
    return len(tags)


# Kept so existing references keep working; the filter below no longer needs it.
filterEmptyRowsUDF = udf(filterEmptyRows, IntegerType())

# Drop questions left without any most-used tag. Spark's built-in size()
# is evaluated inside the JVM, avoiding a Python UDF round trip per row.
df = df.filter("size(cleantags) > 0")
df.select("id", "title", "features", "cleantags").show()
df.count()

+---+--------------------+--------------------+--------------------+
| id|               title|            features|           cleantags|
+---+--------------------+--------------------+--------------------+
|  1|How to check if a...|(20,[1,3,4,8,9,10...|               [php]|
|  2|How can I prevent...|(20,[1,2,3,7,12,1...|           [firefox]|
|  3|R Error Invalid t...|(20,[2,4,5,6,7,17...|                 [r]|
|  4|How do I replace ...|(20,[3,6,7,10,13,...|                [c#]|
|  5|How to modify who...|(20,[3,5,8,13,15,...|          [php, api]|
|  8|How to fetch an X...|(20,[0,3,7,8,10,1...|       [c#, asp.net]|
|  9|.NET library for ...|(20,[0,1,3,4],[1....|  [.net, javascript]|
| 10|SQL Server : proc...|(20,[0,3,6,12,16,...|               [sql]|
| 11|How do commercial...|(20,[0,3,6,8,11,1...|              [.net]|
| 12|Crappy Random Num...|(20,[0,1,5,19],[1...|         [algorithm]|
| 15|Can I stop window...|(20,[1,2,12,13,15...|         [windows-7]|
| 16|PHP framework URL...|(20,[2,4

4522982

# Split data to test, train and validation sets

In [110]:
# Hold out 30 % of the rows, then halve that hold-out into test and
# validation sets; both splits use the same fixed seed.
train_data, testVal_data = df.randomSplit([0.7, 0.3], seed=1234)
test_data, validation_data = testVal_data.randomSplit([0.5, 0.5], seed=1234)

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     Id|               Title|                Tags|   transformed_tfidf|         rawFeatures|            features|           cleantags|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 100000|strange behaviour...| c# iphone monotouch|[strange, behavio...|(20,[3,7,8,9,10],...|(20,[3,7,8,9,10],...|        [c#, iphone]|
|1000018|iOS: Get the real...|ios objective-c u...|[ios:, get, the, ...|(20,[1,6,8,9,11,1...|(20,[1,6,8,9,11,1...|  [ios, objective-c]|
|1000022|Find a file with ...|         c# filepath|[find, a, file, w...|(20,[0,3,7,8,10,1...|(20,[0,3,7,8,10,1...|                [c#]|
|1000024|Unable to figure ...|objective-c xcode...|[unable, to, figu...|(20,[3,5,7,8,10,1...|(20,[3,5,7,8,10,1...|[objective-c, xco...|
|1000037|Code improvement:...|  c# design-patter

# Train DecisionTreeClassifiers

# Subject titles to Word2Vec

In [None]:
# use df as your dataframe and add your column to it 