In [80]:
import csv
import os
import sys
# Spark imports
import pyspark
from pyspark.rdd import RDD
from pyspark.sql import DataFrame, Row
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import desc
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, expr, concat
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from collections import defaultdict

from csv import reader

In [2]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [6]:
spark = init_spark()
filename = "./TrainWithoutBody.csv"
df1 = spark.read.option("multiLine", 'true').option("escape","\'").csv(filename, header=True)
df1 = df1.drop("_c0")
df1 = df1.dropna()
rddTags = df1.select("Tags").rdd

df1.count()

6017243

In [7]:
splittedTags = rddTags.filter(lambda r: r[0] != None).flatMap(lambda r: r[0].split(" ")).map(lambda r: (r, 1)).reduceByKey(lambda x, y: x + y)
splittedTags = splittedTags.sortBy(lambda r: r[1], False) #Sorted with number of usage (you can collect and see)
splittedTagsSorted = splittedTags.map(lambda r: r[0]) #Delete this line if you want to see number of times they have been used.
mostUsedTags = splittedTagsSorted.collect()[0:50]



# Word2Vec With Title

## Utility Functions and Constants

In [68]:
num_labels = 50
classifier_array_title = []

def column_splitter(r):
    if not r.Tags:
        label_array = None
    else:
        tags = r.Tags.split(' ')
        label_array = []
        for t in mostUsedTags:
            if t in tags:
                label_array.append(1)
            else:
                label_array.append(0)
        if 1 not in label_array:
            label_array = None
    
    return (r.Id, r.Title, r.Tags, r.tokenized_text, label_array)

def init_spark_2():
    spark = SparkSession \
        .builder.appName("W2V with Title").config("spark.sql.broadcastTimeout", "72000").config('spark.shuffle.service.enabled', 'TRUE').config("spark.debug.maxToStringFields" , "100").config("spark.executor.heartbeatInterval", "14400").getOrCreate()
        
    return spark

## Data Engineering

### Load Data

In [69]:
spark = init_spark_2()

try:
    filename1 = "./TrainWithoutBody.csv"
    w2v_data_title = spark.read.option("multiLine", 'true').option("escape","\'").option("escape","\"").option("mode", "DROPMALFORMED").csv(filename1, header=True)
except:
    pass

w2v_data_title.take(1)    

[Row(_c0='0', Id='1', Title='How to check if an uploaded file is an image without mime type?', Tags='php image-processing file-upload upload mime-types')]

### Data Featurization, Sampling and Structure

In [None]:

tokenizer_title = Tokenizer(inputCol="Title", outputCol="tokenized_text")
tokenized_df_title = tokenizer_title.transform(w2v_data_title)

tokenized_df_title = tokenized_df_title.rdd.map(lambda r: column_splitter(r))
tokenized_df_title = tokenized_df_title.toDF(['Id', 'Title', 'Tags', 'tokenized_text', 'tag_array'])
tokenized_df_title = tokenized_df_title.drop(col('Title'))

tokenized_df_title = tokenized_df_title.dropna()

sampled_df_title = w2v_data_title.sample(False, 0.00083, seed=42)

train_df_pre_title, test_val_df_title = sampled_df_title.randomSplit([.7,.3],seed=1234)
test_df_pre_title, val_df_pre_title = test_val_df_title.randomSplit([.5, .5], seed=1234)

word2Vec = Word2Vec(inputCol="tokenized_text", outputCol="features", vectorSize=100)
fitted_word2Vec = word2Vec.fit(tokenized_df_title)

train_df_pre_title = fitted_word2Vec.transform(train_df_pre_title)
test_df_pre_title = fitted_word2Vec.transform(test_df_pre_title)
val_df_pre_title = fitted_word2Vec.transform(val_df_pre_title)

train_df_pre_title = train_df_pre_title.drop(col('tokenized_text'))

In [71]:
train_df_pre_title.show()

### Seperate Label Array into Columns

In [78]:
train_df_title = train_df_pre_title.select(['Id']+['Tags']+['features']+[expr('tag_array[' + str(x) + ']') for x in range(0, num_labels)])

for i in range(0, num_labels): 
    if '.' in mostUsedTags[i]:
        mostUsedTags[i] = mostUsedTags[i].replace('.', '')

colnames = ['Id']+['Tags']+['features'] + [str(mostUsedTags[i]) for i in range(0, num_labels)] 
train_df_title = train_df_title.toDF(*colnames)

train_df_title.take(1)

[Row(Id='1010021', Tags='objective-c ios asihttprequest mbprogresshud', features=DenseVector([0.1334, 0.1328, -0.0102, -0.0934, 0.0128, 0.0443, -0.0522, 0.0271, 0.0243, -0.0651, 0.0147, 0.096, -0.051, 0.0013, 0.0946, 0.002, 0.044, 0.0692, -0.0612, 0.0684, -0.185, -0.1426, 0.0978, -0.0974, 0.1009, 0.03, 0.0494, -0.1662, -0.1199, 0.0205, 0.0946, -0.0526, -0.0228, -0.0556, 0.016, 0.0649, 0.0587, -0.0064, 0.0571, 0.2185, -0.0653, 0.0316, -0.0888, -0.0019, -0.0207, -0.028, 0.0238, 0.0005, -0.0432, -0.0198, -0.0421, 0.0432, -0.1001, -0.0267, 0.1116, 0.0725, 0.1641, 0.0376, 0.0387, -0.0122, -0.0367, -0.1195, 0.1086, 0.0158, -0.0719, 0.1175, 0.1415, 0.076, -0.0416, -0.0258, 0.1169, 0.0997, 0.1299, -0.0532, -0.0107, 0.0317, -0.0595, -0.0329, 0.1416, -0.0224, -0.0705, -0.1555, -0.0044, -0.0675, -0.0367, -0.0667, -0.0863, -0.0146, -0.0121, -0.041, -0.0242, -0.1583, 0.0526, 0.0351, -0.1502, -0.0184, 0.0217, -0.0305, 0.0506, 0.0151]), c#=0, java=0, php=0, javascript=0, android=0, jquery=0, c++=0, p

## Train Models

In [None]:
w2v_lrs = defaultdict()

w2v_dts = defaultdict()

In [85]:
for tag in mostUsedTags:
    if tag not in w2v_lrs:
        w2v_lrs[tag] = LogisticRegression(maxIter=100, featuresCol='features', labelCol=tag, predictionCol='prediction').fit(train_df_title)
        print(tag)

security
aspnet-mvc-3
visual-studio-2010
bash
homework


In [None]:
for tag in mostUsedTags:
    if tag not in w2v_dts:
        w2v_dts[tag] = DecisionTreeClassifier(maxDepth=4, featuresCol='features', labelCol=tag, predictionCol='prediction').fit(train_df_title)
        print(tag)

# Word2Vec With Body

### Utility Functions and Constants

In [None]:
num_labels = 50
classifier_array = []
def column_splitter(r):
    if not r.Tags:
        label_array = None
    else:
        tags = r.Tags.split(' ')
        label_array = []
        for t in mostUsedTags:
            if t in tags:
                label_array.append(1)
            else:
                label_array.append(0)
        if 1 not in label_array:
            label_array = None
    
    return (r.Id, r.Title, r.Tags, r.tokenized_text, label_array)

def init_spark_3():
    spark = SparkSession \
        .builder.appName("Python Spark SQL basic example").config("spark.sql.broadcastTimeout", "72000").config('spark.shuffle.service.enabled', 'TRUE').config("spark.debug.maxToStringFields" , "100").config("spark.executor.heartbeatInterval", "14400").getOrCreate()
        
    return spark

### Load Data

In [None]:
spark = init_spark_3()

try:

    filename1 = "./Train.csv"
    w2v_data = spark.read.option("multiLine", 'true').option("escape","\'").option("escape","\"").option("mode", "DROPMALFORMED").csv(filename1, header=True)
except:
    pass

w2v_data = w2v_data.withColumn('Title', concat(col('Title'), col('Body')))
w2v_data = w2v_data.drop(col('Body'))

w2v_data.take(1)

# Another example
    

[Row(Id='1', Title="How to check if an uploaded file is an image without mime type?<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Tags='php image-processing file-upload upload mime-types')]

### Data Featurization, Sampling and Structure

In [None]:
sampled_df = w2v_data.sample(False, 0.00083, seed=42)

tokenizer = Tokenizer(inputCol="Title", outputCol="tokenized_text")
tokenized_df = tokenizer.transform(sampled_df)

tokenized_df = tokenized_df.rdd.map(lambda r: column_splitter(r))
tokenized_df = tokenized_df.toDF(['Id', 'Title', 'Tags', 'tokenized_text', 'tag_array'])
tokenized_df = tokenized_df.drop(col('Title'))

tokenized_df = tokenized_df.dropna()

tokenized_df.show()


+-----+--------------------+--------------------+--------------------+--------------------+
|   Id|               Title|                Tags|      tokenized_text|           tag_array|
+-----+--------------------+--------------------+--------------------+--------------------+
| 7283|C# timers for mov...|c# animation time...|[c#, timers, for,...|[1, 0, 0, 0, 0, 0...|
| 7396|I want to call Ht...|c# .net asp.net-m...|[i, want, to, cal...|[1, 0, 0, 0, 0, 0...|
| 8282|Sending email wit...|  android oauth smtp|[sending, email, ...|[0, 0, 0, 0, 1, 0...|
| 8977|How to monitor wh...|          c# windows|[how, to, monitor...|[1, 0, 0, 0, 0, 0...|
| 9601|unity mesh collid...|        unity3d mesh|[unity, mesh, col...|[0, 0, 0, 0, 0, 0...|
|10851|Is there any sett...|ruby-on-rails uni...|[is, there, any, ...|[0, 0, 0, 0, 0, 0...|
|11099|Deep JSON Seriali...|java json jpa ser...|[deep, json, seri...|[0, 1, 0, 0, 0, 0...|
|11638|Getting an inters...|google-places-api...|[getting, an, int...|[0, 0, 0, 

In [None]:
train_df_pre, test_val_df = tokenized_df.randomSplit([.7,.3],seed=1234)
test_df_pre, val_df_pre = test_val_df.randomSplit([.5, .5], seed=1234)

word2Vec = Word2Vec(inputCol="tokenized_text", outputCol="features", vectorSize=100)
fitted_word2Vec = word2Vec.fit(tokenized_df)

train_df_pre = fitted_word2Vec.transform(train_df_pre)
test_df_pre = fitted_word2Vec.transform(test_df_pre)
val_df_pre = fitted_word2Vec.transform(val_df_pre)

train_df_pre = train_df_pre.drop(col('tokenized_text'))

In [None]:

train_df_pre.show()

+-------+--------------------+--------------------+--------------------+--------------------+
|     Id|               Title|                Tags|           tag_array|            features|
+-------+--------------------+--------------------+--------------------+--------------------+
|1005960|When evaluating a...|                   c|[0, 0, 0, 0, 0, 0...|[-0.0177763070276...|
|1006700|Delete files in t...|      windows folder|[0, 0, 0, 0, 0, 0...|[-0.0656386298768...|
| 100767|Reference existin...|c# entity-framewo...|[1, 0, 0, 0, 0, 0...|[0.09648928268901...|
|1007785|How to search (us...|  regex vim escaping|[0, 0, 0, 0, 0, 0...|[-0.1014933573348...|
|1008518|Cucumber and vari...|   ruby tdd cucumber|[0, 0, 0, 0, 0, 0...|[0.02025146771946...|
|1010686|How to get common...|                 sql|[0, 0, 0, 0, 0, 0...|[-0.0140362187506...|
|1011820|joining two table...|                tsql|[0, 0, 0, 0, 0, 0...|[0.07489425870370...|
|1012944|console.dir(windo...|javascript firefo...|[0, 0, 0,

### Seperate Labels into Columns

In [None]:
train_df = train_df_pre.select(['Id']+['Title']+['Tags']+['features']+[expr('tag_array[' + str(x) + ']') for x in range(0, num_labels)])

for i in range(0, num_labels): 
    if '.' in mostUsedTags[i]:
        mostUsedTags[i] = mostUsedTags[i].replace('.', '')

colnames = ['Id']+['Title']+['Tags']+['features'] + [str(mostUsedTags[i]) for i in range(0, num_labels)] 
train_df = train_df.toDF(*colnames)

train_df.take(1)

In [None]:
w2v_lrs_body = defaultdict()

w2v_dts_body = defaultdict()

In [None]:
for tag in mostUsedTags:
    if tag not in w2v_lrs_body:
        w2v_lrs_body[tag] = LogisticRegression(maxIter=100, featuresCol='features', labelCol=tag, predictionCol='prediction').fit(train_df)
        print(tag)

In [None]:
for tag in mostUsedTags:
    if tag not in w2v_dts_body:
        w2v_dts_body[tag] = DecisionTreeClassifier(maxDepth=4, featuresCol='features', labelCol=tag, predictionCol='prediction').fit(train_df)
        print(tag)