In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from datetime import datetime
import math
import time
import re
import nltk 
from pyspark.sql.functions import lit
from pyspark.ml.linalg import SparseVector
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from nltk.corpus import stopwords as sw

# The NLTK stop-word corpus is not bundled with the package: on a fresh
# environment sw.words() raises LookupError until the corpus is downloaded.
# Fetch it on demand so the notebook survives "Restart & Run All".
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stop words plus a few extra tokens to discard: runs of "x"
# (presumably redaction placeholders in the complaint text - confirm) and
# the empty token that split(" ") yields for an empty comment.
stopwords = sw.words('english') + ['xxxx' , 'xxx' ,'' , 'xx' , 'x']

# One Spark session for the whole notebook; `sc` is its SparkContext, used
# below for the RDD-based TF-IDF computation.
spark = SparkSession.builder.appName("tfidf").getOrCreate()
from pyspark.sql.functions import avg
sc = spark.sparkContext


def filter_data(x):
    """Tell whether a raw CSV line is a usable record.

    A line is accepted only when splitting it on "," gives exactly three
    fields (date, label, comment), the comment field is not empty and the
    date field starts with "201".

    Args:
        x: one raw text line of the input file.

    Returns:
        True if the line should be kept, False otherwise.
    """
    fields = x.split(",")
    if len(fields) != 3:
        # Missing fields, or extra commas inside a field: reject the line.
        return False

    date, _label, comment = fields
    if not comment:
        return False
    return date.startswith("201")
    

def parse_data(x):
    """Turn a raw CSV line into a (label, cleaned_comment) pair.

    The comment (third field) has every run of non-letter characters
    replaced by a single space, is lower-cased and stripped of surrounding
    whitespace. The label (second field) is returned unchanged.

    Args:
        x: one raw text line with at least three comma-separated fields.

    Returns:
        Tuple (label, comment).
    """
    fields = x.split(",")
    letters_only = re.sub('[^a-zA-Z]+', ' ', fields[2])
    cleaned = letters_only.lower().strip()
    return (fields[1], cleaned)


# Load the CSV, keep only the well-formed rows and reduce each one to
# (label, cleaned_comment).
# This RDD is the source of several independent actions in this notebook
# (count, take, lexicon extraction, TF and IDF); cache it so the file is
# not re-read and re-parsed for every one of them.
data = sc.textFile("customer_complaints.csv").filter(filter_data).map(parse_data).cache()

# Total number of accepted records; used later as N in the IDF formula.
regs_number = data.count()

# Peek at the first two parsed records.
for i in data.take(2):
    print(i)
    print("\n")
        

# Number of most frequent (non stop-word) tokens kept as the lexicon.
lexicon_size = 50

# The stop-word test runs once per token of the whole corpus; a set gives
# constant-time membership where the original list needed a linear scan.
stopword_set = frozenset(stopwords)

# Word count over all comments, most frequent first, truncated to the lexicon size.
unique_words = (
    data.flatMap(lambda record: record[1].split(" "))
        .filter(lambda token: token not in stopword_set)
        .map(lambda token: (token, 1))
        .reduceByKey(lambda left, right: left + right)
        .sortBy(lambda pair: pair[1], ascending=False)
        .map(lambda pair: pair[0])
        .take(lexicon_size)
)

# Broadcast the lexicon so executor-side lambdas can read it via uwords.value.
uwords = sc.broadcast(unique_words)

# Customer complaints restricted to lexicon words.
# Each comment is tokenised, tokens outside the lexicon are dropped, documents
# left with no tokens are discarded, and the survivors are numbered.
# Element shape: ((string_label, list_of_words), sentence_index)
cc = (
    data.map(lambda rec: (rec[0], rec[1].split(" ")))
        .map(lambda rec: (rec[0], [word for word in rec[1] if word in uwords.value]))
        .filter(lambda rec: len(rec[1]) != 0)
        .zipWithIndex()
)

for i in cc.take(2):
    print(i, '\n')

# Raw term counts f(t,d); the double-normalised TF is computed later.
# Shape before the final map: ((word, label, sentence_index), f_td)
# Shape after the final map:  (word, (label, sentence_index, f_td))
# Keying by word is what allows the join with the IDF table.
tf = (
    cc.flatMap(lambda doc: [((word, doc[0][0], doc[1]), 1) for word in doc[0][1]])
      .reduceByKey(lambda left, right: left + right)
      .map(lambda kv: (kv[0][0], (kv[0][1], kv[0][2], kv[1])))
)

for i in tf.take(2):
    print(i, '\n')

# IDF: count the documents that contain each word (set() makes a word count
# once per document), order by that count, then take
# log(regs_number / document_frequency).
idf = (
    cc.flatMap(lambda doc: [(word, 1) for word in set(doc[0][1])])
      .reduceByKey(lambda left, right: left + right)
      .sortBy(lambda kv: kv[1], ascending=False)
      .map(lambda kv: (kv[0], math.log(regs_number / kv[1])))
)

for i in idf.take(5):
    print(i, "\n")

# TF and IDF joined on the word:
# (word, ((label, sentence_index, f_td), idf))
tfidf = tf.join(idf)

for i in tfidf.take(3):
    print(i, "\n")

('Debt collection', 'transworld systems inc is trying to collect a debt that is not mine not owed and is inaccurate')


('Credit reporting credit repair services or other personal consumer reports', 'i would like to request the suppression of the following items from my credit report which are the result of my falling victim to identity theft this information does not relate to transactions that i have made accounts that i have opened as the attached supporting documentation can attest as such it should be blocked from appearing on my credit report pursuant to section b of the fair credit reporting act')


(('Debt collection', ['debt']), 0) 

(('Credit reporting credit repair services or other personal consumer reports', ['would', 'credit', 'report', 'information', 'made', 'accounts', 'credit', 'report', 'credit', 'reporting']), 1) 

('reporting', ('Credit reporting credit repair services or other personal consumer reports', 1, 1)) 

('told', ('Debt collection', 2, 1)) 

('credit', 0.6

__Final computation: double-normalised TF × IDF for each document__

In [2]:
def _key_by_document(joined):
    """Re-key one joined TF/IDF record by the document it belongs to.

    Input:  (word, ((label, sentence_index, raw_count), idf))
    Output: ((sentence_index, label), ([word_index], [raw_count], [idf]))

    The word is replaced by its position in the broadcast lexicon, and every
    value is wrapped in a one-element list so that reduceByKey can build the
    per-document columns by list concatenation.
    """
    word, ((label, sentence_index, raw_count), idf_value) = joined
    word_index = uwords.value.index(word)
    return (sentence_index, label), ([word_index], [raw_count], [idf_value])


def _concat_columns(left, right):
    """Concatenate the three parallel lists (indices, counts, idfs) of two partial documents."""
    return (left[0] + right[0], left[1] + right[1], left[2] + right[2])


def _document_to_sparse_parts(document):
    """Compute the TF-IDF weights of one document.

    Input:  ((sentence_index, label), (word_indices, raw_counts, idf_values))
    Output: (label, lexicon_size, sorted_word_indices, weights)

    The weight of a word is (0.5 + 0.5 * f / max_f) * idf, where max_f is the
    largest raw count in the document. Indices are returned in ascending
    order with the weights aligned to them.
    """
    (_sentence_index, label), (word_indices, raw_counts, idf_values) = document
    # Loop-invariant: compute the document maximum once, not once per word.
    max_count = max(raw_counts)
    weighted = [
        (word_index, (0.5 + 0.5 * raw_count / max_count) * idf_value)
        for word_index, raw_count, idf_value in zip(word_indices, raw_counts, idf_values)
    ]
    weighted.sort(key=lambda pair: pair[0])
    return (
        label,
        lexicon_size,
        [pair[0] for pair in weighted],
        [pair[1] for pair in weighted],
    )


# One record per document: (label, lexicon_size, [word indices], [tf-idf weights])
res = (
    tfidf.map(_key_by_document)
         .reduceByKey(_concat_columns)
         .map(_document_to_sparse_parts)
)

for i in res.take(5):
    print(i,"\n")

('Mortgage', 50, [4, 5, 9, 10, 11, 14, 16, 20, 25, 26, 29, 34, 35, 36, 39, 41, 49], [1.5173549646931856, 0.7777333284253185, 0.8544062422352126, 0.7646471900037972, 0.7715629477496364, 0.7783573388778917, 0.9761484356488878, 1.050871547063804, 1.4970595167873013, 0.9930948495997832, 0.93266869202192, 1.068075278024588, 1.015104253003259, 1.008469042402106, 1.3261363149177803, 1.1275624362358958, 1.059563190204051]) 

('Mortgage', 50, [5, 10, 14, 16, 17, 23, 24, 25, 34, 47, 49], [0.960729405701864, 1.070506066005316, 0.9080835620242069, 1.138839841590369, 0.9749312178656413, 1.041237298125056, 1.0037106771629942, 2.2061929721076017, 1.1682073353393931, 1.272538879183559, 1.2361570552380596]) 

('Debt collection', 50, [3, 6, 8, 14, 23, 25, 27, 42, 46, 49], [1.1730611156700517, 0.9586568884298988, 1.0313902292260002, 0.8717602195432387, 1.1661857739000625, 1.323715783264561, 1.105093611128405, 1.088378676095442, 2.1698365566224522, 1.1867107730285371]) 

('Credit reporting credit repair s

### sparse vector 

In [3]:
# Pack each document's indices and weights into a SparseVector of
# lexicon_size dimensions, keeping the string label next to it.
sv = res.map(
    lambda doc: (doc[0], SparseVector(lexicon_size, doc[2], doc[3]))
)

for i in sv.take(2):
    print(i)

('Mortgage', SparseVector(50, {4: 1.5174, 5: 0.7777, 9: 0.8544, 10: 0.7646, 11: 0.7716, 14: 0.7784, 16: 0.9761, 20: 1.0509, 25: 1.4971, 26: 0.9931, 29: 0.9327, 34: 1.0681, 35: 1.0151, 36: 1.0085, 39: 1.3261, 41: 1.1276, 49: 1.0596}))
('Mortgage', SparseVector(50, {5: 0.9607, 10: 1.0705, 14: 0.9081, 16: 1.1388, 17: 0.9749, 23: 1.0412, 24: 1.0037, 25: 2.2062, 34: 1.1682, 47: 1.2725, 49: 1.2362}))


### dataframe + stratified split

In [4]:
# Turn the (string label, SparseVector) RDD into a DataFrame with the
# columns "label_f" (original string label) and "features".
cc_df = sv.toDF(["label_f","features"])

In [5]:
# Encode the string label column "label_f" as a numeric "label" column.
stringIndexer = StringIndexer(inputCol="label_f",outputCol = "label")
stringIndexer.setHandleInvalid("skip")

# Make the cell safe to re-run: once cc_df has been indexed it already has a
# "label" column, and indexing it again would fail because the output column
# exists. drop() is a no-op when the column is absent (first run).
unindexed_df = cc_df.drop("label")
stringIndexerModel = stringIndexer.fit(unindexed_df)
cc_df = stringIndexerModel.transform(unindexed_df)

In [6]:
# Preview the first five rows: string label, feature vector, numeric label.
cc_df.show(5)

+--------------------+--------------------+-----+
|             label_f|            features|label|
+--------------------+--------------------+-----+
|            Mortgage|(50,[4,5,9,10,11,...|  2.0|
|            Mortgage|(50,[5,10,14,16,1...|  2.0|
|     Debt collection|(50,[3,6,8,14,23,...|  1.0|
|Credit reporting ...|(50,[1,2,3,10,17,...|  0.0|
|            Mortgage|(50,[4,5,6,10,13,...|  2.0|
+--------------------+--------------------+-----+
only showing top 5 rows



In [8]:
seed = 10

# Build {label: 0.8}: one entry per distinct label, all with the same fraction.
# lit() adds the constant column, see
# https://sparkbyexamples.com/spark/using-lit-and-typedlit-to-add-a-literal-or-constant-to-spark-dataframe/
fractions = (
    cc_df.select("label")
         .distinct()
         .withColumn("fraction", lit(0.8))
         .rdd
         .collectAsMap()
)
print(fractions)

# Stratified sample by label using those fractions, see
# https://spark.apache.org/docs/latest/api/R/sampleBy.html
sampled_df = cc_df.stat.sampleBy("label", fractions, seed)
sampled_df.show()

{8.0: 0.8, 0.0: 0.8, 7.0: 0.8, 1.0: 0.8, 4.0: 0.8, 11.0: 0.8, 14.0: 0.8, 3.0: 0.8, 2.0: 0.8, 17.0: 0.8, 10.0: 0.8, 13.0: 0.8, 6.0: 0.8, 5.0: 0.8, 15.0: 0.8, 9.0: 0.8, 16.0: 0.8, 12.0: 0.8}
+--------------------+--------------------+-----+
|             label_f|            features|label|
+--------------------+--------------------+-----+
|            Mortgage|(50,[4,5,9,10,11,...|  2.0|
|            Mortgage|(50,[5,10,14,16,1...|  2.0|
|Credit reporting ...|(50,[1,2,3,10,17,...|  0.0|
|            Mortgage|(50,[4,5,6,10,13,...|  2.0|
|Credit reporting ...|(50,[0,2,6,24,31,...|  0.0|
|     Debt collection|(50,[13,33,49],[1...|  1.0|
|Credit reporting ...|(50,[0,1,2,16,29,...|  0.0|
|Credit reporting ...|(50,[0,2,7,11,15,...|  0.0|
|Checking or savin...|(50,[1,2,3,5,8,9,...|  6.0|
|            Mortgage|(50,[1,4,5,6,11,1...|  2.0|
|     Debt collection|(50,[0,1,4,6,7,10...|  1.0|
|Credit reporting ...|(50,[4,7,14,17,18...|  0.0|
|Credit reporting ...|(50,[0,1,2,5,9,13...|  0.0|
|Credit car

__Cache the training set and check the difference in training time__

In [24]:
# Remove duplicate rows first: subtract() below is a set difference, so the
# train/test split is only a clean partition when every row is unique.
cc_df = cc_df.dropDuplicates()

# Stratified sample per label using the `fractions` dictionary; cached so that
# training time can be measured with the training set held in memory.
train = cc_df.sampleBy("label", fractions=fractions, seed=seed).cache() # to compute times

# The test set is whatever is left of cc_df after removing the train rows.
# DataFrame.subtract(other) returns each row of self that is not contained
# in other.

# This works because duplicates were removed above.
test = cc_df.subtract(train)

# Alternative that keeps duplicates: exceptAll. With such a small lexicon it
# is not rare for two complaints to have the same features and label.
#test = cc_df.exceptAll(train)

In [25]:
# Row counts of the two splits.
print(train.count()) # train rows
print(test.count()) # test rows

304095
76078


In [27]:
# Total number of (de-duplicated) rows, then the number of rows per numeric label.
print(cc_df.count())
cc_df.groupby("label").count().show()

380173
+-----+-----+
|label|count|
+-----+-----+
|  8.0|13957|
|  0.0|73716|
|  7.0|17756|
|  1.0|86644|
|  4.0|22945|
| 11.0| 7370|
| 14.0| 1442|
|  3.0|29723|
|  2.0|58387|
| 17.0|   14|
| 10.0| 7831|
| 13.0| 1675|
|  6.0|17935|
|  5.0|23861|
| 15.0| 1396|
|  9.0| 9054|
| 16.0|  284|
| 12.0| 6183|
+-----+-----+



In [28]:
# Number of rows per label in the training set.
train.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  8.0|11155|
|  0.0|59109|
|  7.0|14219|
|  1.0|69065|
|  4.0|18358|
| 11.0| 5924|
| 14.0| 1143|
|  3.0|23787|
|  2.0|46694|
| 17.0|   13|
| 10.0| 6279|
| 13.0| 1347|
|  6.0|14377|
|  5.0|19062|
| 15.0| 1112|
|  9.0| 7274|
| 16.0|  232|
| 12.0| 4945|
+-----+-----+



In [29]:
# Number of rows per label in the test set.
test.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  8.0| 2802|
|  0.0|14607|
|  7.0| 3537|
|  1.0|17579|
|  4.0| 4587|
| 11.0| 1446|
| 14.0|  299|
|  3.0| 5936|
|  2.0|11693|
| 17.0|    1|
| 10.0| 1552|
| 13.0|  328|
|  6.0| 3558|
| 15.0|  284|
|  5.0| 4799|
|  9.0| 1780|
| 16.0|   52|
| 12.0| 1238|
+-----+-----+



In [32]:
# Preview five rows of the training set.
train.show(5)

+--------------------+--------------------+-----+
|             label_f|            features|label|
+--------------------+--------------------+-----+
|Bank account or s...|(50,[1,8,11,19,49...|  8.0|
|     Debt collection|(50,[30,31,34,46,...|  1.0|
|Credit card or pr...|(50,[0,8,9,11,15,...|  3.0|
|Credit reporting ...|(50,[0,3,9,11,13,...|  0.0|
|            Mortgage|(50,[4,5,6,9,10,1...|  2.0|
+--------------------+--------------------+-----+
only showing top 5 rows



In [33]:
# Preview five rows of the test set.
test.show(5)

+--------------------+--------------------+-----+
|             label_f|            features|label|
+--------------------+--------------------+-----+
|Bank account or s...|(50,[0,1,3,4,5,8,...|  8.0|
|Bank account or s...|(50,[0,1,3,5,8,9,...|  8.0|
|Bank account or s...|(50,[0,1,5,8,11,1...|  8.0|
|Bank account or s...|(50,[0,1,5,8,21,2...|  8.0|
|Bank account or s...|(50,[1,2,5,8,11,1...|  8.0|
+--------------------+--------------------+-----+
only showing top 5 rows



__Another way to build the sampling fractions__


```python
classes = cc_df.groupBy("label").count().count()

dicts = {}
keys = range(classes)
for i in keys:
    dicts[i] = 0.8

train = cc_df.sampleBy("label", fractions = dicts)
train = train.cache()
test = cc_df.subtract(train)
```

### MultiLayerPerceptron

In [35]:
# Specify the layers of the neural network:
# - input layer with one unit per lexicon word (the feature vector length),
# - three hidden layers of 40, 30 and 20 units,
# - output layer with one unit per class.
# The number of classes comes from the fitted StringIndexer rather than a
# hard-coded constant, so the network keeps matching the data if the set of
# labels changes.
num_classes = len(stringIndexerModel.labels)
layers = [lexicon_size,40,30,20,num_classes]

# Create the trainer and set its parameters.
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# Fit on the training split and predict on the held-out test split.
model = trainer.fit(train)
result = model.transform(test)

# Accuracy of the predictions against the true numeric labels.
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.5611346249901417
