In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Natural Language Processing")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

In [None]:
# Load the chat transcripts with Spark's built-in CSV reader; the
# 'com.databricks.spark.csv' format name is only a legacy alias for it on
# Spark 2+. The first row is treated as the header and column types are
# inferred from the data.
df = spark.read.csv('TherapyBotSession.csv', header=True, inferSchema=True)


In [None]:
# Preview the raw data: up to 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

In [None]:
# Keep only the columns the rest of the notebook works with.
columns_to_keep = ['id', 'label', 'chat']
df = df.select(*columns_to_keep)

In [None]:
# Confirm the projection: up to 20 rows of id / label / chat.
df.show(n=20, truncate=True)

In [None]:
# Class balance: number of rows per label, largest group first.
label_counts = df.groupBy("label").count()
label_counts.orderBy("count", ascending=False).show()

In [None]:
import pyspark.sql.functions as F
# word_count = number of pieces obtained by splitting the chat text on single spaces.
chat_tokens = F.split(F.col('chat'), ' ')
df = df.withColumn('word_count', F.size(chat_tokens))

In [None]:
# Inspect the new word_count column: up to 20 rows, long values truncated.
df.show(n=20, truncate=True)

In [None]:
# Mean message length per label, longest first.
avg_word_count_by_label = df.groupBy('label').agg(
    F.avg('word_count').alias('avg_word_count')
)
avg_word_count_by_label.orderBy('avg_word_count', ascending=False).show()


In [None]:
# Pull only id and word_count to the driver as a pandas DataFrame for plotting.
word_count_columns = df.select('id', 'word_count')
df_plot = word_count_columns.toPandas()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of word count per message id.
# The index is set on a derived frame instead of in place, so re-running this
# cell no longer raises KeyError once 'id' has already been moved into the index.
ax = df_plot.set_index('id').plot(kind='bar', figsize=(16, 6))
ax.set_ylabel('Word Count')
ax.set_title('Word Count distribution')
plt.show()

In [None]:
from textblob import TextBlob
def sentiment_score(chat):
    """Return the TextBlob sentiment polarity of a chat message.

    Args:
        chat: The message text, or None for a missing value.

    Returns:
        The value of ``TextBlob(chat).sentiment.polarity``, or None when
        ``chat`` is None.
    """
    # When this runs as a Spark UDF, null cells arrive as None. TextBlob
    # rejects non-string input, which would abort the whole job, so pass
    # the null through instead.
    if chat is None:
        return None
    return TextBlob(chat).sentiment.polarity

In [None]:
from pyspark.sql.types import FloatType
# Expose sentiment_score as a Spark UDF that yields a FloatType column.
sentiment_score_udf = F.udf(sentiment_score, FloatType())


In [None]:
# Add the per-message sentiment score next to the existing columns.
polarity_column = sentiment_score_udf(F.col('chat')).alias('sentiment_score')
df = df.select('id', 'label', 'chat', 'word_count', polarity_column)
df.show()

In [None]:
# Mean sentiment score per label, highest first.
avg_sentiment_by_label = df.groupBy('label').agg(
    F.avg('sentiment_score').alias('avg_sentiment_score')
)
avg_sentiment_by_label.orderBy('avg_sentiment_score', ascending=False).show()

In [None]:
# Split each chat message on single spaces into an array column named 'words'.
words_column = F.split(F.col('chat'), ' ')
df = df.withColumn('words', words_column)
df.show()

In [None]:
# Stop words to strip from the tokenised chats. Kept as one whitespace-separated
# block and split into a list, which is easier to scan and edit than a long
# list of quoted literals.
stop_words = """
    i me my myself we our ours ourselves
    you your yours yourself yourselves he him
    his himself she her hers herself it its
    itself they them their theirs themselves
    what which who whom this that these those
    am is are was were be been being have
    has had having do does did doing a an
    the and but if or because as until while
    of at by for with about against between
    into through during before after above below
    to from up down in out on off over under
    again further then once here there when where
    why how all any both each few more most
    other some such no nor not only own same
    so than too very can will just don should now
""".split()

In [None]:
from pyspark.ml.feature import StopWordsRemover 

In [None]:
# Transformer that copies 'words' into 'words without stop', dropping every
# token that appears in the custom stop_words list.
stopwordsRemovalFeature = StopWordsRemover(
    inputCol="words",
    outputCol="words without stop",
    stopWords=stop_words,
)

In [None]:
from pyspark.ml import Pipeline
# Single-stage pipeline around the stop-word remover, fitted on the current frame.
stopword_stages = [stopwordsRemovalFeature]
stopWordRemovalPipeline = Pipeline(stages=stopword_stages)
pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(df)

In [None]:
# Apply the fitted pipeline, then compare tokens before and after removal (first 5 rows).
df = pipelineFitRemoveStopWords.transform(df)
df.select(['words', 'words without stop']).show(n=5)

In [None]:
def label(col_name):
    """Return a float Column that is 1.0 where `col_name` equals 'escalate', else 0.0.

    Built from native column expressions rather than a Python UDF, so rows are
    not sent to a Python worker just for a string comparison. Null values map
    to 0.0, exactly as they did with the UDF.

    Args:
        col_name: Name of the column holding the raw string label.
    """
    return F.when(F.col(col_name) == 'escalate', 1.0).otherwise(0.0).cast('float')


# Encode only while 'label' still holds the raw strings. Running this cell a
# second time would otherwise compare the already-encoded floats against
# 'escalate' and silently turn every label into 0.0.
if dict(df.dtypes)['label'] == 'string':
    df = df.withColumn('label', label('label'))

In [None]:
# Check the encoded label values: up to 20 rows.
df.select('label').show(n=20, truncate=True)

In [None]:
import pyspark.ml.feature as feat
# Stage 1: hash the stop-word-free tokens into a 100000-slot term-frequency vector.
TF_ = feat.HashingTF(
    numFeatures=100000,
    inputCol="words without stop",
    outputCol="rawFeatures",
)
# Stage 2: apply IDF weighting to those raw vectors to produce 'features'.
IDF_ = feat.IDF(inputCol="rawFeatures", outputCol="features")

In [None]:
# Chain the two feature stages: term-frequency hashing followed by IDF.
tfidf_stages = [TF_, IDF_]
pipelineTFIDF = Pipeline(stages=tfidf_stages)

In [None]:
# Fit the TF-IDF pipeline on the full frame, then add its output columns to it.
pipelineFit = pipelineTFIDF.fit(dataset=df)
df = pipelineFit.transform(dataset=df)

In [None]:
# Show the label next to its raw and IDF-weighted feature vectors.
feature_preview_columns = ['label', 'rawFeatures', 'features']
df.select(feature_preview_columns).show()

In [None]:
# 75% / 25% train/test split with a fixed seed.
split_weights = [0.75, 0.25]
trainingDF, testDF = df.randomSplit(split_weights, seed=1234)

In [None]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator with regularisation parameter 0.025; the
# column names are spelled out (they match the estimator's defaults).
logreg = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    regParam=0.025,
)

In [None]:
# Train the classifier on the training split.
logregModel = logreg.fit(dataset=trainingDF)

In [None]:
# Score the held-out test split with the fitted model.
predictionDF = logregModel.transform(dataset=testDF)

In [None]:
# Compare the true label with the model's probability and prediction columns.
prediction_preview_columns = ['label', 'probability', 'prediction']
predictionDF.select(prediction_preview_columns).show()

In [None]:
# Cross-tabulate true labels against predictions.
confusion_counts = predictionDF.crosstab('label', 'prediction')
confusion_counts.show()

In [None]:
from sklearn import metrics
# Collect labels and predictions in a single Spark job. Two separate
# toPandas() calls each recompute predictionDF and only line up if both runs
# return rows in the same order; slicing one collected frame keeps the two
# columns row-aligned by construction and halves the work.
label_and_prediction = predictionDF.select('label', 'prediction').toPandas()
actual = label_and_prediction[['label']]
predicted = label_and_prediction[['prediction']]

In [None]:
# Scale to a percentage first and let the format spec round to one decimal.
# Rounding and then multiplying by 100 can print float artifacts
# (e.g. 0.57 * 100 == 56.99999999999999).
accuracy = metrics.accuracy_score(actual, predicted)
print('accuracy score: {:.1f}%'.format(accuracy * 100))

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

scores = predictionDF.select('label', 'rawPrediction')
# Spell out the columns and metric being evaluated (these match the
# evaluator's documented defaults, so the computed value is unchanged).
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
# Scale to a percentage first and format to one decimal; rounding before
# multiplying by 100 can print float artifacts.
print('The ROC score is {:.1f}%'.format(evaluator.evaluate(scores) * 100))

In [None]:
# Summary statistics of the label column in the scored test set.
label_summary = predictionDF.describe('label')
label_summary.show()