In [124]:
import numpy as np
import pandas as pd
import nltk
import re
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 

In [125]:
filename = 'Data/mergedDataSet.csv' #The complete file
df = pd.read_csv(filename).iloc[0:10000]

In [126]:
stop = open('stop_words.txt','r').read().split()
def RemoveStopWords(row):
    row = row.lower().split() # converting to lower case and splitting
    str1 = ''
    for item in row:
        if item not in stop: #removing stop words
            item = re.sub(r'[^\w\s]','',item) #removing punctutions
            str1 += (item + ' ')
    return str1
target = df['merged_rating'].values #Saving the target variable as a numpy array
df = df['comment_text'].apply(RemoveStopWords)

In [128]:
commentsFileName = 'Data/comments.csv'

df.to_csv(commentsFileName,index=False) # Writing the comments to a CSV file to be read by TaggedLineDocument next
documents = TaggedLineDocument(commentsFileName) # Tags each sentence (0,1,2,...)

In [129]:
modelSize = 500 # Will represent each comment with a vector of size 500
modelWindow = 8 
model = Doc2Vec(documents, vector_size=modelSize, window=modelWindow, min_count=1, workers=4)

In [130]:
#Creating a numpy array to keep the data and to be used by a machine learning model as the feature vector
data = np.zeros((len(model.docvecs),len(model.docvecs[0])))
for i in range(len(model.docvecs)):
    data[i]=model.docvecs[i]

In [131]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(data, target)

dfData = pd.DataFrame(data=X_res,dtype=float)
dfData['label']=y_res
dfData.to_csv('Data/FeaturizedData.csv')

In [None]:
# LET THE PYSPARK BEGIN!!!

In [109]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
import ast
from pyspark.ml import Pipeline
from pyspark.sql import functions as F, types
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [51]:
# schema = types.StructType([
#     types.StructField("features", types.StringType(), True),
#     types.StructField("label", types.IntegerType(), True),

# ])

In [139]:
df = spark.read.csv('Data/FeaturizedData.csv',header=True)
column_names = df.schema.names[1:]

df.schema
df = df.select(*(F.col(c).cast("float").alias(c) for c in column_names))
df = df.withColumn("label", df["label"].cast(types.IntegerType()))
# df.schema


In [140]:
vec_assemb = VectorAssembler(inputCols=column_names, outputCol="Vecfeatures")
vec_assemb.transform(df)#.show().Vecfeatures

(trainingData, testingData) = df.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="Vecfeatures", numTrees=10)
pipeline = Pipeline(stages=[vec_assemb, rf])
ML_model = pipeline.fit(trainingData)
predictions = ML_model.transform(testingData)

#df.schema.names#[0]
# def convertTolist(vec):
#     return vec


In [141]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
accuracy = evaluator.evaluate(predictions)

In [142]:
print(accuracy*100)
# predictions.schema.names

83.09411914136923


In [108]:
predictions.select("prediction").write.option("sep", ",").save("predictions", format="csv", mode="overwrite")


In [166]:
#sentence = "All of my edits are good. Cunts like you who revert good edits because you're too stupid to understand how to write well , and then revert other edits just because you've decided to bear a playground grudge, are the problem.  Maybe one day you'll realise the damage you did to a noble project.  201.215.187.159"
#sentence = "fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck fuck"
sentence = "you fucking asshole"
#sentence = "sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry sweet starwberry"
test = model.infer_vector(RemoveStopWords(sentence)).reshape(1,modelSize)
pandas_df = pd.DataFrame(test)
pandas_df['label'] = np.array([0])

In [167]:
spark_df = spark.createDataFrame(pandas_df)
sample_pred = ML_model.transform(spark_df)


In [168]:
sample_pred.select('prediction').show()

+----------+
|prediction|
+----------+
|       2.0|
+----------+

