In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer
from pyspark.sql.functions import length, size, lit
from pyspark.sql import SQLContext
# Wrap the existing SparkContext in a SQLContext handle.
# NOTE(review): `sc` is not defined in this notebook; presumably it is injected
# by the notebook environment -- confirm before running on a fresh kernel.
sqlContext = SQLContext(sparkContext=sc)

In [2]:
# Alternative loader kept for reference: reads the raw CSV directly instead of
# the registered table. The CSV option is spelled `inferSchema`.
# NOTE(review): the path below is machine-specific; replace it with a
# configurable location before re-enabling this line.
## dataframe = sqlContext.read.format('csv').options(header='true', inferSchema='true').load('C:/Users/jazmi/OneDrive/Documents/DataMate/db/all.csv')
# Load the source rows from the "all_csv" table.
# NOTE(review): `spark` is not defined in this notebook; presumably the session
# is provided by the environment -- confirm.
df = spark.table("all_csv")

# Last expression of the cell: displays the list of column names.
df.columns

In [3]:
# Keep only the columns the model needs and discard rows with any null in them.
df = df.select('username', 'about_me_text', 'Gender').dropna()

# Character count of the free-text field, added as a numeric 'length' column.
about_me_length = length(df['about_me_text'])
df_length = df.withColumn('length', about_me_length)

# Encode the 'Gender' strings as numeric indices in a new 'label' column.
male_female_to_num = StringIndexer(inputCol='Gender', outputCol='label')
gender_index_model = male_female_to_num.fit(df_length)

df = gender_index_model.transform(df_length)
df.show()

In [4]:
# Split 'about_me_text' into a 'token_text' array column.
# (`tokened_df` is the Tokenizer transformer itself, not a DataFrame.)
tokened_df = Tokenizer(inputCol="about_me_text", outputCol="token_text")
tokenized = tokened_df.transform(df)

# Keep only rows whose text produced more than 10 tokens.
has_enough_tokens = size(tokenized["token_text"]) > 10
tt_df = tokenized.filter(has_enough_tokens)

tt_df.show()


In [5]:
# Filter stop words out of 'token_text'; the result goes to 'stop_tokens'.
stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_tokens',
)
removed_frame = stop_remove.transform(tt_df)

# Show the default 20 rows without truncating the (long) token arrays.
removed_frame.show(n=20, truncate=False)

In [6]:
# Run the hashing term frequency on the stop-word-filtered tokens.
# The stop-word removal step writes its result to 'stop_tokens'; hashing
# 'token_text' instead would feed the unfiltered tokens to the model and
# silently discard that step.
hashing = HashingTF(inputCol='stop_tokens', outputCol="hashedValues")

# Transform into a DF carrying the term-frequency vectors in 'hashedValues'.
hashed_df = hashing.transform(removed_frame)
hashed_df.show()

In [7]:
# Configure the inverse-document-frequency estimator over the hashed term
# frequencies, with minDocFreq set to 10.
idf = IDF(
    inputCol="hashedValues",
    outputCol="idf_features",
    minDocFreq=10,
)

# Learn the IDF weights from the data set, then apply them to the same frame.
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [8]:
# Preview the rescaled frame (Spark's defaults -- 20 rows, truncated cells --
# written out explicitly).
rescaledData.show(n=20, truncate=True)
# Narrower alternative view, kept disabled:
#rescaledData.select("token_text", "idf_features").show()

In [9]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Combine the TF-IDF vector and the text length into one 'features' vector.
feature_columns = ['idf_features', 'length']
clean_up = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
cleaned_data = clean_up.transform(rescaledData)

In [10]:
# Preview the label next to the TF-IDF vector it will be learned from.
cleaned_data.select('label', 'idf_features').show()

In [11]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set (70%) and a testing set (30%).
# A fixed seed keeps the split -- and therefore the metric reported later --
# stable across re-runs instead of changing on every execution.
training, testing = cleaned_data.randomSplit([0.7, 0.3], seed=42)

# Create a Naive Bayes model and fit training data.
# featuresCol/labelCol are spelled out for readability; they equal the
# estimator defaults and name the columns built earlier in the notebook
# ('features' from the VectorAssembler, 'label' from the StringIndexer).
nb = NaiveBayes(featuresCol='features', labelCol='label')
predictor = nb.fit(training)

In [12]:
# Score the held-out rows with the fitted model and preview the first five.
test_results = predictor.transform(testing)
test_results.show(n=5)

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# MulticlassClassificationEvaluator defaults to the F1 metric, so accuracy must
# be requested explicitly for the printed label to match the number reported.
# labelCol/predictionCol are the evaluator defaults, written out for clarity.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)