In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer
from pyspark.sql.functions import length, size
from pyspark.sql import SQLContext
# Build a SQLContext on top of the SparkContext `sc`. `sc` is not defined anywhere in this
# notebook, so it is presumably injected by the notebook runtime -- TODO confirm.
# NOTE(review): no executable line visible in this notebook uses `sqlContext`; the only
# reference is in the disabled CSV loader in the next cell.
sqlContext = SQLContext(sc)
import nltk

In [2]:
# Disabled alternative: read the data from a local CSV file through `sqlContext` instead of
# from a table. The option name is corrected to `inferSchema` here (it was spelled
# `inferScheme`), and the path is machine-specific, so it must be adjusted before re-enabling.
# dataframe = sqlContext.read.format('csv').options(header='true', inferSchema='true').load('C:/Users/jazmi/OneDrive/Documents/DataMate/db/all.csv')

# Load the source rows from the table named "all_csv". `spark` is not defined in this
# notebook; presumably the runtime provides the SparkSession -- TODO confirm.
df = spark.table("all_csv")

# Show the available column names as the cell output.
df.columns

In [3]:
# Keep only the three columns used below and drop every row that has a null in any of them.
needed_columns = ['username', 'about_me_text_clean', 'Gender']
df = df.select(*needed_columns).na.drop()

# Add the character length of the cleaned bio text as a numeric column, then preview the result.
df_length = df.withColumn('length', length('about_me_text_clean'))
df_length.show()

In [4]:
# Feature-engineering stages. Column flow:
#   Gender              -> label
#   about_me_text_clean -> token_text -> stop_tokens -> hashedValues -> idf_features

# Encode the string Gender column as a numeric 'label' column.
male_female_to_num = StringIndexer(inputCol='Gender', outputCol='label')

# Split the cleaned bio text into a list of tokens.
tokened_df = Tokenizer(inputCol="about_me_text_clean", outputCol="token_text")

# Remove stop words from the token list.
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')

# Hash the stop-word-filtered tokens into term-frequency vectors.
# Fix: this stage previously read 'token_text', which bypassed the StopWordsRemover stage
# entirely -- its 'stop_tokens' output was computed but never consumed.
hashing = HashingTF(inputCol='stop_tokens', outputCol="hashedValues")

# Rescale the term frequencies by inverse document frequency; minDocFreq=10 is the minimum
# number of documents a term must appear in to be kept by the IDF stage.
idf = IDF(minDocFreq = 10, inputCol="hashedValues", outputCol="idf_features")


In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Assemble the TF-IDF vector and the text-length column into the single 'features' vector column.
feature_inputs = ['idf_features', 'length']
clean_up = VectorAssembler(
    inputCols=feature_inputs,
    outputCol='features',
)

In [6]:
from pyspark.ml import Pipeline
# Chain the preparation stages in the order they must run: label encoding, tokenizing,
# stop-word removal, term hashing, IDF weighting, and finally feature-vector assembly.
prep_stages = [
    male_female_to_num,
    tokened_df,
    stop_remove,
    hashing,
    idf,
    clean_up,
]
data_prep_pipeline = Pipeline(stages=prep_stages)

In [7]:
# Fit the preparation pipeline on the length-augmented frame, then apply the fitted
# pipeline to that same frame to produce the prepared rows.
# NOTE(review): the pipeline is fitted on all rows here, while the train/test split is only
# taken later from `cleaned`, so the fitted stages have seen the test rows -- confirm this
# is acceptable for the evaluation.
cleaner = data_prep_pipeline.fit(df_length)
cleaned = cleaner.transform(df_length)

In [8]:
# Preview the two columns the classifier consumes: the numeric label and the feature vector.
cleaned.select('label', 'features').show()

In [9]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
# Split the prepared rows into a training set and a testing set with 70/30 weights.
# A fixed seed keeps the split -- and therefore the metric reported at the end of the
# notebook -- repeatable across runs; without it every run samples a different split.
SPLIT_SEED = 42
training, testing = cleaned.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Create a Naive Bayes model with default settings and fit it on the training rows.
nb = NaiveBayes()
predictor = nb.fit(training)

In [10]:
# Apply the fitted model to the held-out rows and preview the first five results.
test_results = predictor.transform(testing)
test_results.show(n=5)

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# MulticlassClassificationEvaluator reports F1 by default, so request accuracy explicitly;
# otherwise the value printed below would be an F1 score labelled as accuracy.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)