# <center>Model Training (v3)</center>

<br>
<br>
<p>First we run the previously developed code.</p>
<br>
<br>

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ml_tweet_sentiment').getOrCreate()
df = spark.read.csv('training_data.csv', header = True, inferSchema = True)
df.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- text: string (nullable = true)



In [2]:
ts = spark.read.csv('test_data.csv', header = True, inferSchema = True)
ts.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- text: string (nullable = true)



In [3]:
df_train = df.select(df.target.alias("label"), df.text)
df_train.show(5)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0|@switchfoot http:...|
|    0|is upset that he ...|
|    0|@Kenichan I dived...|
|    0|my whole body fee...|
|    0|@nationwideclass ...|
+-----+--------------------+
only showing top 5 rows



In [4]:
df_test = ts.select(ts.target.alias("label"), ts.text)
df_test.show(5)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    1|@stellargirl I lo...|
|    1|Reading my kindle...|
|    1|Ok, first assesme...|
|    1|@kenburbary You'l...|
|    1|@mikefish  Fair e...|
+-----+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import  Tokenizer, HashingTF, IDF


tok = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="tf", numFeatures=500)
idf = IDF(inputCol="tf", outputCol="features")


In [None]:
#feat_pipeline = Pipeline(stages=[tok, tf, idf])

#feat_model = feat_pipeline.fit(df)
#features = feat_model.transform(df)

In [None]:
#test_model = feat_pipeline.fit(ts)
#test = test_model.transform(ts)

<br>
<br>
<p>Now we'll train the model.</p>
<br>
<br>

In [6]:
from pyspark.ml.classification import GBTClassifier


gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

In [7]:
from pyspark.ml import Pipeline


pipeline = Pipeline(stages=[tok, tf, idf, gbt])

In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("label")

In [9]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


paramGrid = ParamGridBuilder() \
    .addGrid(tf.numFeatures, [50, 100]) \
    .addGrid(gbt.maxBins, [2,4]) \
    .addGrid(gbt.maxDepth, [2,4]) \
    .build()


In [10]:
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=4)

In [11]:
model = crossval.fit(df_train)

In [12]:
prediction = model.transform(df_test)

In [13]:
binEval.evaluate(prediction)

0.6100278551532033