<a href="https://colab.research.google.com/github/eliasboughosn/Spark-ML-Algorithms/blob/main/EliasBoughosnSparkProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading the data from the Cloud Storage bucket

In [None]:
# Copy every object in the gs://projecteliasbg bucket into the current working directory.
!gsutil cp gs://projecteliasbg/* .

Copying gs://projecteliasbg/reviews.csv...
- [1 files][ 16.0 MiB/ 16.0 MiB]                                                
Operation completed over 1 objects/16.0 MiB.                                     


### Copying from local filesystem to the HDFS

In [None]:
!hdfs dfs -put reviews.csv /user/eliasboughosn

### Creating spark session

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already running local session/context, so re-running
# this cell does not fail the way a second SparkContext('local') call does.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [None]:
# HDFS location handed to spark.read.csv in the next cell.
# NOTE(review): this is the destination given to `hdfs dfs -put`, not an explicit
# file path; presumably a directory that contains reviews.csv -- confirm.
reviews_on_hdfs = "/user/eliasboughosn"

In [None]:
# Read the reviews CSV from HDFS (no header option is set, so the columns get
# the default names _c0, _c1, ...) and keep a ~10% random sample.
# The seed is fixed so that the row counts shown below are reproducible.
reviews = spark.read.csv(reviews_on_hdfs).sample(fraction=0.1, seed=42)
reviews.show(n=5)

+---+--------------------+---+
|_c0|                 _c1|_c2|
+---+--------------------+---+
|  5|Great course - I ...|  5|
| 27|Great content! Le...|  5|
| 29|I found this cour...|  5|
| 44|A really challeng...|  5|
| 48|I'll start by say...|  3|
+---+--------------------+---+
only showing top 5 rows



In [None]:
# Give the default CSV column names (_c0, _c1, _c2) meaningful names.
reviews = reviews.select(
    reviews["_c0"].alias("id"),
    reviews["_c1"].alias("comments"),
    reviews["_c2"].alias("label")
)
reviews.show()
reviews.printSchema()

+---+--------------------+-----+
| id|            comments|label|
+---+--------------------+-----+
|  5|Great course - I ...|    5|
| 27|Great content! Le...|    5|
| 29|I found this cour...|    5|
| 44|A really challeng...|    5|
| 48|I'll start by say...|    3|
| 54|    very interesting|    5|
| 76|Great review of w...|    5|
| 79|A lot of speaking...|    1|
| 86|Great overview an...|    5|
| 87|So great! If you ...|    5|
|108|This course was f...|    5|
|109|Great course. Can...|    5|
|128|Very nice introdu...|    5|
|148|Great content, th...|    5|
|153|An excellent cour...|    5|
|158|        Great Class!|    5|
|161|A good Introducti...|    4|
|178|It was a perfect ...|    5|
|179|"Best place to ge...|    5|
|182|This is a really ...|    5|
+---+--------------------+-----+
only showing top 20 rows

root
 |-- id: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- label: string (nullable = true)



### Importing the libraries we need

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, Word2Vec

### Counting the rows that have no NA values

In [None]:
# Count the rows that have no null in any column.
# The result of dropna() is not assigned, so `reviews` itself is left unchanged.
reviews.dropna().count()

10719

In [None]:
# Cast the label to an integer (values that cannot be parsed become null) and
# make the string type of the other two columns explicit.
df = reviews.select(
    reviews["label"].cast("int").alias("label"),
    reviews["comments"].cast("string").alias("comments"),
    reviews["id"].cast("string").alias("id")
)
df.printSchema()

root
 |-- label: integer (nullable = true)
 |-- comments: string (nullable = true)
 |-- id: string (nullable = true)



In [None]:
# Collect the DataFrame to pandas on the driver and tally the rows per label value.
df.toPandas()['label'].value_counts()

5.0    7784
4.0    1817
3.0     522
1.0     248
2.0     214
Name: label, dtype: int64

In [None]:
# Number of rows left once every row containing a null is discarded
# (the result is only displayed; `df` is not modified).
df.na.drop().count()

10585

In [None]:
# Keep only rows whose label is a valid star rating (1-5). `label` is an integer
# column, so it is compared with integers rather than with string literals.
# Rows with a null label are removed as well, because isin() is not true for null.
df = df.filter(df["label"].isin(1, 2, 3, 4, 5))

In [None]:
# Sanity check: the total row count and the null-free row count should match.
total_rows = df.count()
complete_rows = df.dropna().count()
print(total_rows)
print(complete_rows)

10585
10585


In [None]:
# List the distinct label values that remain after the filter above.
df.toPandas()['label'].unique()

array([5, 3, 1, 4, 2])

In [None]:
# Collect to pandas and order the rows by label.
# From here on `df` is a pandas DataFrame until it is converted back to Spark.
df = df.toPandas().sort_values(by='label')


In [None]:
# Row count per label in the pandas frame.
df['label'].value_counts()

5    7784
4    1817
3     522
1     248
2     214
Name: label, dtype: int64

### We can observe that the data is imbalanced, and most of the label values are 4 and 5

In [None]:
# Disabled experiment (kept commented out): it would drop the last 5000 rows
# of the label-sorted frame in place.
# df.drop(df.tail(5000).index, 
#         inplace = True)

In [None]:
# Row count per label again; unchanged, because the cell above is commented out.
df['label'].value_counts()

5    7784
4    1817
3     522
1     248
2     214
Name: label, dtype: int64

### Shuffling the data (a 100% sample, i.e. all rows in random order)

In [None]:
# frac=1 keeps every row and only randomises their order (undoing the sort by label).
# random_state is fixed so the shuffle is the same on every run.
df = df.sample(frac=1, random_state=42)

In [None]:
# Transform the pandas DataFrame back into a Spark DataFrame.
# The SparkSession is the entry point for this; getOrCreate() returns the
# session that is already running instead of building a separate legacy SQLContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

df = spark.createDataFrame(df)

### First approach:
#### I will use a pipeline to preprocess the data and then apply machine learning
### Second approach:
#### I will use a full pipeline including machine learning and grid search

### First approach:

### linear regression

In [None]:
# Preprocessing: split each comment into word tokens, then turn the tokens of a
# comment into a single Word2Vec feature vector (column "model").
tokenizer = Tokenizer(inputCol="comments", outputCol="words")
word2vec = Word2Vec(inputCol="words", outputCol="model")
data_preparation = Pipeline(stages=[tokenizer, word2vec])

preparation_model = data_preparation.fit(df)
prepared_reviews = preparation_model.transform(df)
prepared_reviews.select("comments", "words", "model").show(n=5)

# Elastic-net regularised linear regression of the rating on the Word2Vec vector.
# Note: it is fitted on the full prepared data set; there is no hold-out split here.
regression = LinearRegression(
    featuresCol="model",
    labelCol="label",
    maxIter=100,
    regParam=0.2,
    elasticNetParam=0.5
)
linearModel = regression.fit(prepared_reviews)

print("coefficients: " + str(linearModel.coefficients))
print("intercept: " + str(linearModel.intercept))

+--------------------+--------------------+--------------------+
|            comments|               words|               model|
+--------------------+--------------------+--------------------+
|Excellent course....|[excellent, cours...|[0.07188699306623...|
|worthy enrolling ...|[worthy, enrollin...|[0.10604066448286...|
|Very informative ...|[very, informativ...|[-0.0231658152949...|
|Clear concise and...|[clear, concise, ...|[0.02680088599630...|
|I think this cour...|[i, think, this, ...|[0.10638202540576...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

coefficients: [0.0,0.0,0.0,0.0,1.1225630310265953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.4562507819602137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3805755555539179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0

In [None]:
# List the columns available after the preparation pipeline has run.
prepared_reviews.columns

['label', 'comments', 'id', 'words', 'model']

### Evaluating linear regression model

In [None]:
# Metrics computed on the training data by the fitted linear model.
trainingSummary = linearModel.summary
rmse_train = trainingSummary.rootMeanSquaredError
r2_train = trainingSummary.r2
print("RMSE: %f" % rmse_train)
print("r2: %f" % r2_train)

RMSE: 0.825487
r2: 0.085834


In [None]:
# Score the same data the model was trained on and compare prediction with label.
train_predict = linearModel.transform(prepared_reviews)
preview_columns = ["prediction", "label", "model"]
train_predict.select(*preview_columns).show()

+------------------+-----+--------------------+
|        prediction|label|               model|
+------------------+-----+--------------------+
| 4.598560979622734|    5|[0.07188699306623...|
| 4.735281438981742|    4|[0.10604066448286...|
| 4.657333868577057|    4|[-0.0231658152949...|
| 4.610328518850617|    5|[0.02680088599630...|
| 4.626508865595196|    5|[0.10638202540576...|
| 4.533753977265069|    5|[0.01472329441457...|
| 4.570310760248255|    4|[0.01319493338898...|
| 4.486518133759836|    5|[8.09852240814103...|
| 4.409366283336921|    3|[0.00687562356636...|
| 4.466434298227776|    5|[-0.0104584770339...|
| 4.540266883884273|    5|[0.01682652394040...|
| 4.601135136909387|    4|[0.07849886112559...|
|4.7216647135005525|    5|[-0.0129273744443...|
|  4.59559739843674|    5|[-0.0083961978397...|
|4.5352972389821975|    5|[0.06108043555702...|
| 4.684430671966759|    5|[0.10123305656015...|
| 4.855857876039043|    5|[0.22891789426406...|
| 4.445840333618247|    5|[0.00492196508

### logistic regression

In [None]:
# Multinomial logistic regression on the Word2Vec features, treating each
# rating value as a class.
lr = LogisticRegression(
    featuresCol="model",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8
)
lrModel = lr.fit(prepared_reviews)

coefficient_matrix = lrModel.coefficientMatrix
intercept_vector = lrModel.interceptVector
print("Multinomial coefficients: " + str(coefficient_matrix))
print("Multinomial intercepts: " + str(intercept_vector))

Multinomial coefficients: 6 X 100 CSRMatrix

Multinomial intercepts: [-5.832772742903181,-0.03915056495430942,-0.1880622090410842,0.7078640743818978,1.952217397488975,3.399904045027701]


### Evaluating logistic regression model

In [None]:
# Keep the training summary of the logistic model and show its predictions
# on the data it was trained on.
trainingSummary = lrModel.summary
train_predict = lrModel.transform(prepared_reviews)
preview_columns = ["prediction", "label", "model"]
train_predict.select(*preview_columns).show()

+----------+-----+--------------------+
|prediction|label|               model|
+----------+-----+--------------------+
|       5.0|    5|[0.07188699306623...|
|       5.0|    4|[0.10604066448286...|
|       5.0|    4|[-0.0231658152949...|
|       5.0|    5|[0.02680088599630...|
|       5.0|    5|[0.10638202540576...|
|       5.0|    5|[0.01472329441457...|
|       5.0|    4|[0.01319493338898...|
|       5.0|    5|[8.09852240814103...|
|       5.0|    3|[0.00687562356636...|
|       5.0|    5|[-0.0104584770339...|
|       5.0|    5|[0.01682652394040...|
|       5.0|    4|[0.07849886112559...|
|       5.0|    5|[-0.0129273744443...|
|       5.0|    5|[-0.0083961978397...|
|       5.0|    5|[0.06108043555702...|
|       5.0|    5|[0.10123305656015...|
|       5.0|    5|[0.22891789426406...|
|       5.0|    5|[0.00492196508372...|
|       5.0|    3|[0.00320507082086...|
|       5.0|    5|[0.12744884720693...|
+----------+-----+--------------------+
only showing top 20 rows



### RandomForest classifier

In [None]:
# Random forest on the Word2Vec features. The estimator and the fitted model get
# their own names (rf / rfModel) so they no longer overwrite the logistic
# regression objects lr / lrModel used by the cells above.
rf = RandomForestClassifier(featuresCol="model", labelCol="label")
rfModel = rf.fit(prepared_reviews)

# Predictions on the training data itself (no hold-out set in this approach).
train_predict = rfModel.transform(prepared_reviews)
train_predict.select("prediction", "label", "model").show()

+----------+-----+--------------------+
|prediction|label|               model|
+----------+-----+--------------------+
|       5.0|    5|[0.07188699306623...|
|       5.0|    4|[0.10604066448286...|
|       5.0|    4|[-0.0231658152949...|
|       5.0|    5|[0.02680088599630...|
|       5.0|    5|[0.10638202540576...|
|       5.0|    5|[0.01472329441457...|
|       5.0|    4|[0.01319493338898...|
|       5.0|    5|[8.09852240814103...|
|       4.0|    3|[0.00687562356636...|
|       5.0|    5|[-0.0104584770339...|
|       5.0|    5|[0.01682652394040...|
|       5.0|    4|[0.07849886112559...|
|       5.0|    5|[-0.0129273744443...|
|       5.0|    5|[-0.0083961978397...|
|       5.0|    5|[0.06108043555702...|
|       5.0|    5|[0.10123305656015...|
|       5.0|    5|[0.22891789426406...|
|       5.0|    5|[0.00492196508372...|
|       5.0|    3|[0.00320507082086...|
|       5.0|    5|[0.12744884720693...|
+----------+-----+--------------------+
only showing top 20 rows



## second approach

### linear regression

In [None]:
# Full pipeline: tokenise -> Word2Vec features -> linear regression on the label.
tokenize_stage = Tokenizer(inputCol="comments", outputCol="words")
embed_stage = Word2Vec(inputCol="words", outputCol="model")
regress_stage = LinearRegression(featuresCol="model", labelCol="label")

pipeline = Pipeline(stages=[tokenize_stage, embed_stage, regress_stage])

#### Parameter grid builder for linear regression

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# addGrid() must be given Param objects that belong to the estimator being tuned.
# Plain name strings are never matched to the estimator (every grid point then
# trains the same default model) and newer PySpark versions reject them with a
# TypeError. The Params are taken from the last pipeline stage, the regressor.
linear_stage = pipeline.getStages()[-1]

param_grid = (ParamGridBuilder()
    .addGrid(linear_stage.regParam, [0.1, 0.01])
    .addGrid(linear_stage.fitIntercept, [False, True])
    .addGrid(linear_stage.elasticNetParam, [0.0, 0.5, 1.0])
    .build())

In [None]:
# Work on a cached 90% sample of the data. The seed makes the sample, and
# therefore the seeded train/test split taken from it, reproducible.
debug_data = df.sample(fraction=0.9, seed=12345).cache()

In [None]:
# Split the sampled data 90% / 10% into training and test sets (fixed seed).
split_weights = [0.9, 0.1]
train, test = debug_data.randomSplit(split_weights, seed=12345)

In [None]:
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# Fit the pipeline once per parameter map and keep the model that scores best
# under the default RegressionEvaluator on the internal validation split.
linear_search = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator()
)
LinearRegressionmodel = linear_search.fit(train)

### Evaluating linear regression model

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Builds an accuracy evaluator but neither assigns nor applies it; only its
# repr is displayed as the cell output.
MulticlassClassificationEvaluator(metricName="accuracy")

MulticlassClassificationEvaluator_400b99eef7d3b1d7472d

In [None]:
# One validation metric per entry of the parameter grid.
LinearRegressionmodel.validationMetrics

[0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097,
 0.7504862642290097]

In [None]:
# Name of the metric used by the evaluator of this TrainValidationSplit model.
LinearRegressionmodel.getEvaluator().getMetricName()

'rmse'

In [None]:
# Preview the predictions of the selected linear model on the held-out test set.
linear_test_scored = LinearRegressionmodel.transform(test)
linear_test_scored.select("prediction", "label", "model").show()

+------------------+-----+--------------------+
|        prediction|label|               model|
+------------------+-----+--------------------+
| 4.014135256778684|    1|[0.02868496689875...|
| 4.556017619883908|    1|[-0.0603504687404...|
| 4.329471912792087|    1|[-0.0301186458002...|
| 4.356063556399902|    1|[-0.0062476471904...|
|4.1000745965343395|    1|[-0.0042385605629...|
| 3.959159932155589|    1|[-0.0198777261456...|
| 4.081534317048458|    1|[-0.0177130518853...|
|3.8599964180971535|    1|[-0.0156535421582...|
| 4.516402158410314|    1|[-0.0369633138179...|
| 4.267586465280406|    1|[-0.0821046175169...|
|  3.44993656676805|    1|[-0.0099211590070...|
|3.5390500406853374|    1|[-0.0327959347187...|
| 4.307060100593686|    1|[-0.0207394393250...|
| 4.145324964607281|    1|[-0.0280712807347...|
| 4.145588719889501|    1|[0.00129717584083...|
| 4.160680292527071|    1|[-0.0550121396387...|
| 4.612759815941149|    1|[-0.0157080934870...|
| 4.098559703318198|    1|[-0.0555058641

In [None]:
# RMSE of the selected linear regression model on the test set.
predictions_linearRegression = LinearRegressionmodel.transform(test)
my_eval_nb = RegressionEvaluator(labelCol='label',metricName="rmse")
linear_rmse = my_eval_nb.evaluate(predictions_linearRegression)
print("RMSE is : "+ str(linear_rmse))

RMSE is : 0.788112569824


### Logistic regression model

In [None]:
# Full pipeline: tokenise -> Word2Vec features -> logistic regression on the label.
tokenize_stage = Tokenizer(inputCol="comments", outputCol="words")
embed_stage = Word2Vec(inputCol="words", outputCol="model")
classify_stage = LogisticRegression(featuresCol="model", labelCol="label")

pipeline = Pipeline(stages=[tokenize_stage, embed_stage, classify_stage])

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# Build a grid for THIS pipeline instead of reusing the one made for linear
# regression: a grid only takes effect when its keys are Param objects owned by
# the estimator being tuned (here the last stage, the LogisticRegression).
logistic_stage = pipeline.getStages()[-1]
logistic_param_grid = (ParamGridBuilder()
    .addGrid(logistic_stage.regParam, [0.1, 0.01])
    .addGrid(logistic_stage.fitIntercept, [False, True])
    .addGrid(logistic_stage.elasticNetParam, [0.0, 0.5, 1.0])
    .build())

# Model selection keeps the original criterion: RMSE between the predicted
# class and the label on the internal validation split.
LogisticRegressionmodel = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=logistic_param_grid,
    evaluator=RegressionEvaluator(labelCol="label")
).fit(train)

### Evaluating logistic regression model

In [None]:
# Builds an accuracy evaluator but neither assigns nor applies it; only its
# repr is displayed as the cell output.
MulticlassClassificationEvaluator(metricName="accuracy")

MulticlassClassificationEvaluator_4d049eb4a474e2d11760

In [None]:
# One validation metric per parameter map that was passed to TrainValidationSplit.
LogisticRegressionmodel.validationMetrics

[0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286,
 0.8938933639210286]

In [None]:
# Name of the metric used by the evaluator of this TrainValidationSplit model.
LogisticRegressionmodel.getEvaluator().getMetricName()

'rmse'

In [None]:
# Preview the predictions of the selected logistic model on the held-out test set.
logistic_test_scored = LogisticRegressionmodel.transform(test)
logistic_test_scored.select("prediction", "label", "model").show()

+----------+-----+--------------------+
|prediction|label|               model|
+----------+-----+--------------------+
|       5.0|    1|[0.02868496689875...|
|       5.0|    1|[-0.0603504687404...|
|       5.0|    1|[-0.0301186458002...|
|       5.0|    1|[-0.0062476471904...|
|       5.0|    1|[-0.0042385605629...|
|       5.0|    1|[-0.0198777261456...|
|       5.0|    1|[-0.0177130518853...|
|       1.0|    1|[-0.0156535421582...|
|       5.0|    1|[-0.0369633138179...|
|       5.0|    1|[-0.0821046175169...|
|       4.0|    1|[-0.0099211590070...|
|       4.0|    1|[-0.0327959347187...|
|       5.0|    1|[-0.0207394393250...|
|       5.0|    1|[-0.0280712807347...|
|       5.0|    1|[0.00129717584083...|
|       5.0|    1|[-0.0550121396387...|
|       5.0|    1|[-0.0157080934870...|
|       5.0|    1|[-0.0555058641030...|
|       5.0|    1|[-0.0116023177703...|
|       1.0|    2|[-0.0088467702050...|
+----------+-----+--------------------+
only showing top 20 rows



In [None]:
# RMSE between the predicted class and the true rating on the test set.
predictions_LogisticRegression = LogisticRegressionmodel.transform(test)
my_eval_nb = RegressionEvaluator(labelCol='label',metricName="rmse")
logistic_rmse = my_eval_nb.evaluate(predictions_LogisticRegression)
print("RMSE is :  " + str(logistic_rmse))

RMSE is :  0.948410177763


### Random forest classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Full pipeline: tokenise -> Word2Vec features -> random forest with 50 trees
# (numTrees may be overridden by the parameter grid).
tokenize_stage = Tokenizer(inputCol="comments", outputCol="words")
embed_stage = Word2Vec(inputCol="words", outputCol="model")
forest_stage = RandomForestClassifier(featuresCol="model", labelCol="label", numTrees=50)

pipeline = Pipeline(stages=[tokenize_stage, embed_stage, forest_stage])

#### building our grid for random forest classifier

In [None]:
# A larger grid was tried first but takes too much time to run:
#   maxDepth [2, 5, 10, 20, 30], maxBins [10, 20, 40, 80, 100],
#   numTrees [5, 20, 50, 100, 500]
# so a smaller one is used.
#
# addGrid() needs Param objects owned by the estimator being tuned; with plain
# name strings the values are never applied (all grid points train the same
# model) and newer PySpark versions raise a TypeError. The Params are taken
# from the last pipeline stage, the RandomForestClassifier.
rf_stage = pipeline.getStages()[-1]

param_grid = (ParamGridBuilder()
             .addGrid(rf_stage.maxDepth, [2, 5])
             .addGrid(rf_stage.maxBins, [10, 20])
             .addGrid(rf_stage.numTrees, [5, 20])
             .build())

In [None]:
# Fit the pipeline once per parameter map and keep the model that scores best
# under the default metric of MulticlassClassificationEvaluator.
forest_search = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator(labelCol="label")
)
RandomForestClassifiermodel = forest_search.fit(train)

### Evaluating random forest classifier model

In [None]:
# One validation metric per entry of the parameter grid.
RandomForestClassifiermodel.validationMetrics

[0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725,
 0.6490119547548725]

In [None]:
# Name of the metric used by the evaluator of this TrainValidationSplit model.
RandomForestClassifiermodel.getEvaluator().getMetricName()

'f1'

In [None]:
# Preview the predictions of the selected random forest on the held-out test set.
forest_test_scored = RandomForestClassifiermodel.transform(test)
forest_test_scored.select("prediction", "label", "model").show()

+----------+-----+--------------------+
|prediction|label|               model|
+----------+-----+--------------------+
|       5.0|    1|[0.02868496689875...|
|       5.0|    1|[-0.0603504687404...|
|       5.0|    1|[-0.0301186458002...|
|       5.0|    1|[-0.0062476471904...|
|       5.0|    1|[-0.0042385605629...|
|       5.0|    1|[-0.0198777261456...|
|       5.0|    1|[-0.0177130518853...|
|       5.0|    1|[-0.0156535421582...|
|       5.0|    1|[-0.0369633138179...|
|       5.0|    1|[-0.0821046175169...|
|       5.0|    1|[-0.0099211590070...|
|       4.0|    1|[-0.0327959347187...|
|       5.0|    1|[-0.0207394393250...|
|       4.0|    1|[-0.0280712807347...|
|       5.0|    1|[0.00129717584083...|
|       5.0|    1|[-0.0550121396387...|
|       5.0|    1|[-0.0157080934870...|
|       5.0|    1|[-0.0555058641030...|
|       5.0|    1|[-0.0116023177703...|
|       5.0|    2|[-0.0088467702050...|
+----------+-----+--------------------+
only showing top 20 rows



In [None]:
# RMSE between the predicted class and the true rating on the test set.
predictions_RandomForestClassifier = RandomForestClassifiermodel.transform(test)
my_eval_nb = RegressionEvaluator(labelCol='label',metricName="rmse")
forest_rmse = my_eval_nb.evaluate(predictions_RandomForestClassifier)
print("RMSE is : " + str(forest_rmse))

RMSE is : 0.93962841294


### Decision tree classifier

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Full pipeline: tokenise -> Word2Vec features -> decision tree classifier.
tokenize_stage = Tokenizer(inputCol="comments", outputCol="words")
embed_stage = Word2Vec(inputCol="words", outputCol="model")
tree_stage = DecisionTreeClassifier(featuresCol="model", labelCol="label")

pipeline = Pipeline(stages=[tokenize_stage, embed_stage, tree_stage])

In [None]:
# A larger grid was tried first but takes too much time to run:
#   maxDepth [2, 5, 10, 20, 30], maxBins [10, 20, 40, 80, 100]
# so a smaller one is used.
#
# addGrid() needs Param objects owned by the estimator being tuned; with plain
# name strings the values are never applied (all grid points train the same
# model) and newer PySpark versions raise a TypeError. The Params are taken
# from the last pipeline stage, the DecisionTreeClassifier.
dt_stage = pipeline.getStages()[-1]

param_grid = (ParamGridBuilder()
             .addGrid(dt_stage.maxDepth, [2, 5])
             .addGrid(dt_stage.maxBins, [10, 20])
             .build())

In [None]:
# Fit the pipeline once per parameter map and keep the model that scores best
# under the default metric of MulticlassClassificationEvaluator.
tree_search = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=MulticlassClassificationEvaluator(labelCol="label")
)
DecisionTreeClassifiermodel = tree_search.fit(train)

### Evaluating decision tree classifier

In [None]:
# One validation metric per entry of the parameter grid.
DecisionTreeClassifiermodel.validationMetrics

[0.6703902149711483,
 0.6703902149711483,
 0.6703902149711483,
 0.6703902149711483]

In [None]:
# Name of the metric used by the evaluator of this TrainValidationSplit model.
DecisionTreeClassifiermodel.getEvaluator().getMetricName()

'f1'

In [None]:
# Preview the predictions of the selected decision tree on the held-out test set.
tree_test_scored = DecisionTreeClassifiermodel.transform(test)
tree_test_scored.select("prediction", "label", "model").show()

+----------+-----+--------------------+
|prediction|label|               model|
+----------+-----+--------------------+
|       5.0|    1|[0.02868496689875...|
|       5.0|    1|[-0.0603504687404...|
|       5.0|    1|[-0.0301186458002...|
|       5.0|    1|[-0.0062476471904...|
|       4.0|    1|[-0.0042385605629...|
|       5.0|    1|[-0.0198777261456...|
|       4.0|    1|[-0.0177130518853...|
|       5.0|    1|[-0.0156535421582...|
|       5.0|    1|[-0.0369633138179...|
|       5.0|    1|[-0.0821046175169...|
|       4.0|    1|[-0.0099211590070...|
|       4.0|    1|[-0.0327959347187...|
|       5.0|    1|[-0.0207394393250...|
|       4.0|    1|[-0.0280712807347...|
|       5.0|    1|[0.00129717584083...|
|       5.0|    1|[-0.0550121396387...|
|       5.0|    1|[-0.0157080934870...|
|       4.0|    1|[-0.0555058641030...|
|       4.0|    1|[-0.0116023177703...|
|       4.0|    2|[-0.0088467702050...|
+----------+-----+--------------------+
only showing top 20 rows



In [None]:
# RMSE between the predicted class and the true rating on the test set,
# computed with the RMSE evaluator (my_eval_nb) created in an earlier cell.
predictions_DecisionTreeClassifier = DecisionTreeClassifiermodel.transform(test)
tree_rmse = my_eval_nb.evaluate(predictions_DecisionTreeClassifier)
print("RMSE is: "+ str (tree_rmse))

RMSE is: 0.896163382587


In [None]:
# Test-set accuracy of the three classifiers (accuracy is not computed for the
# linear regression, whose predictions are continuous).
my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')

forest_accuracy = my_mc_nb.evaluate(predictions_RandomForestClassifier)
print("Accuracy for RandomForestClassifier is : " + str(forest_accuracy))

logistic_accuracy = my_mc_nb.evaluate(predictions_LogisticRegression)
print("Accuracy for LogisticRegression is : " + str(logistic_accuracy))

tree_accuracy = my_mc_nb.evaluate(predictions_DecisionTreeClassifier)
print("Accuracy for DecisionTreeClassifier is : " + str(tree_accuracy))

Accuracy for RandomForestClassifier is : 0.723316062176
Accuracy for LogisticRegression is : 0.721243523316
Accuracy for DecisionTreeClassifier is : 0.708808290155


## Best accuracy score is for random forest classifier

In [None]:
# Test-set RMSE of all four models, side by side.
forest_rmse = my_eval_nb.evaluate(predictions_RandomForestClassifier)
print("RMSE for RandomForestClassifier is : " + str(forest_rmse))

linear_rmse = my_eval_nb.evaluate(predictions_linearRegression)
print("RMSE for linearRegression is : "+ str(linear_rmse))

logistic_rmse = my_eval_nb.evaluate(predictions_LogisticRegression)
print("RMSE for LogisticRegression is :  " + str(logistic_rmse))

tree_rmse = my_eval_nb.evaluate(predictions_DecisionTreeClassifier)
print("RMSE for DecisionTreeClassifier is: "+ str (tree_rmse))

RMSE for RandomForestClassifier is : 0.93962841294
RMSE for linearRegression is : 0.788112569824
RMSE for LogisticRegression is :  0.948410177763
RMSE for DecisionTreeClassifier is: 0.896163382587


## Best RMSE score is for linearRegression

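### We can conclude that linear regression was the best model in terms of RMSE; accuracy was only measured for the three classifiers, among which the random forest scored highest.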