# Stock Market Predictions using Sentiment Scores
- **Devendra Govil**
- **Akshay Pamnani**

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions
from pyspark.ml.feature import Imputer, OneHotEncoder, StandardScaler, VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

## Process Reddit Data

In [0]:
# Obtain the SparkSession used by the rest of the notebook (reuses an existing one if present).
ss = SparkSession.builder.getOrCreate()

In [0]:
# REF_STRING = "mongodb+srv://admin:<password>@msds697-cluster.qzgwq.mongodb.net/"
# Credit: Taken from Teammates Youshi and Guru
def read_df_from_mongo(spark, collection_name):
    """Load one collection of the project database through the Mongo Spark source.

    The MongoDB user name and password are read from the ``MONGO_USERNAME``
    and ``MONGO_PASSWORD`` environment variables when they are set, so real
    credentials do not have to be committed with the notebook. The previous
    hard-coded values remain as fallbacks so existing runs keep working.

    Args:
        spark: Object exposing the ``read`` reader API (the notebook passes
            its SparkSession).
        collection_name: Name of the collection inside ``msds697_project``.

    Returns:
        The result of ``spark.read.format("mongo").option("uri", ...).load()``.
    """
    import os
    from urllib.parse import quote_plus

    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # inside them would otherwise corrupt the connection URI.
    mongo_username = quote_plus(os.environ.get('MONGO_USERNAME', 'admin'))
    mongo_password = quote_plus(os.environ.get('MONGO_PASSWORD', 'msds697'))
    mongo_ip_address = 'msds697-cluster.qzgwq.mongodb.net/'
    MONGO_DB_NAME = "msds697_project"
    # The URI addresses "<database>.<collection>" directly.
    connection_string = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}{MONGO_DB_NAME}.{collection_name}"
    spark_df = spark.read.format("mongo").option("uri", connection_string).load()
    return spark_df
    
def store_df_to_mongo(spark, spark_df, collection_name):
    """Append a DataFrame to one collection of the project database.

    The MongoDB user name and password are read from the ``MONGO_USERNAME``
    and ``MONGO_PASSWORD`` environment variables when they are set, so real
    credentials do not have to be committed with the notebook. The previous
    hard-coded values remain as fallbacks so existing runs keep working.

    Args:
        spark: Not used by this function; kept so existing callers that pass
            the session keep working.
        spark_df: Object exposing the ``write`` writer API (a Spark DataFrame).
        collection_name: Name of the target collection inside
            ``msds697_project``.

    Returns:
        None. Rows are written in "append" mode.
    """
    import os
    from urllib.parse import quote_plus

    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # inside them would otherwise corrupt the connection URI.
    mongo_username = quote_plus(os.environ.get('MONGO_USERNAME', 'admin'))
    mongo_password = quote_plus(os.environ.get('MONGO_PASSWORD', 'msds697'))
    mongo_ip_address = 'msds697-cluster.qzgwq.mongodb.net/'
    MONGO_DB_NAME = "msds697_project"
    # The URI addresses "<database>.<collection>" directly.
    connection_string = f"mongodb+srv://{mongo_username}:{mongo_password}@{mongo_ip_address}{MONGO_DB_NAME}.{collection_name}"
    (
        spark_df.write.format("com.mongodb.spark.sql.DefaultSource")
        .mode("append")
        .option("uri", connection_string)
        .save()
    )

In [0]:
# Load the 'snp_500_data' collection from MongoDB into a Spark DataFrame.
df_snp_500 = read_df_from_mongo(ss, 'snp_500_data')

In [0]:
# Print a preview of the rows loaded from 'snp_500_data'.
df_snp_500.show()

In [0]:
# Count the distinct rows of df_snp_500.
df_snp_500.distinct().count()

In [0]:
# Load the 'sentiment_target_data' collection from MongoDB into a Spark DataFrame.
df_sentiment_target = read_df_from_mongo(ss, 'sentiment_target_data')

In [0]:
# Load the 'reddit_post_sentiment_to_tickers' collection from MongoDB into a Spark DataFrame.
df_reddit_sentiments = read_df_from_mongo(ss, 'reddit_post_sentiment_to_tickers')

In [0]:
# Print the first 5 rows of the Reddit sentiment DataFrame.
df_reddit_sentiments.show(5)

In [0]:
# Print the first 5 rows of the sentiment/target DataFrame.
df_sentiment_target.show(5)

In [0]:
# Register both DataFrames as temp views so they can be referenced by name in SQL.
for view_name, frame in (
    ("df_snp", df_snp_500),
    ("df_senti_target", df_sentiment_target),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# Inner-join the df_snp view with the df_senti_target view on the ticker symbol.
# The query runs on `ss`, the session this notebook creates; the name `spark`
# is never defined here and only exists on platforms that inject it.
model_table = ss.sql("""
    SELECT *
    FROM df_snp
    INNER JOIN df_senti_target
    ON df_snp.Symbol = df_senti_target.ticker
""")


In [0]:
# Keep a single row per Symbol.
# NOTE(review): when several rows share a Symbol, nothing here controls which
# one is kept — confirm that any of them is acceptable.
model_table2 = model_table.dropDuplicates(["Symbol"])

In [0]:
# Register the de-duplicated table as the temp view "model_table2".
model_table2.createOrReplaceTempView("model_table2")

In [0]:
# Column names of the de-duplicated table. Read straight from the DataFrame:
# this yields the same list as selecting zero rows from its temp view, without
# a SQL round trip and without the `spark` name this notebook never defines.
column_names = model_table2.columns


In [0]:
# Display the list of column names.
column_names

In [0]:
# Print the first 5 rows of the Reddit sentiment DataFrame again before joining it.
df_reddit_sentiments.show(5)

In [0]:
# Left outer join: every row of model_table2 is kept, and Reddit sentiment
# columns are attached where the ticker matches a Symbol in df_reddit_sentiments.
ticker_matches = model_table2["ticker"] == df_reddit_sentiments["Symbol"]
all_data = model_table2.join(df_reddit_sentiments, on=ticker_matches, how="left_outer")

In [0]:
# Number of rows after the left outer join.
all_data.count()

In [0]:
# Display the column names of the joined DataFrame.
all_data.columns

In [0]:
# Target column "label": percentage change from monday_open to final_close,
# i.e. 100 * (final_close - monday_open) / monday_open.
open_price = all_data["monday_open"]
price_change = all_data["final_close"] - open_price
all_data2 = all_data.withColumn("label", (100 * price_change) / open_price)

In [0]:
# Print the first 5 rows, now including the "label" column.
all_data2.show(5)

In [0]:
# Continuous company-level variables kept for modelling.
continuous_cols = [
    '52 Week High',
    '52 Week Low',
    'Dividend Yield',
    'EBITDA',
    'Earnings/Share',
    'Market Cap',
    'Price',
    'Price/Book',
    'Price/Earnings',
    'Price/Sales',
]

# Sentiment score columns (the three *_sentiment_score columns followed by the
# three *_sentiment_reddit columns).
sentiment_score_cols = [
    'negativity_sentiment_score',
    'neutral_sentiment_score',
    'positivity_sentiment_score',
    'negative_sentiment_reddit',
    'neutral_sentiment_reddit',
    'positive_sentiment_reddit',
]

# Narrow the joined data to: company ticker, features, target.
# NOTE(review): Symbol is referenced through df_snp_500, presumably to avoid
# ambiguity with another "Symbol" column after the joins — confirm.
all_data3 = all_data2.select(
    df_snp_500["Symbol"],
    *continuous_cols,
    *sentiment_score_cols,
    "label",
)

**Note: Up to this point we have not cached our data. We cache it now to avoid re-running the DAG from the very start on every action, which should significantly speed up the steps that follow.**

In [0]:
# Mark the selected columns for caching so later actions can reuse them.
all_data3.cache()

In [0]:
# Display two rows; this is the first action run after cache() was requested.
all_data3.show(2)

In [0]:
# Display one row; run a second time to compare timing against the previous cell.
all_data3.show(1)

**As can be seen from the timestamps above, the first show command takes 3.03 seconds, while the second (after the data has been cached) takes only 0.10 seconds.**

In [0]:
# Drop rows whose target ("label") is null, then cache the result.
rows_with_label = all_data3.dropna(how='any', subset=['label'])
all_data4 = rows_with_label.cache()

In [0]:
# Print the first 4 rows of the labelled data.
all_data4.show(4)

## Feature Engineering approaches required for our dataset
- Fortunately we don't have any categorical variables, so we don't need to deal with them. 
- We need to perform Standard Scaling on all our numeric features 
- We need to impute missing values (in our data only for numeric features)
- We would also finally require vector assembler to get our features column

**Note: all feature engineering approaches are fit using only the training data, and the statistics derived from the training data are then used to transform the test data.**

In [0]:
# Split into train and test parts with weights 0.8 and 0.2. The fixed seed makes
# the split repeatable across notebook runs instead of changing on every run.
split_data = all_data4.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Cache both parts of the split: the first is the training set, the second the test set.
train_data_og, test_data_og = (part.cache() for part in split_data)

**Note: We cache here not only for better performance but also for consistent results: without caching, re-computing the DAG could redo the random split and produce different partitions. Iterative algorithms can also trigger many re-computations of the DAG, so caching is very important at this step.**

First fitting all feature engineering/imputation techniques on train data

In [0]:
# Display the columns available for feature engineering.
all_data4.columns

# Fitting Model on all columns (including both sentiment scores)

In [0]:
# The six sentiment feature columns.
sentiment_features = [
    'negativity_sentiment_score',
    'neutral_sentiment_score',
    'positivity_sentiment_score',
    'negative_sentiment_reddit',
    'neutral_sentiment_reddit',
    'positive_sentiment_reddit',
]

# The ten continuous company-level feature columns.
fundamental_features = [
    '52 Week High',
    '52 Week Low',
    'Dividend Yield',
    'EBITDA',
    'Earnings/Share',
    'Market Cap',
    'Price',
    'Price/Book',
    'Price/Earnings',
    'Price/Sales',
]

# Columns passed to the Imputer: all sentiment features plus 'Price/Book'.
cols_to_impute = sentiment_features + ['Price/Book']

# Columns assembled and scaled: every feature, company-level columns first.
cols_to_scale = fundamental_features + sentiment_features

In [0]:
# Working references that get transformed below; the *_og names keep the untouched split.
train_data, test_data = train_data_og, test_data_og

In [0]:
# Fit the imputer on the training data only; it overwrites the imputed columns
# in place because input and output column names are the same.
imputer_estimator = Imputer(inputCols=cols_to_impute, outputCols=cols_to_impute)
imputer = imputer_estimator.fit(train_data)


In [0]:
# Apply the fitted imputer to the training data.
train_data  = imputer.transform(train_data)

In [0]:
# Assembler that packs all feature columns into the single vector column 'assembled_col'.
va = VectorAssembler(inputCols=cols_to_scale, outputCol='assembled_col')

In [0]:
# Add 'assembled_col' to the training data and cache the result.
train_data = va.transform(train_data).cache()

In [0]:
# Fit the scaler on the assembled training vectors only; it writes the scaled
# vectors to the 'features' column.
scaler_estimator = StandardScaler(inputCol='assembled_col', outputCol='features')
std_scaler = scaler_estimator.fit(train_data)

In [0]:
# Add the scaled 'features' column to the training data and cache the result.
train_data = std_scaler.transform(train_data).cache()

In [0]:
# Print the first 4 rows of the fully transformed training data.
train_data.show(4)

In [0]:
# Run the test data through the same three stages, all fitted on the training
# data: impute, assemble, scale. Only the final result is cached.
test_imputed = imputer.transform(test_data)
test_assembled = va.transform(test_imputed)
test_data = std_scaler.transform(test_assembled).cache()

In [0]:
# Print the first 4 rows of the fully transformed test data.
test_data.show(4)

In [0]:
# Linear regression estimator whose predictions are written to "pred_label".
lr_model = LinearRegression(predictionCol="pred_label")
# Keep the estimator as the cell's last expression so the notebook still displays it.
lr_model

In [0]:
# Fit the linear regression on the transformed training data.
lr_model_fit = lr_model.fit(train_data)

In [0]:
# RMSE reported by the fitted model's summary (the "train" figure quoted in the summary section).
lr_model_fit.summary.rootMeanSquaredError

In [0]:
# Score the test data with the fitted linear regression.
final_pred = lr_model_fit.transform(test_data)

In [0]:
# Evaluate the fitted linear regression on the test data.
eval_test = lr_model_fit.evaluate(test_data)

In [0]:
# RMSE of the linear regression on the test data.
eval_test.rootMeanSquaredError

In [0]:
# Random forest regressor with 400 trees, fitted on the scaled training features.
rf_estimator = RandomForestRegressor(featuresCol="features", numTrees=400)
rf = rf_estimator.fit(train_data)

In [0]:
# Score the training data with the fitted random forest.
train_rf = rf.transform(train_data)

In [0]:
# Print the first 4 rows of the random-forest predictions on the training data.
train_rf.show(4)

In [0]:
eval = RegressionEvaluator()

In [0]:
eval.evaluate(train_rf)

In [0]:
test_rf = rf.transform(test_data)

In [0]:
eval.evaluate(test_rf)

## Summary of observations:
1. While fitting the Linear Regression Model we get a RMSE of:
  - 2.82 on train
  - 2.717 on test
2. While fitting a Random Forest Regressor we get a RMSE of:
  - 1.87 on train
  - 2.38 on test
3. The Linear Regression model does not appear to overfit: its test RMSE (2.717) is actually lower than its train RMSE (2.82). The Random Forest regressor overfits slightly, since its test RMSE (2.38) is higher than its train RMSE (1.87).

# Future:
1. Our next steps include building a knowledge graph and using that to further improve our predictions
2. Iterate over and improve the sentiment scores further