In [0]:
#import all libraries here
pip install nltk

Part 1: Join the reddit Glossier Comments dataset sentiment+ the competitor comments Data set senemtiment with the Google trends data set

In [0]:
#import the glossier comment dataset 
glos_comm = spark.read.parquet("/FileStore/glossier/glossier_comments")
#import the competitor comment data set 
competitor_comments = spark.read.parquet("dbfs:/FileStore/glossier/competitor_comments")
#import the google trends data set 
google = spark.read.csv('/FileStore/google_trends_competitors.csv', header='true')
google.show(5)

+----------+-------+----+-----+--------+
|      Date|Sephora|Ulta|Fenty|Glossier|
+----------+-------+----+-----+--------+
|2021-01-01|     88| 100|   70|      35|
|2021-01-02|     97|  96|  100|      41|
|2021-01-03|    100|  96|   84|      44|
|2021-01-04|     85|  74|   72|      32|
|2021-01-05|     84|  70|   58|      35|
+----------+-------+----+-----+--------+
only showing top 5 rows



In [0]:
#Data cleaning 
#import libraries
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import udf, col, lower, regexp_replace, translate
import re 
#preliminary data cleaning in order to join appropriate colunms 
#Glossier 
#we know all we are interested in is tge body and the date 
glos_comm2 = glos_comm.select("body","created_utc")
#change type 
glos_comm3 = glos_comm2.withColumn("created_utc",glos_comm2.created_utc.cast('timestamp'))
#glos_comm3.show(5)
## converting all words to lowercase
glos_comm4 = glos_comm3 .withColumn("body",lower(translate('body', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', ' ')))
glos_comm4.show(5)
#Now we can move on to doing actual sentiment analysis 

+--------------------+-------------------+
|                body|        created_utc|
+--------------------+-------------------+
|i just got an ema...|2021-07-18 16:16:10|
|milk kush is also...|2021-07-18 16:21:10|
|i have dry skin a...|2021-07-18 16:23:52|
|hi all  if you’d ...|2022-04-04 22:56:35|
|glossier balm dot...|2022-04-04 22:58:38|
+--------------------+-------------------+
only showing top 5 rows



In [0]:
#Now create the nlp pipeline
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

sentimentdl = SentimentDLModel.pretrained(name='sentimentdl_use_twitter', lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ | ][OK!]


In [0]:
# running the pipeline
from pyspark.sql.functions import col

empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

data = glos_comm4.select(col("body").alias("text"))
result= pipelineModel.transform(data)


In [0]:
import pyspark.sql.functions as F
result = result.select('text', F.explode('sentiment.result').alias("sentiment"))
result.show(5)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|i just got an ema...| negative|
|milk kush is also...| positive|
|i have dry skin a...| negative|
|hi all  if you’d ...| positive|
|glossier balm dot...| negative|
+--------------------+---------+
only showing top 5 rows



In [0]:
#Now we need to join the gloss_com4 data set with our result 
#we are going to join on body 
#so reamke column in result 
result.createOrReplaceTempView("result_vw")
glos_comm4.createOrReplaceTempView("glossier_vw")
glossier_final= spark.sql("select glossier_vw.*, result_vw.sentiment \
                    from glossier_vw join result_vw on glossier_vw.body = result_vw.text")
glossier_final.show()


+--------------------+-------------------+---------+
|                body|        created_utc|sentiment|
+--------------------+-------------------+---------+
|\n garnier micell...|2021-06-22 00:09:09|  neutral|
|\ndaniel sandler ...|2021-11-17 03:49:16| positive|
|\nso i love color...|2021-03-26 22:21:13| positive|
|\nthis post has b...|2022-08-30 21:54:39| negative|
|\r\n\nfinished of...|2022-06-07 21:04:19| positive|
|  aw i was hoping...|2022-07-29 14:24:30| positive|
| byredo bal dafri...|2021-04-09 03:32:46| positive|
| first off both l...|2021-06-10 05:26:51|  neutral|
| glossier you\n a...|2022-01-17 15:12:34| positive|
| i have the exact...|2022-02-13 23:42:41| positive|
| i think people g...|2021-11-19 16:58:27|  neutral|
| iso  shipping to...|2021-04-01 00:34:38| positive|
| jane iredale spi...|2021-05-18 04:59:01| positive|
| molecule 01  iri...|2022-07-28 14:48:01| positive|
| murad invisiblur...|2022-06-30 18:47:41| negative|
| pink hoodie  lov...|2021-11-28 23:09:49| pos

In [0]:
#Now I am going to make the sentiment into a dummy variables 
#2= positive 
#1= neutral 
#0=negative
glossier_final2=glossier_final.na.replace("negative","0")
glossier_final3=glossier_final2.na.replace("neutral","1")
glossier_final4=glossier_final3.na.replace("positive","2")
#change sentiment type to integer
glossier_final5=glossier_final4.withColumn("sentiment",col("sentiment").cast("int"))
glossier_final5.show()

+--------------------+-------------------+---------+
|                body|        created_utc|sentiment|
+--------------------+-------------------+---------+
|\n garnier micell...|2021-06-22 00:09:09|        1|
|\ndaniel sandler ...|2021-11-17 03:49:16|        2|
|\nso i love color...|2021-03-26 22:21:13|        2|
|\nthis post has b...|2022-08-30 21:54:39|        0|
|\r\n\nfinished of...|2022-06-07 21:04:19|        2|
|  aw i was hoping...|2022-07-29 14:24:30|        2|
| byredo bal dafri...|2021-04-09 03:32:46|        2|
| first off both l...|2021-06-10 05:26:51|        1|
| glossier you\n a...|2022-01-17 15:12:34|        2|
| i have the exact...|2022-02-13 23:42:41|        2|
| i think people g...|2021-11-19 16:58:27|        1|
| iso  shipping to...|2021-04-01 00:34:38|        2|
| jane iredale spi...|2021-05-18 04:59:01|        2|
| molecule 01  iri...|2022-07-28 14:48:01|        2|
| murad invisiblur...|2022-06-30 18:47:41|        0|
| pink hoodie  lov...|2021-11-28 23:09:49|    

In [0]:
#Now the last step before the join would be to aggraget based on the day 
#for the sentiment column I am going to take the avg for the day
from pyspark.sql import functions as F
glossier_final6=glossier_final5.select(F.date_format('created_utc','yyyy-MM-dd').alias('day'),'sentiment').groupby('day').mean('sentiment')
glossier_final6.show()
#now 

+----------+--------------------+
|       day|      avg(sentiment)|
+----------+--------------------+
|2021-11-03|  1.7736081152490102|
|2021-12-23|   1.852250885179565|
|2022-05-17|  1.8446363000180848|
|2021-04-06|  1.5618166013450658|
|2022-03-30|  1.5942850910677282|
|2021-11-15|  1.5524269174022856|
|2021-10-25|  1.7197809726068958|
|2021-02-18|0.030630739487675206|
|2021-08-30|  1.3847968297899107|
|2021-09-23|   1.537916250197275|
|2022-07-04|  1.6347265221878224|
|2022-07-08|  1.5617837093528228|
|2022-01-20|    1.77389811104751|
|2021-01-15| 0.05284008779913272|
|2022-07-30|  1.6973674037420752|
|2021-02-13|  1.6691449814126393|
|2022-07-23|  1.5777883328295694|
|2022-03-21|  1.6067963833237144|
|2021-10-27|  1.8363393309077694|
|2021-08-06|  1.5152900986280236|
+----------+--------------------+
only showing top 20 rows



In [0]:
#Now we turn this into a pnadas dataframe 
glossier_pd_final = glossier_final6.toPandas()
glossier_pd_final.head(20)

Unnamed: 0,day,avg(sentiment)
0,2021-11-03,1.773608
1,2021-12-23,1.852251
2,2022-05-17,1.844636
3,2021-04-06,1.561817
4,2021-10-25,1.719781
5,2021-11-15,1.552427
6,2022-03-30,1.594285
7,2021-08-30,1.384797
8,2021-02-18,0.030631
9,2021-09-23,1.537916


In [0]:
#Now we have to do the same process on the competitors comments ! 
#A lot of this code is taken from Q4_final (1)
#As a result I have already checked for null values ect 
competitor_list = ["Makeup", "MakeupAddiction"]
competitor_comments2 = competitor_comments.filter(competitor_comments.subreddit.isin(competitor_list))
competitor_comments3=competitor_comments2.select("body","created_utc")
#now we need to filter based on Sephora, Ulta, Fenty and Glossier
from pyspark.sql.functions import col
 
competitor_comments4=competitor_comments3.withColumn("Sephora",col("body").rlike("Sephora|sephora"))
competitor_comments5=competitor_comments4.withColumn("Ulta",col("body").rlike("Ulta|ulta"))
competitor_comments6=competitor_comments5.withColumn("Fenty",col("body").rlike("Fenty|fenty"))
competitor_comments7=competitor_comments6.withColumn("Glossier",col("body").rlike("Glossier|glossier"))
competitor_comments7.show(40)

+--------------------+-----------+-------+-----+-----+--------+
|                body|created_utc|Sephora| Ulta|Fenty|Glossier|
+--------------------+-----------+-------+-----+-----+--------+
|This is the nices...| 1650684062|  false|false|false|   false|
|        Thank you 😊| 1650684086|  false|false|false|   false|
|Thank you🥺y’all ...| 1650684158|  false|false|false|   false|
|i think both look...| 1650684185|  false|false|false|   false|
|Hah never heard t...| 1650684185|  false|false|false|   false|
|You look gorgeous...| 1650684310|  false|false|false|   false|
|Have you tried Il...| 1650684313|  false|false|false|   false|
|Robert Welsh has ...| 1650684341|  false|false|false|   false|
|Have you consider...| 1650684416|  false|false|false|   false|
|I love the blonde...| 1650684455|  false|false|false|   false|
|               Thick| 1650684620|  false|false|false|   false|
|***Thank you for ...| 1650684734|  false|false|false|   false|
|You can rock both...| 1650684766|  false|

In [0]:
#Now we need to fiter based on whether is says true 
competitor_comments_Sephora=competitor_comments7.filter("Sephora==True")
#Now lets replace the True with Sephora 
competitor_comments_Sephora2=competitor_comments_Sephora.withColumn("Sephora",col("Sephora").cast("string"))
competitor_comments_Sephora3=competitor_comments_Sephora2.na.replace("true","Sephora")
competitor_comments_Sephora4=competitor_comments_Sephora3.select("body","created_utc","Sephora")
competitor_comments_Sephora4.show(20)
competitor_comments_Ulta=competitor_comments7.filter("Ulta==True")
competitor_comments_Ulta2=competitor_comments_Ulta.withColumn("Ulta",col("Ulta").cast("string"))
competitor_comments_Ulta3=competitor_comments_Ulta2.na.replace("true","Ulta")
competitor_comments_Ulta4=competitor_comments_Ulta3.select("body","created_utc","Ulta")
competitor_comments_Ulta4.show(20)
competitor_comments_Fenty=competitor_comments7.filter("Fenty==True")
competitor_comments_Fenty2=competitor_comments_Fenty.withColumn("Fenty",col("Fenty").cast("string"))
competitor_comments_Fenty3=competitor_comments_Fenty2.na.replace("true","Fenty")
competitor_comments_Fenty4=competitor_comments_Fenty3.select("body","created_utc","Fenty")
competitor_comments_Fenty4.show(20)
competitor_comments_Glossier=competitor_comments7.filter("Glossier==True")
competitor_comments_Glossier2=competitor_comments_Glossier.withColumn("Glossier",col("Glossier").cast("string"))
competitor_comments_Glossier3=competitor_comments_Glossier2.na.replace("true","Glossier")
competitor_comments_Glossier4=competitor_comments_Glossier3.select("body","created_utc","Glossier")
competitor_comments_Glossier4.show(20)
#Now lets 

+--------------------+-----------+-------+
|                body|created_utc|Sephora|
+--------------------+-----------+-------+
|Honestly that’s w...| 1626234041|Sephora|
|My advice is to g...| 1649364278|Sephora|
|I'm not an expert...| 1618713621|Sephora|
|Hey that's incred...| 1660949791|Sephora|
|Tell them you pur...| 1660949878|Sephora|
|I can't help but ...| 1660949894|Sephora|
|I'm really sad th...| 1660949978|Sephora|
|Sephora brand is ...| 1647907798|Sephora|
|Buy Freck! It’s u...| 1648093418|Sephora|
|[Auric Glow Lust ...| 1622834012|Sephora|
|I’ve used this: h...| 1650582232|Sephora|
|I know ☹️ unfortu...| 1649333002|Sephora|
|40s. Everything b...| 1647096398|Sephora|
|I can’t speak to ...| 1648615892|Sephora|
|Face:\nElf porele...| 1652492053|Sephora|
|Some of the other...| 1661046675|Sephora|
|Wording, I meant ...| 1628515971|Sephora|
|I use to work in ...| 1649245020|Sephora|
|I only got the $5...| 1649245275|Sephora|
|Brow: Anastasia D...| 1648780656|Sephora|
+----------

In [0]:
#Okay now I need to stack these dataframes 
#let me rename the colums so they all match 
competitor_comments_Sephora5=competitor_comments_Sephora4.withColumnRenamed("Sephora","Brand")
competitor_comments_Ulta5=competitor_comments_Ulta4.withColumnRenamed("Ulta","Brand")
competitor_comments_Fenty5=competitor_comments_Fenty4.withColumnRenamed("Fenty","Brand")
competitor_comments_Glossier5=competitor_comments_Glossier4.withColumnRenamed("Glossier","Brand")
#Now I need to stack the dataframes 
competitor_temp= competitor_comments_Sephora5.union(competitor_comments_Ulta5)
competitor_temp2= competitor_temp.union(competitor_comments_Fenty5)
competitor_temp3= competitor_temp2.union(competitor_comments_Glossier5)
#one more thing change timestamp and make eveythign in the body lower 
competitor_temp4= competitor_temp3.withColumn("created_utc",competitor_temp3.created_utc.cast('timestamp'))
competitor_temp5= competitor_temp4.withColumn("body",lower(translate('body', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', ' ')))
competitor_temp5.show(5)

+--------------------+-------------------+-------+
|                body|        created_utc|  Brand|
+--------------------+-------------------+-------+
|honestly that’s w...|2021-07-14 03:40:41|Sephora|
|my advice is to g...|2022-04-07 20:44:38|Sephora|
|im not an expert ...|2021-04-18 02:40:21|Sephora|
|hey thats incredi...|2022-08-19 22:56:31|Sephora|
|tell them you pur...|2022-08-19 22:57:58|Sephora|
+--------------------+-------------------+-------+
only showing top 5 rows



In [0]:
#Now we can run the sentiment model 
#Now create the nlp pipeline
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

sentimentdl = SentimentDLModel.pretrained(name='sentimentdl_use_twitter', lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ | ][OK!]


In [0]:
# running the pipeline
from pyspark.sql.functions import col

empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

data = competitor_temp5.select(col("body").alias("text"))
result= pipelineModel.transform(data)

In [0]:
import pyspark.sql.functions as F
result = result.select('text', F.explode('sentiment.result').alias("sentiment"))
result.show(5)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|honestly that’s w...| negative|
|my advice is to g...| positive|
|im not an expert ...| positive|
|hey thats incredi...| positive|
|tell them you pur...| positive|
+--------------------+---------+
only showing top 5 rows



In [0]:
#Now we need to join the gloss_com4 data set with our result 
#we are going to join on body 
#so reamke column in result 
result.createOrReplaceTempView("result_vw")
competitor_temp5.createOrReplaceTempView("competitor_vw")
competitor_final= spark.sql("select competitor_vw.*, result_vw.sentiment \
                    from competitor_vw join result_vw on competitor_vw.body = result_vw.text")
competitor_final.show()

+--------------------+-------------------+--------+---------+
|                body|        created_utc|   Brand|sentiment|
+--------------------+-------------------+--------+---------+
|for long term mis...|2022-05-13 08:20:57| Sephora| positive|
|base colourpop no...|2022-04-05 23:57:04| Sephora| positive|
|i get rewards wit...|2022-02-12 22:39:45| Sephora| positive|
|correct me if im ...|2021-12-20 00:37:18| Sephora| positive|
|correct me if im ...|2021-12-20 00:37:18|    Ulta| positive|
|hey there \n\n\nd...|2022-03-21 11:51:54| Sephora| positive|
|thanks i’ve been ...|2021-04-04 20:38:33| Sephora| positive|
|i stopped doing w...|2022-07-23 07:07:28| Sephora| negative|
|i have and love t...|2022-05-13 02:42:26| Sephora| positive|
|scrape wipe same ...|2021-10-02 15:22:09| Sephora| positive|
|scrape wipe same ...|2021-10-02 15:22:09|    Ulta| positive|
|i work for sephor...|2022-08-17 04:26:58| Sephora| positive|
|sephora had a tin...|2021-08-30 19:51:16| Sephora| negative|
|“100 pu

In [0]:
#Now I am going to make the sentiment into a dummy variables 
#2= positive 
#1= neutral 
#0=negative
competitor_final2=competitor_final.na.replace("negative","0")
competitor_final3=competitor_final2.na.replace("neutral","1")
competitor_final4=competitor_final3.na.replace("positive","2")
#change sentiment type to integer
competitor_final5=competitor_final4.withColumn("sentiment",col("sentiment").cast("int"))
competitor_final5.show()

+--------------------+-------------------+--------+---------+
|                body|        created_utc|   Brand|sentiment|
+--------------------+-------------------+--------+---------+
|for long term mis...|2022-05-13 08:20:57| Sephora|        2|
|base colourpop no...|2022-04-05 23:57:04| Sephora|        2|
|correct me if im ...|2021-12-20 00:37:18|    Ulta|        2|
|correct me if im ...|2021-12-20 00:37:18| Sephora|        2|
|thanks i’ve been ...|2021-04-04 20:38:33| Sephora|        2|
|alamar cosmetics’...|2022-06-06 04:16:20| Sephora|        2|
|try a sample at s...|2021-08-19 08:55:55| Sephora|        2|
|✴ top 5 favorites...|2021-11-06 04:29:49|Glossier|        0|
|✴ top 5 favorites...|2021-11-06 04:29:49| Sephora|        0|
|try looking up so...|2022-06-25 02:14:41| Sephora|        2|
|you are the consu...|2021-11-28 03:06:37|    Ulta|        2|
|you are the consu...|2021-11-28 03:06:37| Sephora|        2|
|used modern renai...|2021-03-13 01:40:39| Sephora|        2|
|product

In [0]:
#Okay now I need to aggregate by day but since I also have brand I need to take that into account
#filter by brand first 
c_Sephora=competitor_final5.filter("Brand=='Sephora'")
c_Ulta=competitor_final5.filter("Brand=='Ulta'")
c_Fenty=competitor_final5.filter("Brand=='Fenty'")
c_Glossier=competitor_final5.filter("Brand=='Glossier'")
#Now get score
c_Sephora2=c_Sephora.select(F.date_format('created_utc','yyyy-MM dd').alias('day'),'sentiment').groupby('day').mean('sentiment')
c_Sephora2.show(5)
c_Ulta2=c_Ulta.select(F.date_format('created_utc','yyyy-MM dd').alias('day'),'sentiment').groupby('day').mean('sentiment')
c_Ulta2.show(5)
c_Fenty2=c_Fenty.select(F.date_format('created_utc','yyyy-MM dd').alias('day'),'sentiment').groupby('day').mean('sentiment')
c_Fenty2.show(5)
c_Glossier2=c_Glossier.select(F.date_format('created_utc','yyyy-MM dd').alias('day'),'sentiment').groupby('day').mean('sentiment')
c_Glossier2.show(5)

+----------+------------------+
|       day|    avg(sentiment)|
+----------+------------------+
|2021-05 22|1.2857142857142858|
|2021-02 03|               1.4|
|2022-04 07|               1.6|
|2021-05 07|               1.5|
|2021-05 26|               1.4|
+----------+------------------+
only showing top 5 rows

+----------+------------------+
|       day|    avg(sentiment)|
+----------+------------------+
|2021-01 22|0.8888888888888888|
|2021-11 22|1.4666666666666666|
|2021-04 29|1.7777777777777777|
|2021-02 28|0.8888888888888888|
|2022-04 30|             1.875|
+----------+------------------+
only showing top 5 rows

+----------+------------------+
|       day|    avg(sentiment)|
+----------+------------------+
|2022-07 19|1.7777777777777777|
|2021-04 16|1.8666666666666667|
|2021-03 14|               2.0|
|2021-12 13|              1.84|
|2021-01 02|              1.45|
+----------+------------------+
only showing top 5 rows

+----------+------------------+
|       day|    avg(sentiment

In [0]:
#make them all into pandas dataframes and add a column with the brand name 
#source: https://stackoverflow.com/questions/24039023/add-column-with-constant-value-to-pandas-dataframe
Sephora_c_pd = c_Sephora2.toPandas()
Sephora_c_pd['Brand']='Sephora'
Sephora_c_pd.head(5)
Ulta_c_pd = c_Ulta2.toPandas()
Ulta_c_pd['Brand']='Ulta'
Ulta_c_pd.head(5)
Fenty_c_pd = c_Fenty2.toPandas()
Fenty_c_pd['Brand']='Fenty'
Fenty_c_pd.head(5)
Glossier_c_pd = c_Glossier2.toPandas()
Glossier_c_pd['Brand']='Glossier'
Glossier_c_pd.head(5)

Unnamed: 0,day,avg(sentiment),Brand
0,2022-05 02,0.75,Glossier
1,2022-04 02,1.692308,Glossier
2,2022-06 20,2.0,Glossier
3,2021-10 06,1.727273,Glossier
4,2022-07 02,2.0,Glossier


In [0]:
#now stack these dataframes 
import pandas as pd
comp_final_pd=pd.concat([Sephora_c_pd, Ulta_c_pd], ignore_index=True, axis=0)
#comp_final_pd.count()
comp_final_pd2=pd.concat([comp_final_pd, Fenty_c_pd], ignore_index=True, axis=0)
#comp_final_pd2.count()
comp_final_pd3=pd.concat([comp_final_pd2, Glossier_c_pd], ignore_index=True, axis=0)
#comp_final_pd3.count()
comp_final_pd3.head(20)

Unnamed: 0,day,avg(sentiment),Brand
0,2021-08 30,0.666667,Sephora
1,2022-05 15,1.454545,Sephora
2,2021-02 03,1.4,Sephora
3,2021-10 14,1.15,Sephora
4,2022-04 07,1.6,Sephora
5,2021-05 26,1.4,Sephora
6,2022-07 06,1.684211,Sephora
7,2022-01 26,1.9,Sephora
8,2022-06 01,1.428571,Sephora
9,2022-04 30,1.53125,Sephora


In [0]:
#Okay now all we have left to do is join on the day with the external data 
glossier_pd_final.rename(columns={'day': 'Date'}, inplace=True)
#print(glossier_pd.dtypes)
#make to datetime object so its consistent 
glossier_pd_final['Date']=pd.to_datetime(glossier_pd_final['Date'])
#print(glossier_pd_final.head(20))
comp_final_pd3.rename(columns={'day': 'Date'}, inplace=True)
#print(comp_final_pd3.dtypes)
comp_final_pd3['Date']=pd.to_datetime(comp_final_pd3['Date'])
#print(comp_final_pd3.head(20))
#google = google.toPandas()
#print(google_pd.dtypes)
google_pd['Date']=pd.to_datetime(google_pd['Date'])
#print(google_pd.head(20))
#Now we can do our join 
final_glossier=glossier_pd_final.merge(google_pd, on='Date')
#Since this is just for Glossier, delete columns we dont want such as Sephora, Ulta and Fenty 
final_glossier2=final_glossier.drop(columns=['Sephora', 'Ulta','Fenty'])
final_glossier2.rename(columns={'avg(sentiment)': 'average_sentiment'}, inplace=True)
#Now sort by date 
final_glossier3=final_glossier2.sort_values(by='Date')
print(final_glossier3)
#Now do the competitor data 
final_competitor=comp_final_pd3.merge(google_pd, on='Date')
final_competitor.rename(columns={'avg(sentiment)': 'average_sentiment'}, inplace=True)
#Now sort by date 
final_competitor2=final_competitor.sort_values(by='Date')
print(final_competitor2)

          Date  average_sentiment Glossier
509 2021-01-01           0.013614       35
428 2021-01-02           0.118040       41
300 2021-01-03           0.066365       44
482 2021-01-04           0.023517       32
437 2021-01-05           0.036824       35
..         ...                ...      ...
352 2022-08-27           1.459990       43
473 2022-08-28           1.835665       38
540 2022-08-29           1.486830       41
180 2022-08-30           1.626132       36
329 2022-08-31           1.903914       33

[608 rows x 3 columns]
           Date  average_sentiment     Brand Sephora Ulta Fenty Glossier
935  2021-01-01           1.500000   Sephora      88  100    70       35
936  2021-01-01           1.454545      Ulta      88  100    70       35
937  2021-01-01           1.826087     Fenty      88  100    70       35
938  2021-01-01           1.300000  Glossier      88  100    70       35
1842 2021-01-02           1.700000   Sephora      97   96   100       41
...         ...       

In [0]:
final_glossier3.to_csv("/dbfs/FileStore/google_glossier.csv")
final_competitor2.to_csv("/dbfs/FileStore/google_competitors.csv")