## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [2]:
dbutils.library.installPyPI("mlflow")
dbutils.library.restartPython()
import mlflow

In [3]:
from pyspark.sql.functions import isnan, when, count, col

from pyspark.sql import functions as F 
from pyspark.sql import Window

In [4]:
# File location and type
file_location = "/FileStore/tables/RS_v2_2006_03"
file_type = "json"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_train = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df_train)

archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,contest_mode,created_utc,distinguished,domain,edited,gilded,hidden,hide_score,id,is_crosspostable,is_reddit_media_domain,is_self,is_video,link_flair_css_class,link_flair_richtext,link_flair_text,link_flair_text_color,link_flair_type,locked,media,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,post_hint,preview,retrieved_on,rte_mode,score,secure_media,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status
True,codepoet,,,,List(),,,text,True,True,False,1141171234,,macgeekery.com,False,0,False,False,2icw,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2icw/well_that_was_a_bust/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Well That Was a Bust,http://www.macgeekery.com/opinion/well_that_was_a_bust,all_ads
True,scylla,,,,List(),,,text,True,True,False,1141171723,,msnbc.msn.com,False,0,False,False,2idn,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2idn/holocaust_why_david_irving_shouldnt_be_jailed_and/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right ),http://www.msnbc.msn.com/id/11569497/site/newsweek/,all_ads
True,tilto,,,,List(),,,text,True,True,False,1141171939,,iht.com,False,0,False,False,2ie4,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2ie4/google_shares_fall_sharply_as_cfo_announces/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Google shares fall sharply as CFO announces growth is slowing,http://www.iht.com/articles/2006/02/28/business/google.php,all_ads
True,Laibcoms,,,,List(),,,text,True,True,False,1141172196,,gameshogun.info,False,0,False,False,2iek,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2iek/newsvine_launching_tomorrow/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,newsvine launching tomorrow!,http://gameshogun.info/index.php/Tech/2006/03/01/newsvine_launching_tomorrow,all_ads
True,FaeLLe,,,,List(),,,text,True,True,False,1141172277,,faelle.com,False,0,False,False,2ies,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2ies/voodoopc_to_launch_8tb_media_pc/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,VoodooPC to launch 8TB Media PC,http://www.FaeLLe.com/2006/03/voodoopc-plans-8tb-media-pc.html,all_ads
True,Megasphaera,,,,List(),,,text,True,True,False,1141172696,,request.reddit.com,False,0,False,False,2if8,True,False,False,False,,List(),,dark,text,False,,True,1,0,False,all_ads,/r/reddit.com/comments/2if8/kafka_immigration/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,kafka immigration,http://request.reddit.com/goto?id=2i9k,all_ads
True,alsaad,,,,List(),,,text,False,True,False,1141173165,,pandora.com,False,0,False,False,2ig1,False,False,False,False,,List(),,dark,text,False,,True,0,0,False,,/r/pl/comments/2ig1/pandoracom_wybieraj_oceniaj_i_sluchaj_tego_co/,,,,markdown,4,,,True,False,False,pl,t5_2475,r/pl,restricted,,default,,,"Pandora.com - wybieraj, oceniaj i sluchaj tego co lubisz. Za free.",http://pandora.com/,
True,Megasphaera,,,,List(),,,text,True,True,False,1141173275,,rxpgnews.com,False,0,False,False,2ig8,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2ig8/meditation_changes_brain_structure/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,meditation changes brain structure,http://www.rxpgnews.com/research/neurosciences/article_2837.shtml,all_ads
True,benm,,,,List(),,,text,True,True,False,1141173366,,blogs.pragprog.com,False,0,False,False,2igf,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/programming/comments/2igf/annotate_models_plugin_for_rails/,,,,markdown,7,,,True,False,False,programming,t5_2fwo,r/programming,public,,default,,,Annotate Models Plugin for Rails,http://blogs.pragprog.com/cgi-bin/pragdave.cgi/Tech/Ruby/AnnotateModels.rdoc,all_ads
True,johnny_yuma,,,,List(),,,text,True,True,False,1141173368,,cbsnews.com,False,0,False,False,2igg,True,False,False,False,,List(),,dark,text,False,,False,1,0,False,all_ads,/r/reddit.com/comments/2igg/kids_build_soybeanfueled_car/,,,,markdown,9,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Kids Build Soybean-Fueled Car,http://www.cbsnews.com/stories/2006/02/17/eveningnews/main1329941.shtml,all_ads


In [5]:
len(df_train.columns)

In [6]:
columns_to_drop = ['archived', 'author_cakeday', 'author_flair_text_color', 'author_flair_background_color', 'author_flair_css_class', \
                   'author_flair_richtext', 'author_flair_text', 'author_flair_type', 'contest_mode', 'edited', 'gilded', 'hidden', 'hide_score', \
                   'is_reddit_media_domain', 'is_self', 'is_video', 'distinguished', 'link_flair_css_class', 'link_flair_richtext', \
                   'link_flair_text', 'post_hint', 'link_flair_text_color', 'link_flair_type', 'locked', 'media', 'rte_mode', 'preview', \
                   'retrieved_on', 'secure_media', 'selftext', 'send_replies', 'spoiler', 'stickied', 'thumbnail', 'thumbnail_height', \
                   'thumbnail_width', 'whitelist_status', 'num_crossposts', 'suggested_sort']

df_train = df_train.drop(*columns_to_drop)

In [7]:
display(df_train)

author,brand_safe,can_gild,created_utc,domain,id,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,permalink,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,title,url
codepoet,True,True,1141171234,macgeekery.com,2icw,True,True,0,False,all_ads,/r/reddit.com/comments/2icw/well_that_was_a_bust/,0,reddit.com,t5_6,r/reddit.com,archived,Well That Was a Bust,http://www.macgeekery.com/opinion/well_that_was_a_bust
scylla,True,True,1141171723,msnbc.msn.com,2idn,True,True,0,False,all_ads,/r/reddit.com/comments/2idn/holocaust_why_david_irving_shouldnt_be_jailed_and/,0,reddit.com,t5_6,r/reddit.com,archived,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right ),http://www.msnbc.msn.com/id/11569497/site/newsweek/
tilto,True,True,1141171939,iht.com,2ie4,True,True,0,False,all_ads,/r/reddit.com/comments/2ie4/google_shares_fall_sharply_as_cfo_announces/,0,reddit.com,t5_6,r/reddit.com,archived,Google shares fall sharply as CFO announces growth is slowing,http://www.iht.com/articles/2006/02/28/business/google.php
Laibcoms,True,True,1141172196,gameshogun.info,2iek,True,True,0,False,all_ads,/r/reddit.com/comments/2iek/newsvine_launching_tomorrow/,0,reddit.com,t5_6,r/reddit.com,archived,newsvine launching tomorrow!,http://gameshogun.info/index.php/Tech/2006/03/01/newsvine_launching_tomorrow
FaeLLe,True,True,1141172277,faelle.com,2ies,True,True,0,False,all_ads,/r/reddit.com/comments/2ies/voodoopc_to_launch_8tb_media_pc/,0,reddit.com,t5_6,r/reddit.com,archived,VoodooPC to launch 8TB Media PC,http://www.FaeLLe.com/2006/03/voodoopc-plans-8tb-media-pc.html
Megasphaera,True,True,1141172696,request.reddit.com,2if8,True,True,1,False,all_ads,/r/reddit.com/comments/2if8/kafka_immigration/,0,reddit.com,t5_6,r/reddit.com,archived,kafka immigration,http://request.reddit.com/goto?id=2i9k
alsaad,False,True,1141173165,pandora.com,2ig1,False,True,0,False,,/r/pl/comments/2ig1/pandoracom_wybieraj_oceniaj_i_sluchaj_tego_co/,4,pl,t5_2475,r/pl,restricted,"Pandora.com - wybieraj, oceniaj i sluchaj tego co lubisz. Za free.",http://pandora.com/
Megasphaera,True,True,1141173275,rxpgnews.com,2ig8,True,True,0,False,all_ads,/r/reddit.com/comments/2ig8/meditation_changes_brain_structure/,0,reddit.com,t5_6,r/reddit.com,archived,meditation changes brain structure,http://www.rxpgnews.com/research/neurosciences/article_2837.shtml
benm,True,True,1141173366,blogs.pragprog.com,2igf,True,True,0,False,all_ads,/r/programming/comments/2igf/annotate_models_plugin_for_rails/,7,programming,t5_2fwo,r/programming,public,Annotate Models Plugin for Rails,http://blogs.pragprog.com/cgi-bin/pragdave.cgi/Tech/Ruby/AnnotateModels.rdoc
johnny_yuma,True,True,1141173368,cbsnews.com,2igg,True,False,1,False,all_ads,/r/reddit.com/comments/2igg/kids_build_soybeanfueled_car/,9,reddit.com,t5_6,r/reddit.com,archived,Kids Build Soybean-Fueled Car,http://www.cbsnews.com/stories/2006/02/17/eveningnews/main1329941.shtml


In [8]:
len(df_train.columns)

In [9]:
# File location and type
file_location = "/FileStore/tables/RS_v2_2006_04-1"
file_type = "json"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_test = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df_test)

archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,contest_mode,created_utc,distinguished,domain,edited,gilded,hidden,hide_score,id,is_crosspostable,is_reddit_media_domain,is_self,is_video,link_flair_css_class,link_flair_richtext,link_flair_text,link_flair_text_color,link_flair_type,locked,media,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,post_hint,preview,retrieved_on,rte_mode,score,secure_media,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status
True,spif,,,,List(),,,text,True,True,False,1143849794,,democrats.reform.house.gov,False,0,False,False,3rip,True,False,False,False,,List(),,dark,text,False,,False,0,0,False,all_ads,/r/reddit.com/comments/3rip/iraq_on_the_record_a_searchable_collection_of_237/,,,,markdown,4,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Iraq on the Record: A Searchable Collection Of 237 Specific Misleading Statements Made By Bush Administration Officials About The Threat Posed By Iraq,http://democrats.reform.house.gov/IraqOnTheRecord/,all_ads
True,jjzak,,,,List(),,,text,False,True,False,1143849822,,rubricks.org,False,0,False,False,3rir,True,False,False,False,,List(),,dark,text,False,,False,0,0,False,,/r/ja/comments/3rir/rubricks_cms_on_rails/,,,,markdown,3,,,True,False,False,ja,t5_22i6,r/ja,public,new,default,,,Rubricks - CMS on Rails,http://rubricks.org/,
True,toddieg,,,,List(),,,text,True,True,False,1143849849,,randomcraponline.com,False,0,False,False,3ris,False,False,False,False,,List(),,dark,text,False,,True,1,0,False,all_ads,/r/reddit.com/comments/3ris/islamic_dance_party/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Islamic Dance Party,http://www.randomcraponline.com/islamic-dance-party/,all_ads
True,rmc,,,,List(),,,text,True,True,False,1143849910,,debian-administration.org,False,0,False,False,3riv,True,False,False,False,,List(),,dark,text,False,,False,1,0,False,all_ads,/r/reddit.com/comments/3riv/how_to_make_bash_complete_differently_based_on/,,,,markdown,3,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,How to make bash complete differently based on the command.,http://www.debian-administration.org/articles/317,all_ads
True,toddieg,,,,List(),,,text,False,True,False,1143850224,,randomcraponline.com,False,0,False,False,3rj0,False,False,False,False,,List(),,dark,text,False,,True,0,0,True,promo_adult_nsfw,/r/nsfw/comments/3rj0/hot_web_cam_chicks/,,,,markdown,0,,,True,False,False,nsfw,t5_vf2,r/nsfw,public,,default,,,Hot Web Cam Chicks,http://www.randomcraponline.com/web-cam-chicks/?bikini-thong/,promo_adult_nsfw
True,tiagocardoso,,,,List(),,,text,True,True,False,1143850300,,mainada.net,False,0,False,False,3rj3,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/3rj3/manga_style_other_sketches_see_the_artist_creation/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Manga Style & other sketches - see the artist creation,http://www.mainada.net/comics/index/mangastyle,all_ads
True,neotrantor,,,,List(),,,text,True,True,False,1143850524,,blog.cognitivelabs.com,False,0,False,False,3rj5,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/3rj5/scent_of_fear_improves_cognitive_performance_in/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Scent of Fear Improves Cognitive Performance in Women,http://blog.cognitivelabs.com/2006/03/scent-of-fear-improves-cognitive.html,all_ads
True,honekaimedia,,,,List(),,,text,True,True,False,1143850569,,techeblog.com,False,0,False,False,3rj6,True,False,False,False,,List(),,dark,text,False,,True,1,0,False,all_ads,/r/reddit.com/comments/3rj6/top_10_strangest_cell_phones/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Top 10 Strangest Cell Phones,http://www.techeblog.com/index.php/tech-gadget/top-10-strangest-cell-phones,all_ads
True,eglobe1,,,,List(),,,text,True,True,False,1143850981,,eglobe1.com,False,0,False,False,3rja,False,False,False,False,,List(),,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/3rja/ten_crazy_usb/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,ten crazy usb,http://www.eglobe1.com/index.php/2006/03/31/10-crazy-usb/,all_ads
True,otsune,,,,List(),,,text,False,True,False,1143851173,,bsddiary.net,False,0,False,False,3rjc,True,False,False,False,,List(),,dark,text,False,,True,0,0,False,,/r/ja/comments/3rjc/bind9_設定/,,,,markdown,3,,,True,False,False,ja,t5_22i6,r/ja,public,new,default,,,bind9 設定,http://www.bsddiary.net/doc/bind9.html,


In [10]:
len(df_test.columns)

In [11]:
# df.drop('author_cakeday').collect()

columns_to_drop = ['archived', 'author_cakeday', 'author_flair_text_color', 'author_flair_background_color', 'author_flair_css_class', \
                   'author_flair_richtext', 'author_flair_text', 'author_flair_type', 'contest_mode', 'edited', 'gilded', 'hidden', 'hide_score', \
                   'is_reddit_media_domain', 'is_self', 'is_video', 'distinguished', 'link_flair_css_class', 'link_flair_richtext', \
                   'link_flair_text', 'post_hint', 'link_flair_text_color', 'link_flair_type', 'locked', 'media', 'rte_mode', 'preview', \
                   'retrieved_on', 'secure_media', 'selftext', 'send_replies', 'spoiler', 'stickied', 'thumbnail', 'thumbnail_height', \
                   'thumbnail_width', 'whitelist_status', 'num_crossposts', 'suggested_sort']

df_test = df_test.drop(*columns_to_drop)

In [12]:
display(df_test)

author,brand_safe,can_gild,created_utc,domain,id,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,permalink,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,title,url
spif,True,True,1143849794,democrats.reform.house.gov,3rip,True,False,0,False,all_ads,/r/reddit.com/comments/3rip/iraq_on_the_record_a_searchable_collection_of_237/,4,reddit.com,t5_6,r/reddit.com,archived,Iraq on the Record: A Searchable Collection Of 237 Specific Misleading Statements Made By Bush Administration Officials About The Threat Posed By Iraq,http://democrats.reform.house.gov/IraqOnTheRecord/
jjzak,False,True,1143849822,rubricks.org,3rir,True,False,0,False,,/r/ja/comments/3rir/rubricks_cms_on_rails/,3,ja,t5_22i6,r/ja,public,Rubricks - CMS on Rails,http://rubricks.org/
toddieg,True,True,1143849849,randomcraponline.com,3ris,False,True,1,False,all_ads,/r/reddit.com/comments/3ris/islamic_dance_party/,0,reddit.com,t5_6,r/reddit.com,archived,Islamic Dance Party,http://www.randomcraponline.com/islamic-dance-party/
rmc,True,True,1143849910,debian-administration.org,3riv,True,False,1,False,all_ads,/r/reddit.com/comments/3riv/how_to_make_bash_complete_differently_based_on/,3,reddit.com,t5_6,r/reddit.com,archived,How to make bash complete differently based on the command.,http://www.debian-administration.org/articles/317
toddieg,False,True,1143850224,randomcraponline.com,3rj0,False,True,0,True,promo_adult_nsfw,/r/nsfw/comments/3rj0/hot_web_cam_chicks/,0,nsfw,t5_vf2,r/nsfw,public,Hot Web Cam Chicks,http://www.randomcraponline.com/web-cam-chicks/?bikini-thong/
tiagocardoso,True,True,1143850300,mainada.net,3rj3,True,True,0,False,all_ads,/r/reddit.com/comments/3rj3/manga_style_other_sketches_see_the_artist_creation/,0,reddit.com,t5_6,r/reddit.com,archived,Manga Style & other sketches - see the artist creation,http://www.mainada.net/comics/index/mangastyle
neotrantor,True,True,1143850524,blog.cognitivelabs.com,3rj5,True,True,0,False,all_ads,/r/reddit.com/comments/3rj5/scent_of_fear_improves_cognitive_performance_in/,0,reddit.com,t5_6,r/reddit.com,archived,Scent of Fear Improves Cognitive Performance in Women,http://blog.cognitivelabs.com/2006/03/scent-of-fear-improves-cognitive.html
honekaimedia,True,True,1143850569,techeblog.com,3rj6,True,True,1,False,all_ads,/r/reddit.com/comments/3rj6/top_10_strangest_cell_phones/,0,reddit.com,t5_6,r/reddit.com,archived,Top 10 Strangest Cell Phones,http://www.techeblog.com/index.php/tech-gadget/top-10-strangest-cell-phones
eglobe1,True,True,1143850981,eglobe1.com,3rja,False,True,0,False,all_ads,/r/reddit.com/comments/3rja/ten_crazy_usb/,0,reddit.com,t5_6,r/reddit.com,archived,ten crazy usb,http://www.eglobe1.com/index.php/2006/03/31/10-crazy-usb/
otsune,False,True,1143851173,bsddiary.net,3rjc,True,True,0,False,,/r/ja/comments/3rjc/bind9_設定/,3,ja,t5_22i6,r/ja,public,bind9 設定,http://www.bsddiary.net/doc/bind9.html


In [13]:
len(df_test.columns)

In [14]:
df_train = df_train.fillna( {'parent_whitelist_status':'no_status'} )
df_test = df_test.fillna( {'parent_whitelist_status':'no_status'} )

In [15]:
display(df_train.select([count(when(col(c).isNull(), c)).alias(c) for c in df_train.columns]))

author,brand_safe,can_gild,created_utc,domain,id,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,permalink,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,title,url
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
display(df_test.select([count(when(col(c).isNull(), c)).alias(c) for c in df_test.columns]))

author,brand_safe,can_gild,created_utc,domain,id,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,permalink,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,title,url
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
columns_to_drop2 = ['created_utc', 'id', 'permalink', 'subreddit_id', 'subreddit_name_prefixed', 'url']

df_train = df_train.drop(*columns_to_drop2)

In [18]:
len(df_train.columns)

In [19]:
columns_to_drop2 = ['created_utc', 'id', 'permalink', 'subreddit_id', 'subreddit_name_prefixed', 'url']

df_test = df_test.drop(*columns_to_drop2)

In [20]:
len(df_test.columns)

In [21]:
display(df_train)

author,brand_safe,can_gild,domain,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,score,subreddit,subreddit_type,title
codepoet,True,True,macgeekery.com,True,True,0,False,all_ads,0,reddit.com,archived,Well That Was a Bust
scylla,True,True,msnbc.msn.com,True,True,0,False,all_ads,0,reddit.com,archived,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right )
tilto,True,True,iht.com,True,True,0,False,all_ads,0,reddit.com,archived,Google shares fall sharply as CFO announces growth is slowing
Laibcoms,True,True,gameshogun.info,True,True,0,False,all_ads,0,reddit.com,archived,newsvine launching tomorrow!
FaeLLe,True,True,faelle.com,True,True,0,False,all_ads,0,reddit.com,archived,VoodooPC to launch 8TB Media PC
Megasphaera,True,True,request.reddit.com,True,True,1,False,all_ads,0,reddit.com,archived,kafka immigration
alsaad,False,True,pandora.com,False,True,0,False,no_status,4,pl,restricted,"Pandora.com - wybieraj, oceniaj i sluchaj tego co lubisz. Za free."
Megasphaera,True,True,rxpgnews.com,True,True,0,False,all_ads,0,reddit.com,archived,meditation changes brain structure
benm,True,True,blogs.pragprog.com,True,True,0,False,all_ads,7,programming,public,Annotate Models Plugin for Rails
johnny_yuma,True,True,cbsnews.com,True,False,1,False,all_ads,9,reddit.com,archived,Kids Build Soybean-Fueled Car


In [22]:
df_train_p  = df_train.toPandas()

In [23]:
from pyspark.ml.feature import RegexTokenizer

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'domain', outputCol = 'domain_tok')

df_train = regexTokenizer.transform(df_train)
display(df_train)

author,brand_safe,can_gild,domain,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,score,subreddit,subreddit_type,title,domain_tok
codepoet,True,True,macgeekery.com,True,True,0,False,all_ads,0,reddit.com,archived,Well That Was a Bust,"List(macgeekery, com)"
scylla,True,True,msnbc.msn.com,True,True,0,False,all_ads,0,reddit.com,archived,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right ),"List(msnbc, msn, com)"
tilto,True,True,iht.com,True,True,0,False,all_ads,0,reddit.com,archived,Google shares fall sharply as CFO announces growth is slowing,"List(iht, com)"
Laibcoms,True,True,gameshogun.info,True,True,0,False,all_ads,0,reddit.com,archived,newsvine launching tomorrow!,"List(gameshogun, info)"
FaeLLe,True,True,faelle.com,True,True,0,False,all_ads,0,reddit.com,archived,VoodooPC to launch 8TB Media PC,"List(faelle, com)"
Megasphaera,True,True,request.reddit.com,True,True,1,False,all_ads,0,reddit.com,archived,kafka immigration,"List(request, reddit, com)"
alsaad,False,True,pandora.com,False,True,0,False,no_status,4,pl,restricted,"Pandora.com - wybieraj, oceniaj i sluchaj tego co lubisz. Za free.","List(pandora, com)"
Megasphaera,True,True,rxpgnews.com,True,True,0,False,all_ads,0,reddit.com,archived,meditation changes brain structure,"List(rxpgnews, com)"
benm,True,True,blogs.pragprog.com,True,True,0,False,all_ads,7,programming,public,Annotate Models Plugin for Rails,"List(blogs, pragprog, com)"
johnny_yuma,True,True,cbsnews.com,True,False,1,False,all_ads,9,reddit.com,archived,Kids Build Soybean-Fueled Car,"List(cbsnews, com)"


In [24]:
# remove stopwords
from pyspark.ml.feature import StopWordsRemover
swr = StopWordsRemover(inputCol = 'domain_tok', outputCol = 'domain_tok_sw')

df_train = swr.transform(df_train)
display(df_train)
# reviews_swr.write.saveAsTable('reviews_swr', mode = 'overwrite')

author,brand_safe,can_gild,domain,is_crosspostable,no_follow,num_comments,over_18,parent_whitelist_status,score,subreddit,subreddit_type,title,domain_tok,domain_tok_sw
codepoet,True,True,macgeekery.com,True,True,0,False,all_ads,0,reddit.com,archived,Well That Was a Bust,"List(macgeekery, com)","List(macgeekery, com)"
scylla,True,True,msnbc.msn.com,True,True,0,False,all_ads,0,reddit.com,archived,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right ),"List(msnbc, msn, com)","List(msnbc, msn, com)"
tilto,True,True,iht.com,True,True,0,False,all_ads,0,reddit.com,archived,Google shares fall sharply as CFO announces growth is slowing,"List(iht, com)","List(iht, com)"
Laibcoms,True,True,gameshogun.info,True,True,0,False,all_ads,0,reddit.com,archived,newsvine launching tomorrow!,"List(gameshogun, info)","List(gameshogun, info)"
FaeLLe,True,True,faelle.com,True,True,0,False,all_ads,0,reddit.com,archived,VoodooPC to launch 8TB Media PC,"List(faelle, com)","List(faelle, com)"
Megasphaera,True,True,request.reddit.com,True,True,1,False,all_ads,0,reddit.com,archived,kafka immigration,"List(request, reddit, com)","List(request, reddit, com)"
alsaad,False,True,pandora.com,False,True,0,False,no_status,4,pl,restricted,"Pandora.com - wybieraj, oceniaj i sluchaj tego co lubisz. Za free.","List(pandora, com)","List(pandora, com)"
Megasphaera,True,True,rxpgnews.com,True,True,0,False,all_ads,0,reddit.com,archived,meditation changes brain structure,"List(rxpgnews, com)","List(rxpgnews, com)"
benm,True,True,blogs.pragprog.com,True,True,0,False,all_ads,7,programming,public,Annotate Models Plugin for Rails,"List(blogs, pragprog, com)","List(blogs, pragprog, com)"
johnny_yuma,True,True,cbsnews.com,True,False,1,False,all_ads,9,reddit.com,archived,Kids Build Soybean-Fueled Car,"List(cbsnews, com)","List(cbsnews, com)"


In [25]:
from pyspark.ml.feature import Word2Vec

#create an average word vector for each document (works well according to Zeyu & Shu)
word2vec = Word2Vec(vectorSize = 10, minCount = 5, inputCol = 'domain_tok_sw', outputCol = 'domain_tok_sw_w2v')
model = word2vec.fit(df_train)
df_train = model.transform(df_train)

# display(result)
df_train.show(1, truncate = True)

In [26]:
regexTokenizer2 = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'title', outputCol = 'title_tok')

df_train = regexTokenizer2.transform(df_train)

swr2 = StopWordsRemover(inputCol = 'title_tok', outputCol = 'title_tok_sw')

df_train = swr2.transform(df_train)

word2vec2 = Word2Vec(vectorSize = 10, minCount = 5, inputCol = 'title_tok_sw', outputCol = 'title_tok_sw_w2v')
model2 = word2vec2.fit(df_train)
df_train = model2.transform(df_train)

df_train.show(1)

In [27]:
regexTokenizer_t = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'title', outputCol = 'title_tok')

df_test = regexTokenizer_t.transform(df_test)

swr_t = StopWordsRemover(inputCol = 'title_tok', outputCol = 'title_tok_sw')

df_test = swr_t.transform(df_test)

word2vec_t = Word2Vec(vectorSize = 10, minCount = 5, inputCol = 'title_tok_sw', outputCol = 'title_tok_sw_w2v')
model_t = word2vec_t.fit(df_test)
df_test = model_t.transform(df_test)

df_test.show(1)

In [28]:
regexTokenizer_t2 = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'domain', outputCol = 'domain_tok')

df_test = regexTokenizer_t2.transform(df_test)

swr_t2 = StopWordsRemover(inputCol = 'domain_tok', outputCol = 'domain_tok_sw')

df_test = swr_t2.transform(df_test)

word2vec_t2 = Word2Vec(vectorSize = 10, minCount = 5, inputCol = 'domain_tok_sw', outputCol = 'domain_tok_sw_w2v')
model_t2 = word2vec_t2.fit(df_test)
df_test = model_t2.transform(df_test)

df_test.show(1)

In [29]:
df_train.columns

In [30]:
columns_to_drop = ['author', 'domain', 'title', 'domain_tok', 'domain_tok_sw', 'title_tok', 'title_tok_sw']

df_train = df_train.drop(*columns_to_drop)
df_test = df_test.drop(*columns_to_drop)

In [31]:
len(df_train.columns), len(df_test.columns)

In [32]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['brand_safe', 'can_gild', 'is_crosspostable', 'no_follow', 'over_18', \
                                     'domain_tok_sw_w2v', 'title_tok_sw_w2v'], outputCol = 'features')
df_t = vectorAssembler.transform(df_train)
df_t = df_t.select(['features', 'score'])
df_t.show(3)

In [33]:
vectorAssembler_t = VectorAssembler(inputCols = ['brand_safe', 'can_gild', 'is_crosspostable', 'no_follow', 'over_18', \
                                     'domain_tok_sw_w2v', 'title_tok_sw_w2v'], outputCol = 'features')
df_t2 = vectorAssembler_t.transform(df_test)
df_t2 = df_t2.select(['features', 'score'])
df_t2.show(3)

In [34]:
display(df_t)

features,score
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.015800124034285545, 0.02123221382498741, -0.0022580502554774284, -0.0097417663782835, -0.005755799822509289, -0.004218887072056532, -0.0067719086073338985, -0.01841377653181553, -0.01630293019115925, 0.018032256513834))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2198091975102822, -0.5566953718662262, -0.37900424997011817, 0.1516778568426768, 0.31289644042650855, 0.08059688284993172, 0.19617316623528797, -0.19998193780581155, 0.05908615266283353, 0.29586882392565406, -0.005773128675562994, -0.012588509391727192, -0.020303547249308652, 0.005543550609477929, 0.019453893615198985, 0.00971975816147668, -0.011488526003501776, -0.01310856786689588, -0.002863977603348238, -0.004415305863533701))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2916935980319977, -0.6049031801521778, -0.42878722690511495, -0.0017236322164535522, 0.3289696762803942, 0.21810894459486008, 0.12646116316318512, -0.40581999509595335, -0.0017805993556976318, 0.4422566844150424, 0.02898435399401933, 0.010768831241875887, -0.0532741661882028, 0.007837481563910842, 0.019248876080382615, 0.03130364022217691, -0.0042136122938245535, 0.008642591070383787, 0.0025228934828191996, 0.005627083010040224))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.06950278207659721, -0.09616354992613196, -0.2503345283330418, -0.2301531843841076, -0.0982899647206068, 0.054081156849861145, -0.13172189984470606, 0.06605144450441003, 0.14204961759969592, -0.02052687481045723, 0.010734148944417635, -0.0021840371191501617, -0.024106156080961227, 0.00799249392002821, 0.027912658949693043, -0.016466011914114155, 0.024089742451906204, -0.02044922610123952, 0.004474412960310777, 0.020337664832671482))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.004096581041812897, -0.010167287569493056, -0.01658760318532586, 0.011622516810894013, 0.01980698322877288, 0.006437075883150101, -0.004793687164783478, 0.01516856998205185, -0.016549307480454446, -0.0012607873417437077))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2870828260978063, -0.49103987713654834, -0.38307138284047443, 0.012741322939594586, 0.4146902107944091, 0.2853822310765584, 0.12415572566290696, -0.3079897020943463, -0.06019148230552673, 0.41112400839726126, 0.004198136273771524, -0.012488183565437794, -0.03997119516134262, 0.010088421404361725, 0.023168547078967094, -0.011859181337058544, 0.008104964159429073, 3.9321655640378594E-4, -0.012698360718786716, 0.02314193919301033))",0
"List(1, 25, List(), List(0.0, 1.0, 0.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, -0.011909103766083718, 0.003631064482033253, -0.005362836667336524, 0.025806966796517374, 0.029012725688517094, 0.004187270998954773, 0.003417332749813795, 0.009690045076422394, -0.0020867966115474704, -6.760217249393464E-4))",4
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.035641176626086235, 0.001686709700152278, -0.033007376827299595, 0.012626323034055531, 0.024362491676583886, -0.012495958944782615, 0.019322329200804234, 0.034846687223762274, 0.015529385767877102, 0.011660400894470513))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.39519235491752625, -0.5173403024673462, -0.2656558404366175, -0.031718204418818154, 0.4818037152290344, 0.32045236229896545, 0.1761237730582555, -0.4187755386034647, -0.03577855726083119, 0.5317473014195759, 0.015937167685478926, 0.0029655606485903263, -0.05199255677871406, 0.044233604334294796, 0.03863402991555631, 0.006631079129874706, -0.022522698040120304, -0.004717825911939144, 0.010818022768944502, -0.01006298791617155))",7
"List(1, 25, List(), List(1.0, 1.0, 1.0, 0.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.008500193152576685, 0.010831071063876152, -0.044429519400000575, 0.034107455238699916, 0.030298941209912302, -0.0010508798062801362, -0.013425495103001596, 0.01524256318807602, -4.253140650689602E-5, 0.025596008449792863))",9


In [35]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='score', maxIter=1000, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(df_t)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

In [36]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [37]:
lr_predictions = lr_model.transform(df_t2)
lr_predictions.select("prediction","score","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="score",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

In [38]:
test_result = lr_model.evaluate(df_t2)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

In [39]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

In [40]:
display(df_t)

features,score
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.015800124034285545, 0.02123221382498741, -0.0022580502554774284, -0.0097417663782835, -0.005755799822509289, -0.004218887072056532, -0.0067719086073338985, -0.01841377653181553, -0.01630293019115925, 0.018032256513834))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2198091975102822, -0.5566953718662262, -0.37900424997011817, 0.1516778568426768, 0.31289644042650855, 0.08059688284993172, 0.19617316623528797, -0.19998193780581155, 0.05908615266283353, 0.29586882392565406, -0.005773128675562994, -0.012588509391727192, -0.020303547249308652, 0.005543550609477929, 0.019453893615198985, 0.00971975816147668, -0.011488526003501776, -0.01310856786689588, -0.002863977603348238, -0.004415305863533701))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2916935980319977, -0.6049031801521778, -0.42878722690511495, -0.0017236322164535522, 0.3289696762803942, 0.21810894459486008, 0.12646116316318512, -0.40581999509595335, -0.0017805993556976318, 0.4422566844150424, 0.02898435399401933, 0.010768831241875887, -0.0532741661882028, 0.007837481563910842, 0.019248876080382615, 0.03130364022217691, -0.0042136122938245535, 0.008642591070383787, 0.0025228934828191996, 0.005627083010040224))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.06950278207659721, -0.09616354992613196, -0.2503345283330418, -0.2301531843841076, -0.0982899647206068, 0.054081156849861145, -0.13172189984470606, 0.06605144450441003, 0.14204961759969592, -0.02052687481045723, 0.010734148944417635, -0.0021840371191501617, -0.024106156080961227, 0.00799249392002821, 0.027912658949693043, -0.016466011914114155, 0.024089742451906204, -0.02044922610123952, 0.004474412960310777, 0.020337664832671482))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.004096581041812897, -0.010167287569493056, -0.01658760318532586, 0.011622516810894013, 0.01980698322877288, 0.006437075883150101, -0.004793687164783478, 0.01516856998205185, -0.016549307480454446, -0.0012607873417437077))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.2870828260978063, -0.49103987713654834, -0.38307138284047443, 0.012741322939594586, 0.4146902107944091, 0.2853822310765584, 0.12415572566290696, -0.3079897020943463, -0.06019148230552673, 0.41112400839726126, 0.004198136273771524, -0.012488183565437794, -0.03997119516134262, 0.010088421404361725, 0.023168547078967094, -0.011859181337058544, 0.008104964159429073, 3.9321655640378594E-4, -0.012698360718786716, 0.02314193919301033))",0
"List(1, 25, List(), List(0.0, 1.0, 0.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, -0.011909103766083718, 0.003631064482033253, -0.005362836667336524, 0.025806966796517374, 0.029012725688517094, 0.004187270998954773, 0.003417332749813795, 0.009690045076422394, -0.0020867966115474704, -6.760217249393464E-4))",4
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.035641176626086235, 0.001686709700152278, -0.033007376827299595, 0.012626323034055531, 0.024362491676583886, -0.012495958944782615, 0.019322329200804234, 0.034846687223762274, 0.015529385767877102, 0.011660400894470513))",0
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.39519235491752625, -0.5173403024673462, -0.2656558404366175, -0.031718204418818154, 0.4818037152290344, 0.32045236229896545, 0.1761237730582555, -0.4187755386034647, -0.03577855726083119, 0.5317473014195759, 0.015937167685478926, 0.0029655606485903263, -0.05199255677871406, 0.044233604334294796, 0.03863402991555631, 0.006631079129874706, -0.022522698040120304, -0.004717825911939144, 0.010818022768944502, -0.01006298791617155))",7
"List(1, 25, List(), List(1.0, 1.0, 1.0, 0.0, 0.0, 0.3544502854347229, -0.6449003219604492, -0.4270171523094177, -0.1300344318151474, 0.3328089118003845, 0.1513465791940689, 0.051165349781513214, -0.40864062309265137, 0.07422507554292679, 0.45328983664512634, 0.008500193152576685, 0.010831071063876152, -0.044429519400000575, 0.034107455238699916, 0.030298941209912302, -0.0010508798062801362, -0.013425495103001596, 0.01524256318807602, -4.253140650689602E-5, 0.025596008449792863))",9


In [41]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="score", featuresCol="features")

rf_model = rf.fit(df_t)

predictions = rf_model.transform(df_t2)

In [42]:
display(predictions)

features,score,prediction
"List(1, 25, List(), List(1.0, 1.0, 1.0, 0.0, 0.0, 0.01163703203201294, 0.018528031185269356, -4.7532934695482254E-4, -0.009292944334447384, 0.0027298512868583202, -0.002496141940355301, 0.018059439957141876, -0.001220962032675743, 0.016040101647377014, -0.0054797520861029625, -0.003396984686454137, 0.003152846141407887, -0.0321441267306606, 0.004761212412267924, 0.039769863872788844, -0.001328083345045646, -0.036600311286747456, 0.026752047209690013, 0.0012804487099250157, 0.02216167648633321))",4,20.08248928226738
"List(1, 25, List(), List(0.0, 1.0, 1.0, 0.0, 0.0, 0.24950194358825684, -0.020930277183651924, -0.18183720111846924, 0.2022489756345749, 0.4538852572441101, 0.5643795728683472, 0.08349420875310898, -0.4040583372116089, 0.01895042695105076, 0.1168494001030922, -0.008581571280956268, -0.020775021364291508, -0.016004205991824467, -0.01982708399494489, 0.009235601251324017, -0.0043544284999370575, -0.013786935557921726, -0.001233439582089583, -0.0038555342083175974, 0.0010204976424574852))",3,15.1222670285436
"List(1, 25, List(), List(1.0, 1.0, 0.0, 1.0, 0.0, 0.21803046762943268, -0.47431802842766047, -0.23690887726843357, -0.3911142647266388, -0.0011392831802368164, -0.18379954807460308, -0.4019920900464058, -0.058268796652555466, 0.030791383236646652, 0.3372248634696007, -0.027211878759165604, -0.021771760036547978, 0.021709407369295754, 0.009049680549651384, -0.010956250752011934, -0.018087375909090042, 0.004524710277716318, -0.0022423543656865754, -0.017218480662753183, -0.006941353591779867))",0,2.3844394549792263
"List(1, 25, List(), List(1.0, 1.0, 1.0, 0.0, 0.0, 0.1663346290588379, -0.013953518122434616, -0.12122480074564615, 0.13483265042304993, 0.30259017149607337, 0.3762530485788981, 0.05566280583540598, -0.26937222480773926, 0.012633617967367172, 0.07789960006872812, -0.011931269119183222, 0.0015300199932729204, -0.013037876846889654, -0.00404514092952013, 0.02305790999283393, 0.008133221107224623, -0.02081382346417134, 0.05468489117144296, -0.018711174217363197, 0.019643416240190465))",3,32.93097371301068
"List(1, 25, List(), List(0.0, 1.0, 0.0, 1.0, 1.0, 0.21803046762943268, -0.47431802842766047, -0.23690887726843357, -0.3911142647266388, -0.0011392831802368164, -0.18379954807460308, -0.4019920900464058, -0.058268796652555466, 0.030791383236646652, 0.3372248634696007, -0.027671108953654766, 0.04154927562922239, -0.006094903685152531, 0.008473582391161472, 0.03164350055158138, -0.009911970002576709, -0.00785964378155768, 0.02851010113954544, -0.04523683758452535, -0.00813448615372181))",0,5.990642618562214
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.0748219033703208, -0.3186250440776348, 0.16949260979890823, 0.7122031012549996, 0.21246505063027143, -0.08737022802233696, 0.05481266789138317, -0.13604910671710968, -0.19225259870290756, 0.4397807791829109, -0.0033313754746424297, 0.0196296785558973, -0.006119988858699799, 0.0072330671495624945, -0.01083363898630653, 0.018903604708611965, -0.014160561375319958, 0.016897618837122406, -0.0018269295022556823, -0.010999935013907296))",0,1.367696013102703
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.11850496381521225, -0.4273155555129051, -0.23479899143179256, -0.20534645020961761, 0.060374402130643524, -0.012763197223345438, -0.3107160006960233, -0.060718162606159844, 0.0441765886110564, 0.26457426076134044, -0.006829228795443972, 0.029722367723782856, -0.035174701983729996, 0.039567273731033005, 0.03718594298698008, 0.002982049365527928, -0.030011071357876062, 0.03070251845444242, -0.004444971952276925, 0.017960386894022424))",0,1.4685143046567335
"List(1, 25, List(), List(1.0, 1.0, 1.0, 1.0, 0.0, 0.15742982923984528, -0.500049801543355, -0.29093921929597855, -0.3333391025662422, -0.15160523355007172, -0.09974243491888046, -0.4233591668307781, 0.04152049869298935, 0.00580219179391861, 0.3258182182908058, -0.03073304444551468, 0.04304679520428181, -0.034380271285772326, 0.009451311454176903, 0.07352487542666496, 0.0042274782434105875, 0.017198189720511438, 0.015386471338570119, -0.03260105603840202, 0.00834011440165341))",0,1.779465576889672
"List(1, 25, List(), List(1.0, 1.0, 0.0, 1.0, 0.0, 0.25432897731661797, -0.4877329566515982, -0.23867657547816634, -0.38774542324244976, 0.0673304945230484, -0.18674437887966633, -0.3831180278211832, -0.06379805505275726, 0.0461353175342083, 0.36662232875823975, -0.05523281854887803, -0.0035656384813288846, -0.028380523125330605, 0.016754571658869583, 0.06660199561156332, -0.0036968531397481756, -0.02491423084090153, 0.07857262839873631, -0.004808882251381874, 0.05366195614139239))",0,1.290277754203213
"List(0, 25, List(1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), List(1.0, 1.0, 1.0, 0.06500685214996338, -0.341022253036499, 0.04110928624868393, 0.6993443965911865, 0.2083054482936859, 0.035665739327669144, 0.06798604130744934, -0.12820866703987122, -0.15845443308353424, 0.5225892066955566))",3,2.6797883123007464


In [43]:
import mlflow
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[rf])

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.tuning import ParamGridBuilder
import numpy as np

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 10, stop = 50, num = 3)]) \
    .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 5, stop = 25, num = 3)]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3)

cvModel = crossval.fit(df_t)

predictions = cvModel.transform(df_t2)

In [44]:
import matplotlib.pyplot as plt

evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)

rfPred = cvModel.transform(df)

rfResult = rfPred.toPandas()

plt.plot(rfResult.label, rfResult.prediction, 'bo')
plt.xlabel('Price')
plt.ylabel('Prediction')
plt.suptitle("Model Performance RMSE: %f" % rmse)
plt.show()

In [45]:
display(df_train)

In [46]:
import seaborn as sns

sns.boxplot("score", data=df_train_p[df_train_p["score"]<=200])

In [47]:
display(df_train.select("*").where(df_train.score <= 200))

In [48]:
df_train_num = df_train.select("brand_safe", "can_gild", "is_crosspostable", "num_comments", "no_follow", "over_18",  "score")
display(df_train_num)

In [49]:
display(features.collect())

In [50]:
features = df_train_num.rdd.map(lambda row: row[0:])

from pyspark.mllib.stat import Statistics

corr_mat=Statistics.corr(features, method="pearson")

In [51]:
print(corr_mat)

In [52]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# convert to vector column first
vector_col = "score"
assembler = VectorAssembler(inputCols=df_train_num.columns, outputCol=vector_col)
df_vector = assembler.transform(df_train_num).select(vector_col)

# get correlation matrix
matrix = Correlation.corr(df_vector, vector_col)

In [53]:
import numpy as np
import matplotlib.pyplot as plt

In [54]:
var = 'score'
x = df_train[var].values
bins = np.arange(0, 100, 5.0)

plt.figure(figsize=(10,8))
# the histogram of the data
plt.hist(x, bins, alpha=0.8, histtype='bar', color='gold',
         ec='black',weights=np.zeros_like(x) + 100. / x.size)

plt.xlabel(var)
plt.ylabel('percentage')
plt.xticks(bins)
plt.show()

In [55]:
df_train["score"].values

In [56]:
%sql

/* Query the created temp table in a SQL cell */

select * from `RS_v2_2006_03`

In [57]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "RS_v2_2006_03"

# df.write.format("parquet").saveAsTable(permanent_table_name)