In [76]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
import re
import string

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Exploratory_Analysis") \
    .config("spark.executor.memory", '8g') \
    .config("spark.executor.cores", '4') \
    .config('spark.cores.max', '4') \
    .config('spark.driver.memory', '8g') \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
beers = spark.read.format('csv'). \
    option("header", "true"). \
    option("inferSchema", "true"). \
    load("/home/aaron/BigData135/datasets/beers.csv")

In [3]:
breweries = spark.read.format('csv'). \
    option("header", "true"). \
    option("inferSchema", "true"). \
    load("/home/aaron/BigData135/datasets/breweries.csv")

In [4]:
reviews = spark.read.format('csv'). \
    option("header", "true"). \
    option("inferSchema", "true"). \
    load("/home/aaron/BigData135/datasets/reviews.csv")

In [5]:
beers.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- brewery_id: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- style: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- abv: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- retired: string (nullable = true)



In [6]:
breweries.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- types: string (nullable = true)



In [7]:
reviews.printSchema()

root
 |-- beer_id: integer (nullable = true)
 |-- username: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- look: string (nullable = true)
 |-- smell: string (nullable = true)
 |-- taste: string (nullable = true)
 |-- feel: string (nullable = true)
 |-- overall: string (nullable = true)
 |-- score: string (nullable = true)



In [9]:
beers.show(5)

+------+--------------------+----------+-----+-------+--------------------+------------+----+--------------------+-------+
|    id|                name|brewery_id|state|country|               style|availability| abv|               notes|retired|
+------+--------------------+----------+-----+-------+--------------------+------------+----+--------------------+-------+
|202522|      Olde Cogitator|      2199|   CA|     US|English Oatmeal S...|    Rotating| 7.3|No notes at this ...|      f|
| 82352|Konrads Stout Rus...|     18604| null|     NO|Russian Imperial ...|    Rotating|10.4|No notes at this ...|      f|
|214879|      Scottish Right|     44306|   IN|     US|        Scottish Ale|  Year-round|   4|No notes at this ...|      t|
|320009|MegaMeow Imperial...|      4378|   WA|     US|American Imperial...|      Winter| 8.7|Every time this year|      f|
|246438|     Peaches-N-Cream|     44617|   PA|     US|  American Cream Ale|    Rotating| 5.1|No notes at this ...|      f|
+------+--------

In [100]:
beers.count()

358873

In [10]:
breweries.show(5)

+-----+--------------------+--------------+-----+-------+--------------------+--------------------+
|   id|                name|          city|state|country|               notes|               types|
+-----+--------------------+--------------+-----+-------+--------------------+--------------------+
|19730|     Brouwerij Danny|     Erpe-Mere| null|     BE|No notes at this ...|             Brewery|
|32541|Coachella Valley ...|Thousand Palms|   CA|     US|No notes at this ...|Brewery, Bar, Bee...|
|44736|    Beef 'O' Brady's|    Plant City|   FL|     US|No notes at this ...|         Bar, Eatery|
|23372|Broadway Wine Mer...| Oklahoma City|   OK|     US|No notes at this ...|               Store|
|35328|Brighton Beer Dis...|      Brighton|  GB2|     GB|Duplicate of http...|         Bar, Eatery|
+-----+--------------------+--------------+-----+-------+--------------------+--------------------+
only showing top 5 rows



In [11]:
reviews.show(5)

+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+
|beer_id|       username|               date|                text|                look|               smell| taste|                feel|          overall|             score|
+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+
| 271781|   bluejacket74|2017-03-17 00:00:00|   750 ml bottle,...|                   4|                   4|     4|                4.25|                4|              4.03|
| 125646|        _dirty_|2017-12-21 00:00:00|                    |                 4.5|                 4.5|   4.5|                 4.5|              4.5|               4.5|
| 125646|        CJDUBYA|2017-12-21 00:00:00|                    |                4.75|                4.75|  4.75|               

In [21]:
(reviews.filter(reviews['text'] != '\xa0\xa0')).count()

2987993

In [30]:
non_empty_reviews = reviews.filter(reviews['text'] != '\xa0\xa0')

In [31]:
non_empty_reviews.show(5)

+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+
|beer_id|       username|               date|                text|                look|               smell| taste|                feel|          overall|             score|
+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+
| 271781|   bluejacket74|2017-03-17 00:00:00|   750 ml bottle,...|                   4|                   4|     4|                4.25|                4|              4.03|
| 125646|GratefulBeerGuy|2017-12-20 00:00:00|"   0% 16 oz can....| bloomin' like a ...| totally unfilter...| thick| all-white clumps...| mellon and mango| grainy earthiness|
| 125646|       LukeGude|2017-12-20 00:00:00|   Classic TH NEI...|                4.25|                 4.5|  4.25|               

In [35]:
non_empty_reviews.dtypes

[('beer_id', 'int'),
 ('username', 'string'),
 ('date', 'timestamp'),
 ('text', 'string'),
 ('look', 'string'),
 ('smell', 'string'),
 ('taste', 'string'),
 ('feel', 'string'),
 ('overall', 'string'),
 ('score', 'string')]

In [50]:
non_empty_reviews.describe().show()

+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|          beer_id|            username|                text|                look|               smell|               taste|                feel|             overall|               score|
+-------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|          2987993|             2984209|             2987993|             2831961|             2830771|             2829717|             2828842|             2828088|             2983385|
|   mean|63296.20292818624|1.8038932242394958E9|                null|  3.9394156210280866|  3.8445426409163534|   3.870134391442983|   3.835104948841218|   3.864983629690377|  3.8468598629637483|
| stddev|76771.34267

In [61]:
non_empty_reviews.select('look','smell','taste','feel','overall','score').show(50)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                look|               smell|               taste|                feel|             overall|               score|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   4|                   4|                   4|                4.25|                   4|                4.03|
| bloomin' like a ...| totally unfilter...|               thick| all-white clumps...|    mellon and mango|   grainy earthiness|
|                4.25|                 4.5|                4.25|                4.25|                4.25|                4.31|
|                4.75|                 4.5|                 4.5|                 4.5|                 4.5|                4.52|
|                 4.5|                 4.5|                 4.5|                4.75|                 4.

In [82]:
non_empty_reviews.filter(F.col("look").cast("int").isNotNull() == False).show(5)

+-------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|beer_id|       username|               date|                text|                look|               smell|               taste|                feel|             overall|               score|
+-------+---------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 125646|GratefulBeerGuy|2017-12-20 00:00:00|"   0% 16 oz can....| bloomin' like a ...| totally unfilter...|               thick| all-white clumps...|    mellon and mango|   grainy earthiness|
| 206623|   rodbeermunch|2016-01-27 00:00:00|"   Dark brown po...| whisps away quic...| possibly the bes...| good irish malt ...| bourbon and oak ...| relatively easy ...| good bourbon del...|
|  96331|       dirtylou|2013-07-09

In [83]:
non_empty_reviews.filter(F.col("look").cast("int").isNotNull() == False).count()

372068

In [84]:
372068/2987993

0.12452104138128837

In [96]:
non_empty_reviews.groupBy('beer_id').count().sort('count', ascending = False).show(10)

+-------+-----+
|beer_id|count|
+-------+-----+
|    645| 4364|
|  11757| 4300|
|   2093| 4252|
|   7971| 4155|
|   1093| 4054|
|    412| 4001|
|  17112| 3905|
|    695| 3786|
|  19960| 3738|
|   1904| 3675|
+-------+-----+
only showing top 10 rows



In [99]:
non_empty_reviews.agg(F.countDistinct("beer_id")).show()

+-----------------------+
|count(DISTINCT beer_id)|
+-----------------------+
|                 210311|
+-----------------------+



In [111]:
beerStyles = beers.select("id","style")

In [112]:
beerStyles.show(5)

+------+--------------------+
|    id|               style|
+------+--------------------+
|202522|English Oatmeal S...|
| 82352|Russian Imperial ...|
|214879|        Scottish Ale|
|320009|American Imperial...|
|246438|  American Cream Ale|
+------+--------------------+
only showing top 5 rows



In [113]:
beerStyles = beerStyles.withColumnRenamed('id', 'beer_id')

In [114]:
beerStyles.show(5)

+-------+--------------------+
|beer_id|               style|
+-------+--------------------+
| 202522|English Oatmeal S...|
|  82352|Russian Imperial ...|
| 214879|        Scottish Ale|
| 320009|American Imperial...|
| 246438|  American Cream Ale|
+-------+--------------------+
only showing top 5 rows



In [119]:
test = non_empty_reviews.join(beerStyles, "beer_id")

In [120]:
test.show(5)

+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+--------------------+
|beer_id|       username|               date|                text|                look|               smell| taste|                feel|          overall|             score|               style|
+-------+---------------+-------------------+--------------------+--------------------+--------------------+------+--------------------+-----------------+------------------+--------------------+
| 271781|   bluejacket74|2017-03-17 00:00:00|   750 ml bottle,...|                   4|                   4|     4|                4.25|                4|              4.03|American Imperial...|
| 125646|GratefulBeerGuy|2017-12-20 00:00:00|"   0% 16 oz can....| bloomin' like a ...| totally unfilter...| thick| all-white clumps...| mellon and mango| grainy earthiness|     New England IPA|
| 125646|       LukeGude|

In [121]:
test.count()

2987925

In [123]:
test.groupBy('style').count().sort('count', ascending = False).show(20)

+--------------------+------+
|               style| count|
+--------------------+------+
|        American IPA|301774|
|American Imperial...|212697|
|American Imperial...|150160|
|American Pale Ale...|126489|
|      Belgian Saison| 91000|
|Russian Imperial ...| 86117|
|     American Porter| 71189|
|   American Wild Ale| 63393|
|American Amber / ...| 62818|
|Fruit and Field Beer| 58342|
|Belgian Strong Da...| 53097|
|     Belgian Witbier| 46545|
|Belgian Strong Pa...| 45732|
|      Belgian Tripel| 45686|
|  American Brown Ale| 44774|
| American Strong Ale| 43575|
|   German Hefeweizen| 42930|
|      American Stout| 41879|
| American Barleywine| 40873|
|American Adjunct ...| 39404|
+--------------------+------+
only showing top 20 rows

