In [2]:
# Install Spark

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
# Point the JVM and Spark environment variables at the installed locations

import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.0-bin-hadoop3.2",
    }
)

In [4]:
# Installing PySpark

!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/8e/b0/bf9020b56492281b9c9d8aae8f44ff51e1bc91b3ef5a884385cb4e389a40/pyspark-3.0.0.tar.gz (204.7MB)
[K     |████████████████████████████████| 204.7MB 73kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 46.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044184 sha256=27c8feea67239df7c18bbc446b7377748c38354b017daa8e0ba00e8c6b8be403
  Stored in directory: /root/.cache/pip/wheels/57/27/4d/ddacf7143f8d5b76c45c61ee2e43d9f8492fc5a8e78ebd7d37
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.0


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list,udf,lit,explode,split,col,lower,trim,regexp_replace, substring, concat_ws, concat

In [6]:
# Build (or reuse) a Hive-enabled Spark session that runs in local mode

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ICP2_Dhairya_Chandra")
    .enableHiveSupport()
    .getOrCreate()
)

In [7]:
# Show Google Colab's upload button to pick a file from the local computer.
# The upload call is the cell's last expression, so its return value (the
# uploaded file name mapped to the raw file bytes) is echoed in the output.

from google.colab import files
files.upload()

Saving icp2.txt to icp2.txt


{'icp2.txt': b'The University of South Carolina reports that more than 1,000 students currently have the virus.\r\nThe C.D.C. tells health officials to be ready to distribute a vaccine by November, raising concerns over politicized timing.\r\nIn Iowa, college students staged a sickout, and a football opener won\xe2\x80\x99t have fans after all.\r\nVirus fallout from the Sturgis motorcycle rally: A death in Minnesota, cases in South Dakota and more.\r\nNew studies show inexpensive steroid drugs can help critically sick people survive Covid-19.\r\nSilvio Berlusconi, Italy\xe2\x80\x99s former prime minister, tests positive.\r\nA judge orders the University of California to stop considering SAT or ACT scores because of the pandemic.'}

In [8]:
# Load the uploaded text file into a DataFrame (single string column "value")

data = spark.read.format("text").load("icp2.txt")

In [9]:
# Preview the loaded lines; show() truncates long strings, hence the "..."
data.show()

+--------------------+
|               value|
+--------------------+
|The University of...|
|The C.D.C. tells ...|
|In Iowa, college ...|
|Virus fallout fro...|
|New studies show ...|
|Silvio Berlusconi...|
|A judge orders th...|
+--------------------+



In [10]:
# Converting each line of text into words (one word per row, column "value").
#
# Steps: lower-case the line, strip punctuation, trim, split on whitespace,
# explode into rows, then drop empty tokens.
# - Raw strings keep the regex escapes intact ("\$" and "\s" are invalid
#   escape sequences in ordinary Python string literals).
# - \p{Punct} removes all ASCII punctuation, so tokens such as "rally:" no
#   longer keep a trailing colon; the curly apostrophe (’) is not ASCII and is
#   listed explicitly.
# - Tabs are no longer deleted (which glued neighbouring words together);
#   they are treated as separators by the whitespace split instead.
# - trim() runs after the punctuation is removed, and empty tokens are
#   filtered out, so a blank line or a line that begins with punctuation
#   cannot produce an empty "word".

data_words = (
    data
    .select(
        explode(
            split(
                trim(regexp_replace(lower(col("value")), r"[\p{Punct}’]", "")),
                r"\s+",
            )
        ).alias("value")
    )
    .filter(col("value") != "")
)

In [11]:
# Preview the extracted words (show() prints only the first 20 rows)
data_words.show()

+----------+
|     value|
+----------+
|       the|
|university|
|        of|
|     south|
|  carolina|
|   reports|
|      that|
|      more|
|      than|
|      1000|
|  students|
| currently|
|      have|
|       the|
|     virus|
|       the|
|       cdc|
|     tells|
|    health|
| officials|
+----------+
only showing top 20 rows



In [12]:
# Pair every word with its first letter, held in a separate column

first_char = data_words.select(
    substring("value", 1, 1).alias("first_character"),
    col("value").alias("word"),
)

In [13]:
# Preview the (first_character, word) pairs (first 20 rows only)
first_char.show()

+---------------+----------+
|first_character|      word|
+---------------+----------+
|              t|       the|
|              u|university|
|              o|        of|
|              s|     south|
|              c|  carolina|
|              r|   reports|
|              t|      that|
|              m|      more|
|              t|      than|
|              1|      1000|
|              s|  students|
|              c| currently|
|              h|      have|
|              t|       the|
|              v|     virus|
|              t|       the|
|              c|       cdc|
|              t|     tells|
|              h|    health|
|              o| officials|
+---------------+----------+
only showing top 20 rows



In [14]:
# Show every word in the document that starts with the letter T
# (the words were lower-cased when they were split, so the match is on "t")

first_char.where(first_char["first_character"] == "t").show()


+---------------+------+
|first_character|  word|
+---------------+------+
|              t|   the|
|              t|  that|
|              t|  than|
|              t|   the|
|              t|   the|
|              t| tells|
|              t|    to|
|              t|    to|
|              t|timing|
|              t|   the|
|              t| tests|
|              t|   the|
|              t|    to|
|              t|   the|
+---------------+------+



In [15]:
# Group the words by their common first letter; each group's words are
# collected into one comma-separated string

words_group = (
    first_char
    .groupBy("first_character")
    .agg(concat_ws(",", collect_list("word")).alias("grouped_words"))
)

In [16]:
# Preview the grouped words; long lists are truncated in the display
words_group.show()

+---------------+--------------------+
|first_character|       grouped_words|
+---------------+--------------------+
|              m|more,motorcycle,m...|
|              f|football,fans,fal...|
|              n|        november,new|
|              v| virus,vaccine,virus|
|              o|of,officials,over...|
|              h|have,health,have,...|
|              p|politicized,peopl...|
|              d|distribute,death,...|
|              w|                wont|
|              c|carolina,currentl...|
|              u|university,univer...|
|              i|in,iowa,in,in,ine...|
|              1|                1000|
|              j|               judge|
|              b|be,by,berlusconi,...|
|              r|reports,ready,rai...|
|              a|a,a,and,a,after,a...|
|              t|the,that,than,the...|
|              s|south,students,st...|
+---------------+--------------------+



In [17]:
# Sort by first letter, then merge the letter and its word list into a
# single "Result" column formatted as "<letter>, <word>,<word>,..."

results = (
    words_group
    .orderBy("first_character")
    .select(
        concat(col("first_character"), lit(", "), col("grouped_words")).alias("Result")
    )
)

In [18]:
# Preview the combined result rows; long strings are truncated in the display
results.show()

+--------------------+
|              Result|
+--------------------+
|             1, 1000|
|a, a,a,and,a,afte...|
|b, be,by,berlusco...|
|c, carolina,curre...|
|d, distribute,dea...|
|f, football,fans,...|
|h, have,health,ha...|
|i, in,iowa,in,in,...|
|            j, judge|
|m, more,motorcycl...|
|     n, november,new|
|o, of,officials,o...|
|p, politicized,pe...|
|r, reports,ready,...|
|s, south,students...|
|t, the,that,than,...|
|u, university,uni...|
|v, virus,vaccine,...|
|             w, wont|
+--------------------+



In [19]:
# Saving the results as text.
#
# Spark writes a directory named "output.txt" containing the part file(s);
# coalesce(1) reduces the result to one partition so a single part file is
# produced.
# mode("overwrite") keeps this cell idempotent: with "append", every re-run
# added another part file to the directory and duplicated the results.
# The "header" option was dropped because it is a CSV option and has no
# effect on the text format.

results.coalesce(1).write.format("text").mode("overwrite").save("output.txt")