**Installing Java, Spark and Findspark**

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

**Setting the location where Spark and Java are installed**

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

**Starting a local Spark Session**

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
from pyspark import SparkConf
from pyspark.context import SparkContext

In [0]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.ml.clustering import KMeans

In [0]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
sqlContext = SQLContext(sc)

**Importing the text files**

In [40]:
from google.colab import files

data_files = files.upload()

Saving reut2-000.sgm to reut2-000.sgm
Saving reut2-001.sgm to reut2-001.sgm
Saving reut2-002.sgm to reut2-002.sgm
Saving reut2-003.sgm to reut2-003.sgm
Saving reut2-004.sgm to reut2-004.sgm
Saving reut2-005.sgm to reut2-005.sgm
Saving reut2-006.sgm to reut2-006.sgm
Saving reut2-007.sgm to reut2-007.sgm
Saving reut2-008.sgm to reut2-008.sgm
Saving reut2-009.sgm to reut2-009.sgm
Saving reut2-010.sgm to reut2-010.sgm
Saving reut2-011.sgm to reut2-011.sgm
Saving reut2-012.sgm to reut2-012.sgm
Saving reut2-013.sgm to reut2-013.sgm
Saving reut2-014.sgm to reut2-014.sgm
Saving reut2-015.sgm to reut2-015.sgm
Saving reut2-016.sgm to reut2-016.sgm
Saving reut2-017.sgm to reut2-017.sgm
Saving reut2-018.sgm to reut2-018.sgm
Saving reut2-019.sgm to reut2-019.sgm
Saving reut2-020.sgm to reut2-020.sgm
Saving reut2-021.sgm to reut2-021.sgm


In [0]:
data_files = sc.wholeTextFiles("*.sgm")

**Converting the 22 files into lowercase**

In [0]:
def lc(input):
    output = []
    for x in input:
        output.append(x.lower())
    return output

In [0]:
data_lc = data_files.map(lambda x: lc(x))

**Transforming the lowercased files to TF-IDF**

In [0]:
lc_df = sqlContext.createDataFrame(data_lc,["fileName","data"])
lc_df_2 = (lc_df.rdd.map(lambda x : (x.fileName,x.data.split(" "))).toDF().withColumnRenamed("_1","fileName").withColumnRenamed("_2","text"))

In [0]:
hashing = MLHashingTF(inputCol="text", outputCol="tf",numFeatures=9500)
lc_tf = hashing.transform(lc_df_2)

In [0]:
tf_idf = MLIDF(inputCol="tf", outputCol="features")
lc_tf_idf = tf_idf.fit(lc_tf).transform(lc_tf)

**Clustering the files into 4 groups using K-Means Clustering**

In [0]:
K_means = KMeans(k=4, seed=123)
k_model_fit = K_means.fit(lc_tf_idf.select("features"))

In [101]:
transform_matrix = k_model_fit.transform(lc_tf_idf).select("fileName", "prediction")
transform_matrix.show()

+--------------------+----------+
|            fileName|prediction|
+--------------------+----------+
|file:/content/reu...|         3|
|file:/content/reu...|         1|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         0|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
|file:/content/reu...|         3|
+--------------------+----------+
only showing top 20 rows



In [0]:
k_means_cluster = transform_matrix.rdd
final_clusters = k_means_cluster.map( lambda x:  [x[0].rsplit('/', 1)[-1], x[1]])

**Exporting the output in the form of "Filename, Clusters"**

In [0]:
df = sqlContext.createDataFrame(final_clusters, ['filename', 'prediction'])

In [0]:
df.coalesce(1).write.format('com.databricks.spark.csv').options(header='true').save('hw4_cluster/')