# **Count Vectorizer**

## **Install packages if not yet installed**

In [1]:
import sys

# Install the `cassandra-driver` package by invoking pip as a module of the
# interpreter that runs this kernel (`sys.executable`), so the package is
# installed for that interpreter rather than for whichever `pip` is on PATH.
!{sys.executable} -m pip install cassandra-driver # Cassandra



In [2]:
# Execute the notebook one directory up before any Spark code runs.
# NOTE(review): judging by its name it presumably defines the environment
# variables / credentials used for the Keyspaces connection — confirm.
%run ../EnvironmentVariablesSetup.ipynb

## **Read database from Keyspaces using PySpark**

**1.** Download the required jar files (`spark-cassandra-connector_2.12-3.3.0.jar` and `spark-cassandra-connector-assembly_2.12-3.3.0.jar`).

**2.** Download your `cassandra_truststore.jks` file.

**3.** Create an `application.conf` file.

**4.** Create `SparkSession` and set the configuration to connect to Keyspaces using service-specific credentials.

**5.** Read all rows from the `BasicPreprocessedGFGArticles` table in the `GFGArticles` keyspace into a PySpark DataFrame.

In [3]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
# import pyspark.pandas as ps
# from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
# The two Cassandra connector jars, joined into the single comma-separated
# value that is passed as `spark.jars`.
connector_jars = ",".join([
    "../jar-files/spark-cassandra-connector_2.12-3.3.0.jar",
    "../jar-files/spark-cassandra-connector-assembly_2.12-3.3.0.jar",
])

# Build (or reuse) the session; `../application.conf` is registered under
# `spark.files` and is referred to by its bare file name further down.
spark = (
    SparkSession.builder
    .appName("BasicTextPreprocessing")
    .config("spark.files", "../application.conf")
    .config("spark.jars", connector_jars)
    .getOrCreate()
)

# Connector options set on the live session, in this order: the driver profile
# file, then SSL client authentication, then SSL itself.
for setting, value in (
    ("spark.cassandra.connection.config.profile.path", "application.conf"),
    ("spark.cassandra.connection.ssl.clientAuth.enabled", "true"),
    ("spark.cassandra.connection.ssl.enabled", "true"),
):
    spark.conf.set(setting, value)
# spark.conf.set("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")

# Spark version (displayed as the cell's output)
spark.sparkContext.version

23/09/26 21:12:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


'3.3.0'

In [5]:
# Load the whole `BasicPreprocessedGFGArticles` table of the `GFGArticles`
# keyspace through the Spark Cassandra connector data source.
cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
articles = (
    cassandra_reader
    .options(table="BasicPreprocessedGFGArticles", keyspace="GFGArticles")
    .load()
)

# Preview the first five rows.
articles.show(5)

23/09/26 21:12:09 WARN CassandraConnectionFactory: Ignoring all programmatic configuration, only using configuration from application.conf


[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------------------+
|   ID| PreprocessedContent|
+-----+--------------------+
|17036|b b c c answer ex...|
|16625|give two array ta...|
|  188|c strcat function...|
|  564|world programming...|
|11971|article know appr...|
+-----+--------------------+
only showing top 5 rows



                                                                                

## **Convert the preprocessed content into tokens**

In [6]:
from pyspark.ml.feature import Tokenizer

In [7]:
# Split every article's preprocessed text into a list of tokens.
#
# The input columns are selected explicitly so that this cell can be re-run
# safely: `articles` is reassigned here and in later cells, and
# `Tokenizer.transform` refuses to run on a frame that already contains its
# output column ("Tokens"). Selecting by name also makes the previous
# positional `toDF(...)` rename unnecessary, because the resulting columns are
# exactly ID, PreprocessedContent, Tokens.
tokenizer = Tokenizer(inputCol="PreprocessedContent", outputCol="Tokens")
articles = tokenizer.transform(articles.select("ID", "PreprocessedContent"))
articles.show(5)

+-----+--------------------+--------------------+
|   ID| PreprocessedContent|              Tokens|
+-----+--------------------+--------------------+
|31599|give string conta...|[give, string, co...|
|14638|primary memory li...|[primary, memory,...|
| 5992|database offer nu...|[database, offer,...|
|29258|article learn det...|[article, learn, ...|
| 7998|give two string x...|[give, two, strin...|
+-----+--------------------+--------------------+
only showing top 5 rows



## **CountVectorizer**

In [8]:
from pyspark.ml.feature import CountVectorizer

In [9]:
# Learn a vocabulary of at most 50,000 terms from the token lists and turn
# each article into a vector of term counts.
countVec = CountVectorizer(inputCol="Tokens", outputCol="Counts", vocabSize=50000)

# Work from an explicit projection so the cell stays re-runnable: `articles`
# is reassigned by this and later cells, and a positional `toDF(...)` rename
# fails as soon as the frame carries any extra column.
token_columns = articles.select("ID", "PreprocessedContent", "Tokens")
countVecModel = countVec.fit(token_columns)

# Rename the model's output column by name (not by position) to the
# `CountVector` name used by the rest of the notebook.
articles = (
    countVecModel.transform(token_columns)
    .withColumnRenamed("Counts", "CountVector")
)
articles.show(5)

                                                                                

+-----+--------------------+--------------------+--------------------+
|   ID| PreprocessedContent|              Tokens|         CountVector|
+-----+--------------------+--------------------+--------------------+
| 8772|consider follow p...|[consider, follow...|(50000,[8,13,20,2...|
|11346|amcat amcat aspir...|[amcat, amcat, as...|(50000,[9,14,16,1...|
|23825|online code round...|[online, code, ro...|(50000,[0,1,2,4,6...|
|23790|samsung r institu...|[samsung, r, inst...|(50000,[0,1,2,4,6...|
|13740|man command linux...|[man, command, li...|(50000,[0,1,2,4,5...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Print the DataFrame schema as an indented tree (column names, types and
# nullability) to confirm the columns added so far.
articles.printSchema()

root
 |-- ID: integer (nullable = false)
 |-- PreprocessedContent: string (nullable = true)
 |-- Tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- CountVector: vector (nullable = true)



In [11]:
# Display the same schema as a `StructType` object, i.e. the programmatic
# form listing each `StructField` with its type and nullability.
articles.schema

StructType([StructField('ID', IntegerType(), False), StructField('PreprocessedContent', StringType(), True), StructField('Tokens', ArrayType(StringType(), True), True), StructField('CountVector', VectorUDT(), True)])

## **Transform the `CountVector` into separate columns**

In [12]:
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import StringType, ArrayType, DoubleType, IntegerType
from pyspark.mllib.linalg import VectorUDT, SparseVector

In [13]:
def _stored_indices(countVector):
    """Return the vector's `indices` array as a plain Python list."""
    return countVector.indices.tolist()


def _stored_counts(countVector):
    """Return the vector's `values` array, cast to int32, as a plain Python list."""
    return countVector.values.astype(np.int32).tolist()


# Both derived columns are arrays of integers.
int_array = ArrayType(IntegerType())

# Split `CountVector` into two parallel array columns: the positions of its
# stored entries and the counts at those positions. The vector's overall size
# is not materialised as a column.
articles = (
    articles
    .withColumn("FeaturesIndices", udf(_stored_indices, int_array)(col("CountVector")))
    .withColumn("FeaturesValues", udf(_stored_counts, int_array)(col("CountVector")))
)
articles.show(5)

[Stage 7:>                                                          (0 + 1) / 1]

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|   ID| PreprocessedContent|              Tokens|         CountVector|     FeaturesIndices|      FeaturesValues|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|23619|give singly link ...|[give, singly, li...|(50000,[0,1,2,3,4...|[0, 1, 2, 3, 4, 8...|[1, 1, 2, 1, 1, 2...|
|32158|recently citrix w...|[recently, citrix...|(50000,[0,2,3,4,6...|[0, 2, 3, 4, 6, 9...|[4, 1, 1, 1, 1, 2...|
|28744|company name epik...|[company, name, e...|(50000,[0,2,4,11,...|[0, 2, 4, 11, 14,...|[1, 3, 2, 1, 1, 3...|
|21534|give two string s...|[give, two, strin...|(50000,[0,1,2,3,4...|[0, 1, 2, 3, 4, 6...|[6, 1, 5, 6, 5, 1...|
|23907|hexadecimal equiv...|[hexadecimal, equ...|(50000,[0,6,9,11,...|[0, 6, 9, 11, 22,...|[2, 3, 2, 1, 1, 2...|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+

                                                                                

## **Write to a new table in Keyspaces**

In [14]:
# Count the articles: project down to the ID column, keep the rows whose ID
# is non-negative, and count what remains.
non_negative_ids = articles.select("ID").where(articles["ID"] >= 0)
NO_OF_ARTICLES = non_negative_ids.count()

# Show the count as the cell's output.
NO_OF_ARTICLES

34550

In [15]:
# Width of the ID range written per save() call in the write loop below:
# each iteration writes the rows with start <= ID < start + BATCH_SIZE.
BATCH_SIZE=1024

In [17]:
# Write ID, FeaturesIndices and FeaturesValues to the `CountVectorGFGArticles`
# table in ID-range batches of BATCH_SIZE.
#
# Two things differ from a naive loop over `range(0, row_count + 1, ...)`:
#   * The upper bound is the largest ID actually present, not the number of
#     rows. The batches are ID ranges, so a row count only works when IDs are
#     exactly 0..count-1; with any gap in the IDs the highest-numbered
#     articles would be skipped without an error.
#   * The projected frame is cached. Every save() is a separate Spark action,
#     so without caching the whole lineage (Cassandra read, tokenising, count
#     vectorising, Python UDFs) would be recomputed for each batch.
# As before, batches start at ID 0, so rows with a negative ID are not written.
vectors_to_save = articles.select("ID", "FeaturesIndices", "FeaturesValues").cache()
try:
    # `max` over an empty frame yields None; in that case there is nothing to write.
    max_id = vectors_to_save.agg({"ID": "max"}).first()[0]
    if max_id is not None:
        for start in range(0, max_id + 1, BATCH_SIZE):
            stop = start + BATCH_SIZE  # exclusive upper bound of this batch
            (
                vectors_to_save
                .filter((vectors_to_save["ID"] >= start) & (vectors_to_save["ID"] < stop))
                .write.format("org.apache.spark.sql.cassandra")
                .options(table="CountVectorGFGArticles", keyspace="GFGArticles")
                .mode("APPEND")
                .save()
            )
            print(f"Saved articles with IDs between {start} and {stop-1}.")
finally:
    # Release the cached blocks whether or not every batch was written.
    vectors_to_save.unpersist()

                                                                                

Saved articles with IDs between 0 and 1023.


                                                                                

Saved articles with IDs between 1024 and 2047.


                                                                                

Saved articles with IDs between 2048 and 3071.


                                                                                

Saved articles with IDs between 3072 and 4095.


                                                                                

Saved articles with IDs between 4096 and 5119.


                                                                                

Saved articles with IDs between 5120 and 6143.


                                                                                

Saved articles with IDs between 6144 and 7167.


                                                                                

Saved articles with IDs between 7168 and 8191.


                                                                                

Saved articles with IDs between 8192 and 9215.


                                                                                

Saved articles with IDs between 9216 and 10239.


                                                                                

Saved articles with IDs between 10240 and 11263.


                                                                                

Saved articles with IDs between 11264 and 12287.


                                                                                

Saved articles with IDs between 12288 and 13311.


                                                                                

Saved articles with IDs between 13312 and 14335.


                                                                                

Saved articles with IDs between 14336 and 15359.


                                                                                

Saved articles with IDs between 15360 and 16383.


                                                                                

Saved articles with IDs between 16384 and 17407.


                                                                                

Saved articles with IDs between 17408 and 18431.


                                                                                

Saved articles with IDs between 18432 and 19455.


                                                                                

Saved articles with IDs between 19456 and 20479.




Saved articles with IDs between 20480 and 21503.


                                                                                

Saved articles with IDs between 21504 and 22527.


                                                                                

Saved articles with IDs between 22528 and 23551.


                                                                                

Saved articles with IDs between 23552 and 24575.


                                                                                

Saved articles with IDs between 24576 and 25599.


                                                                                

Saved articles with IDs between 25600 and 26623.


                                                                                

Saved articles with IDs between 26624 and 27647.


                                                                                

Saved articles with IDs between 27648 and 28671.


                                                                                

Saved articles with IDs between 28672 and 29695.


                                                                                

Saved articles with IDs between 29696 and 30719.


                                                                                

Saved articles with IDs between 30720 and 31743.


                                                                                

Saved articles with IDs between 31744 and 32767.


                                                                                

Saved articles with IDs between 32768 and 33791.




Saved articles with IDs between 33792 and 34815.


                                                                                