<a href="https://colab.research.google.com/github/dhanalakshmi3149/diabatics/blob/main/Classification_through_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark




In [2]:
from sklearn.datasets import fetch_20newsgroups
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import pandas as pd

# Start Spark Session
spark = SparkSession.builder.appName("DocumentClassification").getOrCreate()

# Fetch 20 Newsgroups Data
newsgroups = fetch_20newsgroups(subset='all')

# Convert the dataset to a DataFrame for PySpark processing
data = pd.DataFrame({'text': newsgroups.data, 'category': newsgroups.target})
df = spark.createDataFrame(data)

# Show some details about the data
print(f"Total number of documents: {len(newsgroups.data)}")
print(f"Categories: {newsgroups.target_names}")
print(f"Number of categories: {len(newsgroups.target_names)}")

# Display the distribution of categories
category_counts = df.groupBy('category').count().toPandas()
print("Category distribution before filtering (25%):")
print(category_counts)

# Filter 25% of documents from each category
df_sampled = df.sample(withReplacement=False, fraction=0.25, seed=42)

# Show the number of documents after sampling
total_documents_after_sampling = df_sampled.count()
print(f"Total number of documents after sampling: {total_documents_after_sampling}")

Total number of documents: 18846
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of categories: 20
Category distribution before filtering (25%):
    category  count
0         19    628
1          0    799
2          7    990
3          6    975
4          9    994
5         17    940
6          5    988
7          1    973
8         10    999
9          3    982
10        12    984
11         8    996
12        11    991
13         2    985
14         4    963
15        13    990
16        18    775
17        14    987
18        15    997
19        16    910
Total number of documents after sampling: 4773


In [3]:
# Prepare for Document Classification

# Step 1: Tokenize the text
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Step 2: Apply HashingTF
hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=1000)

# Step 3: Compute IDF (Inverse Document Frequency)
idf = IDF(inputCol="raw_features", outputCol="features")

# Step 4: Convert category labels to numerical labels
indexer = StringIndexer(inputCol="category", outputCol="label")

# Step 5: Define the classifier (Logistic Regression in this case)
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Set up the pipeline with all the stages
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, indexer, lr])

# Split the data into training and testing sets (80% train, 20% test)
train_data, test_data = df_sampled.randomSplit([0.8, 0.2], seed=42)

# Step 6: Train the model using the pipeline
model = pipeline.fit(train_data)

# Step 7: Make predictions on the test data
predictions = model.transform(test_data)

# Show some of the predictions
predictions.select("text", "category", "prediction").show(5, truncate=False)

# Step 8: Evaluate the model's accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Display the accuracy
print(f"Model Accuracy: {accuracy:.2f}")

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
import numpy as np
# Apply the pipeline to the sampled data (df_sampled) to get the 'features' column (TF-IDF vectors)
processed_data = model.transform(df_sampled)

# Extract the "features" column as an RDD
tdm_rdd = processed_data.select("features").rdd.map(lambda row: row[0])

# Convert the RDD of vectors into a numpy array
tdm_array = np.array(tdm_rdd.collect())

# Convert the numpy array into a DataFrame (this is our Term-Document Matrix)
tdm_df = pd.DataFrame(tdm_array)

# Show the Term-Document Matrix
print("Term-Document Matrix (TDM):")
print(tdm_df)

# Optional: Display the first few rows of the TDM
print(tdm_df.head())

Term-Document Matrix (TDM):
           0         1        2    3         4    5        6         7    \
0     0.000000  0.000000  1.90774  0.0  0.000000  0.0  0.00000  0.000000   
1     0.000000  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
2     0.000000  0.000000  0.00000  0.0  2.223593  0.0  2.70702  0.000000   
3     1.983452  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
4     0.000000  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
...        ...       ...      ...  ...       ...  ...      ...       ...   
4768  0.000000  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
4769  1.983452  0.000000  0.00000  0.0  2.223593  0.0  0.00000  0.000000   
4770  0.000000  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
4771  0.000000  0.000000  0.00000  0.0  0.000000  0.0  0.00000  0.000000   
4772  0.000000  2.504946  0.00000  0.0  0.000000  0.0  0.00000  1.840601   

           8         9    ...       990  991       992  993