In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Import the required libraries
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Fetch the NLTK resources this notebook relies on: the English stopword
# list, the tokenizer data and the WordNet corpus used by the lemmatizer.
nltk.download('stopwords')
# nltk.download('punkt')  # left disabled; 'punkt_tab' is downloaded instead
# NOTE(review): presumably 'punkt_tab' is what sent_tokenize/word_tokenize
# load in the installed NLTK version - confirm.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diept\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\diept\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diept\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocess function

In [18]:
 #Read data
# The path is resolved relative to the notebook's working directory.
test_json_path = os.path.join("..", "merged", "test.json")
data = pd.read_json(test_json_path)
data.head()


Unnamed: 0,text
0,As a consequence of the global COVID-19 pandem...
1,According to current live statistics at the ti...


In [19]:
def preprocess(data: pd.DataFrame, text_column: str = 'text',
               max_df: float = 0.9, min_df: float = 0.2) -> pd.DataFrame:
  """Clean the text column of ``data`` and vectorize it with TF-IDF.

  Each document is split into sentences; every sentence is lower-cased,
  stripped of all characters that are neither ASCII letters nor whitespace,
  tokenized, filtered against the English stopword list and lemmatized.
  The cleaned documents are then converted into a TF-IDF matrix.

  Args:
    data: input documents; must contain the column named by ``text_column``.
    text_column: name of the column that holds the raw text.
    max_df: upper document-frequency threshold, passed to TfidfVectorizer
      to filter out overly common terms.
    min_df: lower document-frequency threshold, passed to TfidfVectorizer
      to filter out rare terms.

  Returns:
    A new pandas.DataFrame with one row per input document (default integer
    index) and one column per retained vocabulary term, holding the TF-IDF
    weights. ``data`` itself is not modified.
  """

  # Prepare the stopword set and the lemmatizer once, not once per document
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()

  def clean_text(text):
    """Return the cleaned, lemmatized tokens of one document joined by spaces."""
    # Missing or non-string cells (e.g. NaN) are treated as empty documents
    # instead of crashing on .lower()
    if not isinstance(text, str):
      return ''

    tokens = []
    # Sentence segmentation has to see the raw text: it depends on the
    # punctuation that the normalization step below removes
    for sentence in sent_tokenize(text):
      # Text cleaning, normalization
      sentence = re.sub(r'[^a-zA-Z\s]', '', sentence.lower())
      # Tokenization, stopword removal, lemmatization. The tokens of every
      # sentence are appended to the shared list, so no sentence is lost
      # and none is counted twice
      tokens.extend(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(sentence)
        if token not in stop_words
      )

    return ' '.join(tokens)

  # .apply returns a new Series, so the caller's frame is left untouched
  cleaned_text = data[text_column].apply(clean_text)

  # Rare words, common words filtering
  vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)

  # Vectorization using TF-IDF
  tfidf_data = vectorizer.fit_transform(cleaned_text)

  # Convert the sparse matrix to a dense DataFrame with one column per term
  return pd.DataFrame(tfidf_data.toarray(),
                      columns=vectorizer.get_feature_names_out())



In [20]:
# Vectorize the loaded documents and preview the first rows of the result
processed_data = preprocess(data=data)
processed_data.head(n=5)

Unnamed: 0,ability,able,academy,acceleration,access,accordance,according,account,accuracy,accurate,...,widely,window,work,working,world,worldwide,would,writing,wtft,wtfts
0,0.058747,0.029374,0.029374,0.029374,0.058747,0.029374,0.0,0.0,0.117495,0.029374,...,0.029374,0.029374,0.0,0.0,0.0,0.0,0.117495,0.0,0.234989,0.029374
1,0.0,0.0,0.0,0.0,0.0,0.0,0.041885,0.041885,0.0,0.0,...,0.0,0.0,0.041885,0.041885,0.041885,0.083771,0.0,0.041885,0.0,0.0


In [21]:
# Write the vectorized data to a CSV file one directory above the working
# directory, without the index column
cleaned_csv_path = os.path.join('..', 'cleaned_data.csv')
processed_data.to_csv(cleaned_csv_path, index=False)

# Training LDA

In [22]:
!pip install pyspark



In [23]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import col, udf 
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import VectorAssembler

In [24]:
# Create the Spark session for the LDA run, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("LDA Topic Modeling")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [25]:
# Convert the pandas frame produced by preprocessing into a Spark DataFrame
spark_data = spark.createDataFrame(processed_data)

# Pack every column into a single vector column named 'features' and keep
# only that column
feature_columns = spark_data.columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
assembled = assembler.transform(spark_data)
processed_data_vector = assembled.select('features')


In [26]:
# Number of topics and maximum number of iterations for LDA.
# The spelling of `max_interation` is kept as-is because the training cell
# refers to it by this name.
num_topics = 10
max_interation = 100

In [27]:
# Train LDA
# A fixed seed makes the fitted topics reproducible across runs.
# NOTE(review): the features here are TF-IDF weights, while Spark's LDA
# documents its input vectors as per-term counts - confirm this is intended.
lda = LDA(k=num_topics, maxIter=max_interation, featuresCol='features', seed=42)
lda_model = lda.fit(processed_data_vector)
# Show the 10 highest-weighted terms of every topic (as feature indices)
topics = lda_model.describeTopics(maxTermsPerTopic=10)
topics.show(truncate=False)

+-----+--------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|termIndices                                       |termWeights                                                                                                                                                                                                                         |
+-----+--------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[46, 372, 146, 7, 508, 540, 437, 298, 234, 74]    |[0.0024123256641410368, 0.00234223749490057, 0.002296095174315453, 0.002267717