Mounted on Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

这些文件名是博客文章的标识符，其格式通常为ID.gender.age.occupation.zodiac.xml。

ID：文章的唯一标识符。

gender：作者的性别。

age：作者的年龄。

occupation：作者的职业或行业。

zodiac：作者的星座。


文件名中的星座可以是以下之一：

Aries（白羊座）
Taurus（金牛座）
Gemini（双子座）
Cancer（巨蟹座）
Leo（狮子座）
Virgo（处女座）
Libra（天秤座）
Scorpio（天蝎座）
Sagittarius（射手座）
Capricorn（摩羯座）
Aquarius（水瓶座）
Pisces（双鱼座）


从一系列博客文章中提取最流行的主题可以通过以下步骤和技术实现：

1. **数据预处理**：首先对博客文章进行数据预处理，包括文本清洗、分词、去停用词等操作，以准备好数据进行后续的分析。

2. **词袋模型（Bag of Words）**：将每篇文章表示为词袋模型，即将文章中的词语转换为向量表示。这可以通过词频向量或TF-IDF（词频-逆文档频率）向量来实现。

3. **主题建模**：使用主题建模技术，如Latent Dirichlet Allocation (LDA)，来发现博客文章中的潜在主题。LDA模型可以将每篇文章表示为主题的分布，并发现文本中的隐藏主题。

4. **主题选择**：根据模型输出的主题分布，选择出现频率最高的主题作为最流行的主题。可以使用一些阈值或算法来确定哪些主题是最流行的。

5. **可视化分析**：对提取的主题进行可视化分析，例如绘制主题分布图、词云图等，以便更直观地理解和展示主题的特征和关联。

6. **评估和验证**：最后，对提取的主题进行评估和验证，可以通过人工检查和领域专家的参与来验证提取的主题是否准确和可靠。

通过以上步骤，可以从一系列博客文章中提取出最流行的主题，并对主题进行分析和解释，从而帮助理解文本数据中的重要信息和趋势。



# Prepare the Data

1.Read the zip file contents

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import os
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
import re
import os
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor


directory = "/content/drive/MyDrive/Colab Notebooks/TextMining/report/"
files_dict = {}
import zipfile
output = ''
with zipfile.ZipFile(directory+'Assignment2BlogData.zip', "r") as z:
  #literate the file content by filename
  for filename in z.namelist(): #namelist() get filename list
    with z.open(filename) as f:
      if not filename.endswith('.xml'):
          continue

      xml_content_bytes = f.read()
      xml_content = ""
      try:
          xml_content = xml_content_bytes.decode('utf-8')
      except UnicodeDecodeError:
          xml_content = xml_content_bytes.decode('latin-1')

      #extract the content from the tag
      posts = re.findall(r'<post>(.*?)</post>', xml_content, re.DOTALL)
      #combine the content to a string, and join by '.'
      combined_content = '. '.join(posts)
      #dict:{filename, file content}
      files_dict[filename] = combined_content
      #print(filename)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/TextMining/report/Assignment2BlogData.zip'

2. Demographics: Classified into the following categories through filename

*   Males
*   Females
*   Age brackets <=20
*   Age brackets >20
*   Students
*   Everyone

filename format: ID.gender.age.occupation.zodiac.xml

In [1]:
#Defined categories condition, and returned boolean value. Check whether the filename belong to this category
categories = {
    'Males': lambda filename: filename.split('.')[1].lower() == 'male',
    'Females': lambda filename: filename.split('.')[1].lower() == 'female',
    'Age <= 20': lambda filename: int(filename.split('.')[2]) <= 20,
    'Age > 20': lambda filename: int(filename.split('.')[2]) > 20,
    'Students': lambda filename: filename.split('.')[3].lower() == 'student',
    'Everyone': lambda filename: True
}


def get_file_by_categorize(category_name, files_dict):
    documents_classified = []
    category_condition = categories[category_name]
    for file_path, content in files_dict.items():
      fName = os.path.basename(file_path) #filename
      #filename belong to which category
      if category_condition(fName):
          documents_classified.append(content)
    return documents_classified

# Count each category
for category_name in categories:
    documents = get_file_by_categorize(category_name, files_dict)
    print(f'{category_name}: {len(documents)} posts')



NameError: name 'files_dict' is not defined

# Data Processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from spacy.lang.en.stop_words  import STOP_WORDS as SPACY_STROP_WORDS
from sklearn.feature_extraction import text as sklearn_text


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

#stopwords：nltk.corpus + define stop words + sklearn.stopwords + spacy.stopwords
my_stop_words=None
my_stop_words = stopwords.words('english')
newStopWords = ['it', 'its', 'when',',',':',';'] # add your own stop words to the list here.
my_stop_words = my_stop_words + newStopWords
my_stop_words = sklearn_text.ENGLISH_STOP_WORDS.union(my_stop_words) # append stop_words
my_stop_words = SPACY_STROP_WORDS.union(my_stop_words)  #append stop_words


def remove_stop_words(document):
  global my_stop_words
  #tokenize
  #word_tokens = word_tokenize(document)
  word_tokens = word_tokenize(document)

  filtered_sentence = [w for w in word_tokens if not w.lower() in my_stop_words]
  return ' '.join(filtered_sentence)



def clean_and_stem(document):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove URLs
    document = re.sub(r'https?://\S+|www\.\S+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Tokenize and remove stop words
    #document = remove_stop_words(document)
    document = word_tokenize(document)
    document = [w for w in document if not w.lower() in my_stop_words]


    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    document = [lemmatizer.lemmatize(word) for word in document]
    document = ' '.join(document)

    return document



# Use ThreadPoolExecutor to parallelize processing
with ThreadPoolExecutor() as executor:
    # Process each record in files_dict in parallel
    for file_path, content in list(files_dict.items())[:100]:
        # Define the cleaning and stemming task inline
        cleaned_content = clean_and_stem(content)
        # Update files_dict with the cleaned content
        files_dict[file_path] = cleaned_content
        #print("-" * 50)  # 分隔线
        #print(cleaned_content)




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# try a category
preprocessed_corpus = get_file_by_categorize(5,files_dict)
# Strategy 2: TF-IDF Vectorization to identify dominant topics
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_corpus)

# Apply NMF for topic extraction
num_topics = 2  # Number of topics to extract
nmf_model = NMF(n_components=num_topics)
nmf_model.fit(tfidf_matrix)

# Extract top topics
top_topics = []
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    top_feature_idxs = topic.argsort()[:-10-1:-1]  # Top 10 features for each topic
    top_features = [feature_names[i] for i in top_feature_idxs]
    top_topics.append(top_features)

# Output clauses containing the dominant topics
for topic_idx, topic in enumerate(top_topics):
    print(f"Top words for Topic {topic_idx + 1}: {topic}")
    # Extract and output clauses containing these words
    for document in preprocessed_corpus:
        if any(word in document for word in topic):
            # Output the clause
            print(f"Document: {document}")


Top words for Topic 1: ['like', 'one', 'well', 'really', 'life', 'go', 'thing', 'get', 'lol', 'got']
Document: impressed even animal prefer local spirit mass market one take story bear guzzled 36 beer preferred local brew especially enjoyed reading caught beer doughnut go show many different way show pride quote surf band 80 local http www cnn com 2004 u west 08 18 bear beer reut index html like tire easily crap nbc want see olympics channel cable would think could see sport past nbc refused air fencing one sport really interest ala get snippet get see men sabre final new wireless technology american team show story build following sport one original olympic sport even blunder 1984 first american ever win medal fencing shrugged countless signature petition signed begging nbc show fencing still refuse justice want show thing year afer year year imagine disney contract abc espn channel would result one could wonder link find happened earlier http news bbc co uk sport2 hi olympics_2004 fe

In [None]:
#先拿一个类别做实验
documents = get_file_by_categorize(5,files_dict)

# 文本向量化
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(documents)

# 定义LDA模型并拟合数据
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X)

# 提取主题
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

# 打印每个主题的前十个关键词
print("\nTopics in LDA model:")
tf_feature_names = vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, 10)



Topics in LDA model:
Topic #0:
like thing time know good really people life day make

Topic #1:
like urllink went day today den lol got time haiz



In [None]:
import re

# 定义文本
text = """
<post>This is the first post.</post>
<post>This is the second post.</post>
<post>This is the third post.</post>
"""

# 使用正则表达式匹配 <post> 和 </post> 之间的文本
posts = re.findall(r'<post>(.*?)</post>', text, re.DOTALL)

# 用空格连接匹配到的文本
result = ' '.join(posts)

print(result)


This is the first post. This is the second post. This is the third post.


In [None]:
#unzip
#!unzip "/content/drive/MyDrive/Colab Notebooks/TextMining/report/Assignment2BlogData.zip" -d "/content/drive/MyDrive/Colab Notebooks/TextMining/report/"