In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from google.oauth2 import service_account
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
folder_path = '/content/drive/MyDrive/Data Mining Project/blogs'

pattern = re.compile(r'(\d+)\.(\w+)\.(\d+)\.(\w+)\.(\w+)')

data = []
unicode_errors_list = []
parsing_errors_list = []

def replace_ampersand(text):
    # Replace & with &amp; if it's not already in the &amp; format
    return re.sub(r'&(?!amp;)', '&amp;', text)

file_list = [file for file in os.listdir(folder_path) if re.match(pattern, file)]

for file_name in tqdm(file_list, desc="Processing XML files"):

  match = re.match(pattern, file_name)

  try:
    number, gender, age, genre, zodiac = match.groups()
    xml_file_path = os.path.join(folder_path, file_name)
    with open(xml_file_path, 'r', encoding='utf-8') as file:
      xml_content = file.read()

    # Replace & with &amp; where necessary
    modified_xml_content = replace_ampersand(xml_content)
    tree = ET.ElementTree(ET.fromstring(modified_xml_content))
    root = tree.getroot()

    dates = [date.text for date in root.findall(".//date")]
    posts = [post.text for post in root.findall(".//post")]


    for date, post in zip(dates, posts):
        data.append({
          "Number": number,
          "Gender": gender,
          "Age": age,
          "Post Genre": genre,
          "Zodiac Sign": zodiac,
          "Date": date,
          "Post": post
        })

  except ET.ParseError:
      parsing_errors_list.append(xml_file_path)
      continue
  except UnicodeDecodeError as decode_error:
      unicode_errors_list.append(xml_file_path)
      continue

Processing XML files: 100%|██████████| 18266/18266 [08:52<00:00, 34.32it/s] 


In [4]:
len(parsing_errors_list)

300

In [5]:
len(unicode_errors_list)

563

In [None]:
folder_path = '/content/drive/MyDrive/Data Mining Project/blogs'

pattern = re.compile(r'(\d+)\.(\w+)\.(\d+)\.(\w+)\.(\w+)')

data = []


def replace_ampersand(text):
    # Replace & with &amp; if it's not already in the &amp; format
    return re.sub(r'&(?!amp;)', '&amp;', text)

file_list = [file for file in os.listdir(folder_path) if re.match(pattern, file)]

for file_name in tqdm(file_list, desc="Processing XML files"):

  match = re.match(pattern, file_name)

  try:
    number, gender, age, genre, zodiac = match.groups()
    xml_file_path = os.path.join(folder_path, file_name)
    with open(xml_file_path, 'r', encoding='utf-32') as file:
         xml_content = file.read()

    # Replace & with &amp; where necessary
    modified_xml_content = replace_ampersand(xml_content)

    tree = ET.ElementTree(ET.fromstring(modified_xml_content))
    root = tree.getroot()

    dates = [date.text for date in root.findall(".//date")]
    posts = [post.text for post in root.findall(".//post")]


    for date, post in zip(dates, posts):
        data.append({
          "Number": number,
          "Gender": gender,
          "Age": age,
          "Post Genre": genre,
          "Zodiac Sign": zodiac,
          "Date": date,
          "Post": post
        })

  except ET.ParseError:
      print(f"Error parsing XML file: {xml_file_path}")
      continue

In [None]:
file_list[521]

'4286996.female.14.indUnk.Libra.xml'

In [6]:
parsing_errors_list

['/content/drive/MyDrive/Data Mining Project/blogs/4299910.male.17.Student.Cancer.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4300864.female.26.Religion.Sagittarius.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4307329.female.16.Student.Libra.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4312209.female.16.indUnk.Virgo.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4177529.male.16.Student.Aries.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4180104.male.16.indUnk.Scorpio.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4182405.male.23.Government.Capricorn.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4197491.male.16.Education.Scorpio.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4199639.female.17.indUnk.Gemini.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4201404.male.27.indUnk.Cancer.xml',
 '/content/drive/MyDrive/Data Mining Project/blogs/4211779.female.13.Student.Leo.xml',
 '/content/drive

In [None]:
df = pd.DataFrame(data)


In [None]:
df

Unnamed: 0,Number,Gender,Age,Post Genre,Zodiac Sign,Date,Post
0,4244359,female,38,indUnk,Libra,"21,August,2004",\n\n \n I didn't sleep much at all la...
1,4244359,female,38,indUnk,Libra,"16,August,2004",\n\n \n I watched Oprah the other day...
2,4244359,female,38,indUnk,Libra,"16,August,2004",\n\n \n I often wonder if life is not...
3,4244359,female,38,indUnk,Libra,"16,August,2004",\n\n \n I'm not quite sure what poss...
4,4244468,male,14,indUnk,Libra,"16,August,2004",\n\n\t \n This is probably gona be the la...
...,...,...,...,...,...,...,...
453872,3019173,female,23,Telecommunications,Capricorn,"07,May,2004",\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t I had my weig...
453873,3019173,female,23,Telecommunications,Capricorn,"17,June,2004",\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t Life has a wa...
453874,3019173,female,23,Telecommunications,Capricorn,"08,June,2004",\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t Good morning ...
453875,3019173,female,23,Telecommunications,Capricorn,"28,July,2004",\n\n\t\t\t\t\t\t\n\t\t\t\t\t\t My pain is al...


In [None]:
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        print(file_name)

