In [1]:
import glob
import pandas as pd
import numpy as np
import xml.etree.ElementTree as et
import os
from datetime import datetime
import re
from tqdm import tqdm

In [2]:
# Load the stored listing of filenames together with their detected types and encodings
with open('Blogs/output.txt', mode='r') as listing:
    file = listing.read()

In [3]:
# Tokenise the listing: split on newlines, ': ', '; ', single spaces and the 'charset=' prefix
token_splitter = re.compile('\n|: |; | |charset=')
data = token_splitter.split(file)

In [5]:
# Drop the empty strings left behind by the split in a single pass.
# (Calling list.remove('') inside a `while '' in data` loop rescans the list
# from the start for every removal.)
data = [token for token in data if token != '']

In [6]:
len(data)

57963

In [7]:
# Empty frame with one column per field of the listing
df = pd.DataFrame(columns=['Filename', 'Type', 'Encoding'])

In [8]:
df

Unnamed: 0,Filename,Type,Encoding


In [9]:
# The token list is consumed three at a time: filename, type, encoding.
# Fail early with a clear message instead of an IndexError half-way through.
if len(data) % 3 != 0:
    raise ValueError(
        "expected filename/type/encoding triples, got {} tokens".format(len(data))
    )

# Collect every row first and build the frame once. DataFrame.append copied the
# whole frame on each call and was removed in pandas 2.0. Building from scratch
# also makes this cell safe to re-run without duplicating rows.
records = [
    {'Filename': data[start], 'Type': data[start + 1], 'Encoding': data[start + 2]}
    for start in range(0, len(data), 3)
]
df = pd.DataFrame(records, columns=['Filename', 'Type', 'Encoding'])

In [10]:
# Discard the row labelled 19320 (the last row built from the 57963 tokens).
# NOTE(review): presumably a non-blog entry in the listing - confirm.
# Reassigning with errors='ignore' keeps the cell safe to re-run; the in-place
# drop raised KeyError the second time because the label was already gone.
df = df.drop(index=19320, errors='ignore')

In [11]:
df

Unnamed: 0,Filename,Type,Encoding
0,1000331.female.37.indUnk.Leo.xml,text/plain,us-ascii
1,1000866.female.17.Student.Libra.xml,text/plain,unknown-8bit
2,1004904.male.23.Arts.Capricorn.xml,text/plain,iso-8859-1
3,1005076.female.25.Arts.Cancer.xml,text/plain,us-ascii
4,1005545.male.25.Engineering.Sagittarius.xml,text/plain,utf-8
...,...,...,...
19315,996147.female.36.Telecommunications.Leo.xml,text/plain,us-ascii
19316,997488.male.25.indUnk.Cancer.xml,text/plain,us-ascii
19317,998237.female.16.indUnk.Virgo.xml,text/plain,unknown-8bit
19318,998966.male.27.indUnk.Taurus.xml,text/plain,us-ascii


In [12]:
df['Type'].value_counts()

text/plain                  19239
application/octet-stream       67
text/x-c                       14
Name: Type, dtype: int64

In [13]:
df['Encoding'].value_counts()

us-ascii        11802
utf-8            6859
unknown-8bit      497
iso-8859-1         95
binary             67
Name: Encoding, dtype: int64

In [14]:
df[df['Type']=='application/octet-stream']

Unnamed: 0,Filename,Type,Encoding
123,1107146.female.16.Student.Libra.xml,application/octet-stream,binary
197,1169202.male.26.Science.Libra.xml,application/octet-stream,binary
473,1395900.male.27.indUnk.Taurus.xml,application/octet-stream,binary
514,1427070.female.25.indUnk.Gemini.xml,application/octet-stream,binary
692,1583261.female.16.Arts.Gemini.xml,application/octet-stream,binary
...,...,...,...
18502,4300864.female.26.Religion.Sagittarius.xml,application/octet-stream,binary
18865,536510.female.24.RealEstate.Capricorn.xml,application/octet-stream,binary
19049,741128.male.27.Consulting.Leo.xml,application/octet-stream,binary
19251,925742.male.16.Student.Gemini.xml,application/octet-stream,binary


## The table above shows that every file detected with binary encoding has the type application/octet-stream


In [15]:
df[df['Type']=='text/x-c']

Unnamed: 0,Filename,Type,Encoding
31,1031806.male.17.Technology.Sagittarius.xml,text/x-c,unknown-8bit
266,1233897.male.17.Technology.Virgo.xml,text/x-c,unknown-8bit
1225,1956554.male.24.Technology.Pisces.xml,text/x-c,utf-8
3287,3197289.male.15.Student.Scorpio.xml,text/x-c,us-ascii
3327,3208819.male.23.Student.Aries.xml,text/x-c,utf-8
3864,3317054.male.27.Internet.Aquarius.xml,text/x-c,utf-8
4709,3374765.male.26.Engineering.Capricorn.xml,text/x-c,us-ascii
5386,3419733.male.27.Engineering.Pisces.xml,text/x-c,utf-8
7508,3554856.male.27.Technology.Scorpio.xml,text/x-c,us-ascii
8869,3648955.male.17.Student.Cancer.xml,text/x-c,utf-8


## The table above shows that files of type text/x-c occur with several encodings, including utf-8, us-ascii and unknown-8bit

### ----------------------------------------------------------------------------------------------------------------------------


## Side note: I had trouble parsing files with different encodings using xml.etree.ElementTree, so I decided to process the files separately according to their encoding type to make sure all the data is imported correctly

### ----------------------------------------------------------------------------------------------------------------------------


# Process: Importing the data for each encoding type into a DataFrame and storing it as a CSV file

## 1. ASCII Filenames

In [16]:
# Names of all files whose detected encoding is us-ascii
is_ascii = df['Encoding'] == 'us-ascii'
filenames_ascii = df.loc[is_ascii, 'Filename'].tolist()

In [17]:
len(filenames_ascii)

11802

In [18]:
# Switch to the directory that holds the blog files so the bare filenames from
# the listing can be opened directly; later cells also write to ../CSVData
# relative to it. The location can be overridden with the BLOGS_DIR environment
# variable; the default is the original hard-coded path.
os.chdir(os.environ.get('BLOGS_DIR', '/Users/deepakbuddha/Desktop/scoutbee_proj/blogs'))

In [19]:
# Empty frame with the columns used for the parsed us-ascii posts
xmldata_ascii = pd.DataFrame(
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign']
)

In [20]:
xmldata_ascii

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign


In [21]:
# Regex for the date column: two digits, a word and four digits separated by
# commas, e.g. "31,May,2004"
pattern = r"\d{2},\w+,\d{4}"

In [22]:
# Parse every us-ascii blog file into one row per post. Rows are collected in a
# plain list and turned into a DataFrame once at the end: DataFrame.append
# copied the whole frame for every row and was removed in pandas 2.0.
ascii_rows = []
for blog_file in tqdm(filenames_ascii, desc='Progress'):
    # errors="replace" (rather than "surrogateescape") keeps the decoded text
    # exportable to CSV. utf-8 is a superset of ASCII; naming it explicitly
    # avoids depending on the platform's default encoding.
    with open(blog_file, encoding='utf-8', errors='replace') as f:
        blog = f.read()

    # Split on the tags and keep only the pieces that are not pure whitespace
    pieces = re.split('<Blog>|<date>|</date>|<post>|</post>|</Blog>', blog)
    blogdata = [piece for piece in pieces if piece.strip()]

    # The first five dot-separated parts of the filename are the blogger's
    # id, gender, age, industry and astrological sign
    userid, gender, age, field_of_work, ast_sign = blog_file.split('.')[:5]

    # Start every file without a date so a post can never pick up the date
    # left over from the previous file.
    date = None
    for piece in blogdata:
        # A date is a short piece (fewer than 20 characters) containing the
        # date pattern; the length check keeps dates quoted inside posts from
        # being mistaken for a date entry.
        if len(piece) < 20 and re.search(pattern, piece):
            date = piece
        else:
            # A post follows its date, so the row is emitted once the post is seen
            ascii_rows.append({'Id': userid, 'Date': date, 'Post': piece,
                               'Gender': gender, 'Age': age,
                               'Industry': field_of_work, 'Astrosign': ast_sign})

xmldata_ascii = pd.DataFrame(
    ascii_rows,
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign'],
)

Progress: 100%|██████████| 11802/11802 [32:17<00:00,  6.09it/s] 


In [23]:
xmldata_ascii

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign
0,1000331,"31,May,2004","\n\n\t \n Well, everyone got up and going...",female,37,indUnk,Leo
1,1000331,"29,May,2004",\n\n\t \n My four-year old never stops ta...,female,37,indUnk,Leo
2,1000331,"28,May,2004","\n\n\t \n Actually it's not raining yet, ...",female,37,indUnk,Leo
3,1000331,"28,May,2004",\n\n\t \n Ha! Just set up my RSS feed - t...,female,37,indUnk,Leo
4,1000331,"28,May,2004","\n\n\t \n Oh, which just reminded me, we ...",female,37,indUnk,Leo
...,...,...,...,...,...,...,...
175686,998966,"02,July,2004",\n\n\n \n Elijah and Amy&nbsp; urlL...,male,27,indUnk,Taurus
175687,998966,"02,July,2004",\n\n\n \n Amy with Elijah&nbsp; url...,male,27,indUnk,Taurus
175688,998966,"11,July,2004",\n\n\n \n urlLink Can Buddha ge...,male,27,indUnk,Taurus
175689,998966,"11,July,2004",\n\n\n \n urlLink There she is ...,male,27,indUnk,Taurus


In [24]:
# Write the parsed us-ascii posts to CSV without the row index; the path is
# relative to the directory selected by os.chdir earlier in the notebook
xmldata_ascii.to_csv('../CSVData/ASCIIData.csv',index=False)

### ----------------------------------------------------------------------------------------------------------------------------


## 2. UTF8 Filenames

In [25]:
# Names of all files whose detected encoding is utf-8
is_utf8 = df['Encoding'] == 'utf-8'
filenames_utf8 = df.loc[is_utf8, 'Filename'].tolist()

In [26]:
len(filenames_utf8)

6859

In [27]:
# Empty frame with the columns used for the parsed utf-8 posts
xmldata_utf8 = pd.DataFrame(
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign']
)

In [28]:
xmldata_utf8

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign


In [29]:
# Regex for the date column: two digits, a word and four digits separated by
# commas, e.g. "05,July,2003"
pattern = r"\d{2},\w+,\d{4}"

In [30]:
# Parse every utf-8 blog file into one row per post. Rows are collected in a
# plain list and turned into a DataFrame once at the end: DataFrame.append
# copied the whole frame for every row and was removed in pandas 2.0.
utf8_rows = []
for blog_file in tqdm(filenames_utf8, desc='Progress'):
    # Decode explicitly as utf-8 instead of relying on the platform default.
    # errors="replace" (rather than "surrogateescape") keeps the decoded text
    # exportable to CSV.
    with open(blog_file, encoding='utf-8', errors='replace') as f:
        blog = f.read()

    # Split on the tags and keep only the pieces that are not pure whitespace
    pieces = re.split('<Blog>|<date>|</date>|<post>|</post>|</Blog>', blog)
    blogdata = [piece for piece in pieces if piece.strip()]

    # The first five dot-separated parts of the filename are the blogger's
    # id, gender, age, industry and astrological sign
    userid, gender, age, field_of_work, ast_sign = blog_file.split('.')[:5]

    # Start every file without a date so a post can never pick up the date
    # left over from the previous file.
    date = None
    for piece in blogdata:
        # A date is a short piece (fewer than 20 characters) containing the
        # date pattern; the length check keeps dates quoted inside posts from
        # being mistaken for a date entry.
        if len(piece) < 20 and re.search(pattern, piece):
            date = piece
        else:
            # A post follows its date, so the row is emitted once the post is seen
            utf8_rows.append({'Id': userid, 'Date': date, 'Post': piece,
                              'Gender': gender, 'Age': age,
                              'Industry': field_of_work, 'Astrosign': ast_sign})

xmldata_utf8 = pd.DataFrame(
    utf8_rows,
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign'],
)

Progress: 100%|██████████| 6859/6859 [1:33:35<00:00,  1.22it/s]  


In [31]:
xmldata_utf8

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign
0,1005545,"05,July,2003",\n\n\t \n B-Logs: The Business Blogs Par...,male,25,Engineering,Sagittarius
1,1005545,"04,July,2003",\n\n\t \n Bohemian Rhapsody : Is it?? ...,male,25,Engineering,Sagittarius
2,1005545,"03,July,2003",\n\n\t \n Entrepreneur Guidelines url...,male,25,Engineering,Sagittarius
3,1005545,"02,July,2003",\n\n\t \n Business Profiles of Some Comp...,male,25,Engineering,Sagittarius
4,1005545,"02,July,2003",\n\n\t \n Multiple Interviews for One Jo...,male,25,Engineering,Sagittarius
...,...,...,...,...,...,...,...
327659,999503,"04,July,2004",\n\n\n \n Today we celebrate our in...,male,25,Internet,Cancer
327660,999503,"03,July,2004","\n\n\n \n Ugh, I think I have aller...",male,25,Internet,Cancer
327661,999503,"02,July,2004","\n\n\n \n ""Science is like sex; occ...",male,25,Internet,Cancer
327662,999503,"01,July,2004",\n\n\n \n urlLink Dog toy or marit...,male,25,Internet,Cancer


In [32]:
xmldata_utf8['Id'].nunique()

6859

In [33]:
# Write the parsed utf-8 posts to CSV without the row index; the path is
# relative to the directory selected by os.chdir earlier in the notebook
xmldata_utf8.to_csv('../CSVData/UTF8Data.csv',index=False)

### ----------------------------------------------------------------------------------------------------------------------------


##  3. iso-8859-1 Filenames

In [34]:
# Names of all files whose detected encoding is iso-8859-1
is_iso = df['Encoding'] == 'iso-8859-1'
filenames_iso = df.loc[is_iso, 'Filename'].tolist()

In [35]:
len(filenames_iso)

95

In [36]:
# Empty frame with the columns used for the parsed iso-8859-1 posts
xmldata_iso = pd.DataFrame(
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign']
)

In [37]:
xmldata_iso

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign


In [38]:
# Regex for the date column: two digits, a word and four digits separated by
# commas, e.g. "19,June,2004"
pattern = r"\d{2},\w+,\d{4}"

In [39]:
# Parse every iso-8859-1 blog file into one row per post. Rows are collected in
# a plain list and turned into a DataFrame once at the end: DataFrame.append
# copied the whole frame for every row and was removed in pandas 2.0.
iso_rows = []
for blog_file in tqdm(filenames_iso, desc='Progress'):
    # Decode with the encoding these files were detected as. Without an explicit
    # encoding the platform default is used and, combined with errors="replace",
    # every non-ASCII character is turned into U+FFFD. iso-8859-1 maps every
    # byte to a character, so no error handler is needed.
    with open(blog_file, encoding='iso-8859-1') as f:
        blog = f.read()

    # Split on the tags and keep only the pieces that are not pure whitespace
    pieces = re.split('<Blog>|<date>|</date>|<post>|</post>|</Blog>', blog)
    blogdata = [piece for piece in pieces if piece.strip()]

    # The first five dot-separated parts of the filename are the blogger's
    # id, gender, age, industry and astrological sign
    userid, gender, age, field_of_work, ast_sign = blog_file.split('.')[:5]

    # Start every file without a date so a post can never pick up the date
    # left over from the previous file.
    date = None
    for piece in blogdata:
        # A date is a short piece (fewer than 20 characters) containing the
        # date pattern; the length check keeps dates quoted inside posts from
        # being mistaken for a date entry.
        if len(piece) < 20 and re.search(pattern, piece):
            date = piece
        else:
            # A post follows its date, so the row is emitted once the post is seen
            iso_rows.append({'Id': userid, 'Date': date, 'Post': piece,
                             'Gender': gender, 'Age': age,
                             'Industry': field_of_work, 'Astrosign': ast_sign})

xmldata_iso = pd.DataFrame(
    iso_rows,
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign'],
)

Progress: 100%|██████████| 95/95 [00:20<00:00,  4.72it/s]


In [40]:
xmldata_iso

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign
0,1004904,"19,June,2004","\n\n\t \n cupid,please hear my cry, cupid...",male,23,Arts,Capricorn
1,1004904,"19,June,2004",\n\n\t \n Ijust got back from LA. I neede...,male,23,Arts,Capricorn
2,1004904,"19,June,2004",\n\n\t \n Yesterday thousands of demonstr...,male,23,Arts,Capricorn
3,1004904,"19,June,2004",\n\n\t \n Robert de Niro You are like ...,male,23,Arts,Capricorn
4,1004904,"19,June,2004",\n\n\t \n We're sitting at a little round...,male,23,Arts,Capricorn
...,...,...,...,...,...,...,...
9046,963380,"04,July,2004",\n\n\t \n I was pretty close to right on ...,male,24,Student,Cancer
9047,963380,"04,July,2004",\n\n\t \n On the way home tonight I happe...,male,24,Student,Cancer
9048,963380,"03,July,2004",\n\n\t \n Some things I didn't touch on. ...,male,24,Student,Cancer
9049,963380,"03,July,2004",\n\n\t \n It's been an interesting week. ...,male,24,Student,Cancer


In [41]:
xmldata_iso['Id'].nunique()

95

In [42]:
# Write the parsed iso-8859-1 posts to CSV without the row index; the path is
# relative to the directory selected by os.chdir earlier in the notebook
xmldata_iso.to_csv('../CSVData/ISOData.csv',index=False)

### ----------------------------------------------------------------------------------------------------------------------------


## 4.  binary  Filenames

In [43]:
# Names of all files whose detected encoding is binary
is_binary = df['Encoding'] == 'binary'
filenames_bin = df.loc[is_binary, 'Filename'].tolist()

In [44]:
len(filenames_bin)

67

In [45]:
# Empty frame with the columns used for the parsed binary-encoded posts
xmldata_bin = pd.DataFrame(
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign']
)

In [46]:
xmldata_bin

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign


In [47]:
# Regex for the date column: two digits, a word and four digits separated by
# commas, e.g. "02,July,2004"
pattern = r"\d{2},\w+,\d{4}"

In [48]:
# Parse every file detected as binary into one row per post. Rows are collected
# in a plain list and turned into a DataFrame once at the end: DataFrame.append
# copied the whole frame for every row and was removed in pandas 2.0.
bin_rows = []
for blog_file in tqdm(filenames_bin, desc='Progress'):
    # No text encoding was detected for these files, so they are read as utf-8
    # and every undecodable byte becomes U+FFFD (errors="replace"). Naming the
    # encoding avoids depending on the platform default.
    with open(blog_file, encoding='utf-8', errors='replace') as f:
        blog = f.read()

    # Split on the tags and keep only the pieces that are not pure whitespace
    pieces = re.split('<Blog>|<date>|</date>|<post>|</post>|</Blog>', blog)
    blogdata = [piece for piece in pieces if piece.strip()]

    # The first five dot-separated parts of the filename are the blogger's
    # id, gender, age, industry and astrological sign
    userid, gender, age, field_of_work, ast_sign = blog_file.split('.')[:5]

    # Start every file without a date so a post can never pick up the date
    # left over from the previous file.
    date = None
    for piece in blogdata:
        # A date is a short piece (fewer than 20 characters) containing the
        # date pattern; the length check keeps dates quoted inside posts from
        # being mistaken for a date entry.
        if len(piece) < 20 and re.search(pattern, piece):
            date = piece
        else:
            # A post follows its date, so the row is emitted once the post is seen
            bin_rows.append({'Id': userid, 'Date': date, 'Post': piece,
                             'Gender': gender, 'Age': age,
                             'Industry': field_of_work, 'Astrosign': ast_sign})

xmldata_bin = pd.DataFrame(
    bin_rows,
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign'],
)

Progress: 100%|██████████| 67/67 [00:29<00:00,  2.31it/s]


In [49]:
xmldata_bin

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign
0,1107146,"02,July,2004","\n\n\t \n Lindsey, We're 14, not thirteen...",female,16,Student,Libra
1,1107146,"02,July,2004",\n\n\t \n Joah. god I hate you.\n \n,female,16,Student,Libra
2,1107146,"02,July,2004",\n\n\t \n Just for the record... Rosie no...,female,16,Student,Libra
3,1107146,"02,July,2004",\n\n\t \n *laughs Insanely* JOSH!! YOU HA...,female,16,Student,Libra
4,1107146,"02,July,2004","\n\n\t \n Ok, some ground rules... Any o...",female,16,Student,Libra
...,...,...,...,...,...,...,...
12638,958176,"01,August,2004",\n\n \n \n well here you go; so...,male,17,Non-Profit,Gemini
12639,958176,"01,August,2004","\n\n \n \n Ok, seriously people,...",male,17,Non-Profit,Gemini
12640,958176,"11,August,2004","\n\n \n \n Hhahahahahaha, I laug...",male,17,Non-Profit,Gemini
12641,958176,"10,August,2004","\n\n \n \n Ok, this is my 4th po...",male,17,Non-Profit,Gemini


In [50]:
xmldata_bin['Id'].nunique()

67

In [51]:
# Write the parsed binary-encoded posts to CSV without the row index; the path
# is relative to the directory selected by os.chdir earlier in the notebook
xmldata_bin.to_csv('../CSVData/BINData.csv',index=False)

### ----------------------------------------------------------------------------------------------------------------------------


## 5.  unknown-8bit Filenames

In [52]:
# Names of all files whose detected encoding is unknown-8bit
is_unknown = df['Encoding'] == 'unknown-8bit'
filenames_unknown = df.loc[is_unknown, 'Filename'].tolist()

In [53]:
len(filenames_unknown)

497

In [54]:
# Empty frame with the columns used for the parsed unknown-8bit posts
xmldata_unknown = pd.DataFrame(
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign']
)

In [55]:
xmldata_unknown

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign


In [56]:
# Regex for the date column: two digits, a word and four digits separated by
# commas, e.g. "23,November,2002"
pattern = r"\d{2},\w+,\d{4}"

In [57]:
# Parse every unknown-8bit blog file into one row per post. Rows are collected
# in a plain list and turned into a DataFrame once at the end: DataFrame.append
# copied the whole frame for every row and was removed in pandas 2.0.
unknown_rows = []
for blog_file in tqdm(filenames_unknown, desc='Progress'):
    # The real encoding is not known, so the files are read as utf-8 and every
    # undecodable byte becomes U+FFFD (errors="replace"). Naming the encoding
    # avoids depending on the platform default.
    # NOTE(review): if these files turn out to share one 8-bit encoding
    # (possibly cp1252 - confirm), passing it here would avoid that loss.
    with open(blog_file, encoding='utf-8', errors='replace') as f:
        blog = f.read()

    # Split on the tags and keep only the pieces that are not pure whitespace
    pieces = re.split('<Blog>|<date>|</date>|<post>|</post>|</Blog>', blog)
    blogdata = [piece for piece in pieces if piece.strip()]

    # The first five dot-separated parts of the filename are the blogger's
    # id, gender, age, industry and astrological sign
    userid, gender, age, field_of_work, ast_sign = blog_file.split('.')[:5]

    # Start every file without a date so a post can never pick up the date
    # left over from the previous file.
    date = None
    for piece in blogdata:
        # A date is a short piece (fewer than 20 characters) containing the
        # date pattern; the length check keeps dates quoted inside posts from
        # being mistaken for a date entry.
        if len(piece) < 20 and re.search(pattern, piece):
            date = piece
        else:
            # A post follows its date, so the row is emitted once the post is seen
            unknown_rows.append({'Id': userid, 'Date': date, 'Post': piece,
                                 'Gender': gender, 'Age': age,
                                 'Industry': field_of_work, 'Astrosign': ast_sign})

xmldata_unknown = pd.DataFrame(
    unknown_rows,
    columns=['Id', 'Date', 'Post', 'Gender', 'Age', 'Industry', 'Astrosign'],
)

Progress: 100%|██████████| 497/497 [19:55<00:00,  2.41s/it]


In [58]:
xmldata_unknown

Unnamed: 0,Id,Date,Post,Gender,Age,Industry,Astrosign
0,1000866,"23,November,2002","\n\n\t \n Yeah, sorry for not writing fo...",female,17,Student,Libra
1,1000866,"20,November,2002","\n\n\t \n Yeah, so today was ok, late ar...",female,17,Student,Libra
2,1000866,"19,November,2002","\n\n\t \n Yay, Tuesday...no longer Monda...",female,17,Student,Libra
3,1000866,"18,November,2002",\n\n\t \n RAR!\n \n,female,17,Student,Libra
4,1000866,"18,November,2002","\n\n\t \n Thought- OK...so, I'm all for ...",female,17,Student,Libra
...,...,...,...,...,...,...,...
153140,998237,"02,October,2003",\n\n\t \n okay im in the libary at school...,female,16,indUnk,Virgo
153141,998237,"02,October,2003","\n\n\t \n today was a GREAT day, i found ...",female,16,indUnk,Virgo
153142,998237,"01,October,2003",\n\n\t \n F A M E by urlLink spazy...,female,16,indUnk,Virgo
153143,998237,"01,October,2003","\n\n\t \n today was um, confusing?, ive b...",female,16,indUnk,Virgo


In [59]:
xmldata_unknown['Id'].nunique()

497

In [60]:
# Write the parsed unknown-8bit posts to CSV without the row index; the path is
# relative to the directory selected by os.chdir earlier in the notebook
xmldata_unknown.to_csv('../CSVData/Unknown8bitData.csv',index=False)