In [1]:
# Display the `spark` object. It is not created in any visible cell, so it is
# presumably pre-defined by the notebook kernel — TODO confirm.
spark

## Processing `PostHistory.xml`

In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [3]:
# Dataset path
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/'
# The bucket prefix already ends with '/', so strip it before joining;
# otherwise the resulting path contains an empty "raw//" component.
# NOTE: despite its name, this variable points at PostHistory.xml.
dataset_comments = f"{dataset_bucket.rstrip('/')}/PostHistory.xml"

In [4]:
# Load the file at `dataset_comments` through the SparkContext's textFile
# reader; every later cell starts its pipeline from this `rdd`.
rdd = spark.sparkContext.textFile(dataset_comments)


In [29]:
def row_parser(row):
    """Parse the attribute string of one PostHistory ``<row>`` element.

    ``row`` is the text found between ``<row`` and ``/>``: a sequence of
    ``Name="value"`` pairs separated by spaces.  Attributes are looked up by
    *name*, not by position, because the attributes that follow
    ``CreationDate`` are optional and appear in varying combinations (for
    example a row may carry ``UserDisplayName`` without ``UserId``, or
    ``Comment`` without ``UserDisplayName``).

    Parameters
    ----------
    row : str
        Attribute string such as ``'Id="6" PostHistoryTypeId="2" ...'``.

    Returns
    -------
    tuple
        A 10-tuple in this fixed order:
        ``(Id, PostHistoryTypeId, PostId, RevisionGUID, CreationDate,
        UserId, UserDisplayName, Comment, Text, ContentLicense)``.
        ``Id``, ``PostHistoryTypeId``, ``PostId`` and ``UserId`` are ``int``,
        ``CreationDate`` is a ``datetime``, the rest are ``str``.  Any
        attribute that is absent or empty is ``None``.  String values are
        returned exactly as they appear in the input (XML entities such as
        ``&lt;`` are not decoded).

    Raises
    ------
    ValueError
        If a numeric attribute is not a valid integer or ``CreationDate``
        does not match ``%Y-%m-%dT%H:%M:%S.%f``.
    """
    # Split once.  The fragments alternate between ' Name=' labels (even
    # indexes) and attribute values (odd indexes); a trailing empty fragment
    # after the last closing quote is dropped by zip().
    # Assumes values never contain a raw '"' (the samples show it escaped as
    # &quot;) — TODO confirm for the full dump.
    pieces = row.split('"')
    attrs = {}
    for label, value in zip(pieces[0::2], pieces[1::2]):
        attrs[label.strip().rstrip('=').rstrip()] = value

    def as_text(name):
        # Missing and empty attributes both collapse to None.
        return attrs.get(name) or None

    def as_int(name):
        raw = attrs.get(name)
        return int(raw) if raw else None

    created_raw = attrs.get('CreationDate')
    created = (
        datetime.strptime(created_raw, "%Y-%m-%dT%H:%M:%S.%f")
        if created_raw else None
    )

    return (
        as_int('Id'),
        as_int('PostHistoryTypeId'),
        as_int('PostId'),
        as_text('RevisionGUID'),
        created,
        as_int('UserId'),
        as_text('UserDisplayName'),
        as_text('Comment'),
        as_text('Text'),
        as_text('ContentLicense'),
    )

In [34]:
# Keep only the lines that start with "<row", drop their first 4 and last 3
# characters, trim the remainder, and hand each attribute string to row_parser.
parsed_rdd = (
    rdd.map(lambda line: line.strip())
       .filter(lambda line: line.startswith("<row"))
       .map(lambda line: line[4:-3])
       .map(lambda line: line.strip())
       .map(row_parser)
)

In [None]:
# Define the schema for the DataFrame.  Field order must match the order of
# the values in each record of parsed_rdd.
schema_posthistory = StructType([
    StructField("Id", LongType()),
    StructField("PostHistoryTypeId", LongType()),
    StructField("PostId", LongType()),
    StructField("RevisionGUID", StringType()),
    StructField("CreationDate", TimestampType()),
    StructField("UserId", LongType()),
    StructField("UserDisplayName", StringType()),
    StructField("Comment", StringType()),
    StructField("Text", StringType()),
    StructField("ContentLicense", StringType())
])

# Convert the RDD to a DataFrame
df = parsed_rdd.toDF(schema_posthistory)

# Output path
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/PostHistory-parquet"

# Save the DataFrame as Parquet, replacing any previous output.
# (The former `header` option was dropped: it is a CSV option and has no
# meaning for the Parquet format, which stores the schema in the files.)
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

df.show()

In [5]:
# Peek at the first five cleaned attribute strings.
(
    rdd.map(lambda line: line.strip())
       .filter(lambda line: line.startswith("<row"))
       .map(lambda line: line[4:-3])
       .map(lambda line: line.strip())
       .take(5)
)

                                                                                

['Id="6" PostHistoryTypeId="2" PostId="7" RevisionGUID="c30df0f4-a2d9-426e-a2dd-2cc3aa4d9205" CreationDate="2008-07-31T22:17:57.883" UserId="9" Text="The explicit cast to double in the first answer isn\'t necessary - identifying the constant as 5000.0 (or as 5000d) is sufficient." ContentLicense="CC BY-SA 2.5"',
 'Id="12" PostHistoryTypeId="1" PostId="17" RevisionGUID="0421fb42-a29a-4cb2-84ba-a828725410f8" CreationDate="2008-08-01T05:09:55.993" UserId="2" Text="Binary Data in MYSQL" ContentLicense="CC BY-SA 2.5"',
 'Id="13" PostHistoryTypeId="3" PostId="17" RevisionGUID="0421fb42-a29a-4cb2-84ba-a828725410f8" CreationDate="2008-08-01T05:09:55.993" UserId="2" Text="&lt;database&gt;&lt;mysql&gt;" ContentLicense="CC BY-SA 2.5"',
 'Id="14" PostHistoryTypeId="2" PostId="17" RevisionGUID="0421fb42-a29a-4cb2-84ba-a828725410f8" CreationDate="2008-08-01T05:09:55.993" UserId="2" Text="How do I store binary data in mysql?" ContentLicense="CC BY-SA 2.5"',
 'Id="16" PostHistoryTypeId="2" PostId="18"

In [9]:
# Sample attribute string: the five leading attributes followed by UserId,
# Text and ContentLicense.
a = 'Id="6" PostHistoryTypeId="2" PostId="7" RevisionGUID="c30df0f4-a2d9-426e-a2dd-2cc3aa4d9205" CreationDate="2008-07-31T22:17:57.883" UserId="9" Text="The explicit cast to double in the first answer isn\'t necessary - identifying the constant as 5000.0 (or as 5000d) is sufficient." ContentLicense="CC BY-SA 2.5"'
# Sample attribute string that carries UserDisplayName (and no UserId) in
# front of Text and ContentLicense.
b = 'Id="16" PostHistoryTypeId="2" PostId="18" RevisionGUID="0cfdfa19-039f-4645-8a48-1c316543b98f" CreationDate="2008-08-01T05:12:44.193" UserDisplayName="phpguy" Text="For a table like this:&#xD;&#xA;&#xD;&#xA;    CREATE TABLE binary_data (&#xD;&#xA;    id INT(4) NOT NULL AUTO_INCREMENT PRIMARY KEY,&#xD;&#xA;    description CHAR(50),&#xD;&#xA;    bin_data LONGBLOB,&#xD;&#xA;    filename CHAR(50),&#xD;&#xA;    filesize CHAR(50),&#xD;&#xA;    filetype CHAR(50)&#xD;&#xA;    );&#xD;&#xA;&#xD;&#xA;Here is a PHP example:&#xD;&#xA;&#xD;&#xA;    &lt;?php&#xD;&#xA;    &#xD;&#xA;    // store.php3 - by Florian Dittmer &lt;dittmer@gmx.net&gt;&#xD;&#xA;    // Example php script to demonstrate the storing of binary files into&#xD;&#xA;    // an sql database. More information can be found at http://www.phpbuilder.com/&#xD;&#xA;    ?&gt;&#xD;&#xA;    &#xD;&#xA;    &lt;html&gt;&#xD;&#xA;    &lt;head&gt;&lt;title&gt;Store binary data into SQL Database&lt;/title&gt;&lt;/head&gt;&#xD;&#xA;    &lt;body&gt;&#xD;&#xA;    &#xD;&#xA;    &lt;?php&#xD;&#xA;    // code that will be executed if the form has been submitted:&#xD;&#xA;    &#xD;&#xA;    if ($submit) {&#xD;&#xA;    &#xD;&#xA;        // connect to the database&#xD;&#xA;        // (you may have to adjust the hostname,username or password)&#xD;&#xA;    &#xD;&#xA;        MYSQL_CONNECT(&quot;localhost&quot;,&quot;root&quot;,&quot;password&quot;);&#xD;&#xA;        mysql_select_db(&quot;binary_data&quot;);&#xD;&#xA;    &#xD;&#xA;        $data = addslashes(fread(fopen($form_data, &quot;r&quot;), filesize($form_data)));&#xD;&#xA;    &#xD;&#xA;        $result=MYSQL_QUERY(&quot;INSERT INTO binary_data (description,bin_data,filename,filesize,filetype) &quot;.&#xD;&#xA;            &quot;VALUES (\'$form_description\',\'$data\',\'$form_data_name\',\'$form_data_size\',\'$form_data_type\')&quot;);&#xD;&#xA;    &#xD;&#xA;        $id= mysql_insert_id();&#xD;&#xA;        print &quot;&lt;p&gt;This file has the following Database ID: 
&lt;b&gt;$id&lt;/b&gt;&quot;;&#xD;&#xA;    &#xD;&#xA;        MYSQL_CLOSE();&#xD;&#xA;    &#xD;&#xA;    } else {&#xD;&#xA;    &#xD;&#xA;        // else show the form to submit new data:&#xD;&#xA;    ?&gt;&#xD;&#xA;    &#xD;&#xA;        &lt;form method=&quot;post&quot; action=&quot;&lt;?php echo $PHP_SELF; ?&gt;&quot; enctype=&quot;multipart/form-data&quot;&gt;&#xD;&#xA;        File Description:&lt;br&gt;&#xD;&#xA;        &lt;input type=&quot;text&quot; name=&quot;form_description&quot;  size=&quot;40&quot;&gt;&#xD;&#xA;        &lt;input type=&quot;hidden&quot; name=&quot;MAX_FILE_SIZE&quot; value=&quot;1000000&quot;&gt;&#xD;&#xA;        &lt;br&gt;File to upload/store in database:&lt;br&gt;&#xD;&#xA;        &lt;input type=&quot;file&quot; name=&quot;form_data&quot;  size=&quot;40&quot;&gt;&#xD;&#xA;        &lt;p&gt;&lt;input type=&quot;submit&quot; name=&quot;submit&quot; value=&quot;submit&quot;&gt;&#xD;&#xA;        &lt;/form&gt;&#xD;&#xA;    &#xD;&#xA;    &lt;?php&#xD;&#xA;    &#xD;&#xA;    }&#xD;&#xA;    &#xD;&#xA;    ?&gt;&#xD;&#xA;    &#xD;&#xA;    &lt;/body&gt;&#xD;&#xA;    &lt;/html&gt;" ContentLicense="CC BY-SA 2.5"'

# Print every fragment of `a` produced by splitting on '"', with its index,
# to see which indexes hold attribute names and which hold values.
for i, j in enumerate(a.split('"')):
    print(i, j)
    

0 Id=
1 6
2  PostHistoryTypeId=
3 2
4  PostId=
5 7
6  RevisionGUID=
7 c30df0f4-a2d9-426e-a2dd-2cc3aa4d9205
8  CreationDate=
9 2008-07-31T22:17:57.883
10  UserId=
11 9
12  Text=
13 The explicit cast to double in the first answer isn't necessary - identifying the constant as 5000.0 (or as 5000d) is sufficient.
14  ContentLicense=
15 CC BY-SA 2.5
16 


In [10]:
# List the indexed fragments of sample `b` after splitting on '"'.
for position, fragment in enumerate(b.split('"')):
    print(position, fragment)

0 Id=
1 16
2  PostHistoryTypeId=
3 2
4  PostId=
5 18
6  RevisionGUID=
7 0cfdfa19-039f-4645-8a48-1c316543b98f
8  CreationDate=
9 2008-08-01T05:12:44.193
10  UserDisplayName=
11 phpguy
12  Text=
13 For a table like this:&#xD;&#xA;&#xD;&#xA;    CREATE TABLE binary_data (&#xD;&#xA;    id INT(4) NOT NULL AUTO_INCREMENT PRIMARY KEY,&#xD;&#xA;    description CHAR(50),&#xD;&#xA;    bin_data LONGBLOB,&#xD;&#xA;    filename CHAR(50),&#xD;&#xA;    filesize CHAR(50),&#xD;&#xA;    filetype CHAR(50)&#xD;&#xA;    );&#xD;&#xA;&#xD;&#xA;Here is a PHP example:&#xD;&#xA;&#xD;&#xA;    &lt;?php&#xD;&#xA;    &#xD;&#xA;    // store.php3 - by Florian Dittmer &lt;dittmer@gmx.net&gt;&#xD;&#xA;    // Example php script to demonstrate the storing of binary files into&#xD;&#xA;    // an sql database. More information can be found at http://www.phpbuilder.com/&#xD;&#xA;    ?&gt;&#xD;&#xA;    &#xD;&#xA;    &lt;html&gt;&#xD;&#xA;    &lt;head&gt;&lt;title&gt;Store binary data into SQL Database&lt;/title&gt;&lt;/head&g

In [11]:
# Count the cleaned rows that split into exactly 17 fragments on '"'.
(
    rdd.map(lambda line: line.strip())
       .filter(lambda line: line.startswith("<row"))
       .map(lambda line: line[4:-3])
       .map(lambda line: line.strip())
       .filter(lambda line: len(line.split('"')) == 17)
       .count()
)

                                                                                

80550548

In [12]:
# 80550548 is the count of rows that split into exactly 17 fragments.
# NOTE(review): 117255126 is presumably the total number of <row> records;
# it is not computed in any visible cell — confirm.
117255126 - 80550548

36704578

In [13]:
# Count the cleaned rows that split into more than 17 fragments on '"'.
(
    rdd.map(lambda line: line.strip())
       .filter(lambda line: line.startswith("<row"))
       .map(lambda line: line[4:-3])
       .map(lambda line: line.strip())
       .filter(lambda line: len(line.split('"')) > 17)
       .count()
)

                                                                                

32645778

In [14]:
# For each fragment count from 15 to 24, report how many rows have that many
# fragments and show one example row.
for i in range(15, 25):
    temp = (
        rdd.map(lambda line: line.strip())
           .filter(lambda line: line.startswith("<row"))
           .map(lambda line: line[4:-3])
           .map(lambda line: line.strip())
           .filter(lambda line: len(line.split('"')) == i)
    )

    print(i, temp.count(), temp.take(1))
    print("**************")
    
    

                                                                                

15 3840818 ['Id="40409" PostHistoryTypeId="3" PostId="29995" RevisionGUID="51aba189-62d5-42f6-9fb0-ebca2d7924e5" CreationDate="2008-08-27T12:46:52.607" UserId="1736" ContentLicense="CC BY-SA 2.5"']
**************


                                                                                

16 0 []
**************


                                                                                

17 80550548 ['Id="6" PostHistoryTypeId="2" PostId="7" RevisionGUID="c30df0f4-a2d9-426e-a2dd-2cc3aa4d9205" CreationDate="2008-07-31T22:17:57.883" UserId="9" Text="The explicit cast to double in the first answer isn\'t necessary - identifying the constant as 5000.0 (or as 5000d) is sufficient." ContentLicense="CC BY-SA 2.5"']
**************


                                                                                

18 0 []
**************


                                                                                

19 32644040 ['Id="99" PostHistoryTypeId="2" PostId="111" RevisionGUID="fa450384-b223-4a35-9664-1c7d73b9e437" CreationDate="2008-08-01T15:27:23.093" UserId="146270" UserDisplayName="Diago" Text="I have been using the 64Bit version of TortoiseSVN for ages and I have never had issues with it on Windows 64Bit or Vista 64Bit. I am currently not aware of any other similiar SVN clients that do work on Vista. Is it possible the problem could lie within the configuration of TortoiseSVN or even the installation of Vista? Is the problem occurring on Vista native or SP 1?" ContentLicense="CC BY-SA 2.5"']
**************


                                                                                

20 0 []
**************


                                                                                

21 1738 ['Id="70648" PostHistoryTypeId="5" PostId="50472" RevisionGUID="e297a272-235c-418d-aadf-c2b30ba8cd73" CreationDate="2008-09-08T19:30:07.000" UserId="4549416" UserDisplayName="malach" Comment="added 31 characters in body" Text="Considering the comment of wcm (top value = xfd), you can calculate it like this;&#xD;&#xA;&#xD;&#xA;    function IntToExcel(n: Integer); string;&#xD;&#xA;    begin&#xD;&#xA;       Result := \'\';&#xD;&#xA;       for i := 2 down to 0 do &#xD;&#xA;       begin&#xD;&#xA;          if ((n div 26^i)) &gt; 0) or (i = 0) then&#xD;&#xA;             Result := Result + Char(Ord(\'A\')+(n div (26^i)) - IIF(i&gt;0;1;0));&#xD;&#xA;          n := n mod (26^i);&#xD;&#xA;       end;&#xD;&#xA;    end;&#xD;&#xA;&#xD;&#xA;There are 26 characters in the alphabet and we have a number system just like hex or binary, just with an unusual character set (A..Z), representing positionally the powers of 26: (26^2)(26^1)(26^0)." ContentLicense="CC BY-SA 2.5"']
**************


                                                                                

22 0 []
**************


                                                                                

23 0 []
**************




24 0 []
**************


                                                                                

In [15]:
# For each fragment count from 10 to 14, report how many rows have that many
# fragments and show one example row.
for i in range(10, 15):
    temp = (
        rdd.map(lambda line: line.strip())
           .filter(lambda line: line.startswith("<row"))
           .map(lambda line: line[4:-3])
           .map(lambda line: line.strip())
           .filter(lambda line: len(line.split('"')) == i)
    )

    print(i, temp.count(), temp.take(1))
    print("**************")
    

                                                                                

10 0 []
**************


                                                                                

11 696 ['Id="1336320" PostHistoryTypeId="16" PostId="725342" RevisionGUID="d988c42a-8745-4d80-914f-3eff76857ba6" CreationDate="2009-04-07T12:05:32.483"']
**************


                                                                                

12 0 []
**************


                                                                                

13 217286 ['Id="1326438" PostHistoryTypeId="16" PostId="720426" RevisionGUID="3ac4c3ae-17f7-442a-90a3-3271cdab8f37" CreationDate="2009-04-06T07:03:46.913" UserId="3385"']
**************




14 0 []
**************


                                                                                

In [18]:
# Sample attribute string that carries all of UserId, UserDisplayName,
# Comment, Text and ContentLicense after the five leading attributes.
# Note: this rebinds `a`, replacing any earlier value.
a = 'Id="70648" PostHistoryTypeId="5" PostId="50472" RevisionGUID="e297a272-235c-418d-aadf-c2b30ba8cd73" CreationDate="2008-09-08T19:30:07.000" UserId="4549416" UserDisplayName="malach" Comment="added 31 characters in body" Text="Considering the comment of wcm (top value = xfd), you can calculate it like this;&#xD;&#xA;&#xD;&#xA;    function IntToExcel(n: Integer); string;&#xD;&#xA;    begin&#xD;&#xA;       Result := \'\';&#xD;&#xA;       for i := 2 down to 0 do &#xD;&#xA;       begin&#xD;&#xA;          if ((n div 26^i)) &gt; 0) or (i = 0) then&#xD;&#xA;             Result := Result + Char(Ord(\'A\')+(n div (26^i)) - IIF(i&gt;0;1;0));&#xD;&#xA;          n := n mod (26^i);&#xD;&#xA;       end;&#xD;&#xA;    end;&#xD;&#xA;&#xD;&#xA;There are 26 characters in the alphabet and we have a number system just like hex or binary, just with an unusual character set (A..Z), representing positionally the powers of 26: (26^2)(26^1)(26^0)." ContentLicense="CC BY-SA 2.5"'

# Print every fragment of the sample with its index to see where each
# attribute value lands after splitting on '"'.
for i, j in enumerate(a.split('"')):
    print(i, j)

0 Id=
1 70648
2  PostHistoryTypeId=
3 5
4  PostId=
5 50472
6  RevisionGUID=
7 e297a272-235c-418d-aadf-c2b30ba8cd73
8  CreationDate=
9 2008-09-08T19:30:07.000
10  UserId=
11 4549416
12  UserDisplayName=
13 malach
14  Comment=
15 added 31 characters in body
16  Text=
17 Considering the comment of wcm (top value = xfd), you can calculate it like this;&#xD;&#xA;&#xD;&#xA;    function IntToExcel(n: Integer); string;&#xD;&#xA;    begin&#xD;&#xA;       Result := '';&#xD;&#xA;       for i := 2 down to 0 do &#xD;&#xA;       begin&#xD;&#xA;          if ((n div 26^i)) &gt; 0) or (i = 0) then&#xD;&#xA;             Result := Result + Char(Ord('A')+(n div (26^i)) - IIF(i&gt;0;1;0));&#xD;&#xA;          n := n mod (26^i);&#xD;&#xA;       end;&#xD;&#xA;    end;&#xD;&#xA;&#xD;&#xA;There are 26 characters in the alphabet and we have a number system just like hex or binary, just with an unusual character set (A..Z), representing positionally the powers of 26: (26^2)(26^1)(26^0).
18  ContentLicense=
19 CC 